merge python scripts
This commit is contained in:
@@ -0,0 +1,233 @@
|
||||
from common.commons import *
|
||||
DATA_PATH = os.environ["DATA_PATH"]
|
||||
PROJECT_TYPE = os.environ["PROJECT_TYPE"]
|
||||
|
||||
|
||||
def bStats():
|
||||
if isfile(join(DATA_PATH, 'studyBugReports.pickle')):
|
||||
studyBugReports = load_zipped_pickle(join(DATA_PATH, 'studyBugReports.pickle'))
|
||||
else:
|
||||
brs = load_zipped_pickle(join(DATA_PATH, args.subject + "bugReportsComplete.pickle"))
|
||||
commits = load_zipped_pickle(join(DATA_PATH, 'singleBR.pickle'))
|
||||
|
||||
dbDir = join(DATA_PATH, 'redis')
|
||||
|
||||
portInner = '6399'
|
||||
startDB(dbDir, portInner, PROJECT_TYPE)
|
||||
|
||||
import redis
|
||||
|
||||
redis_db = redis.StrictRedis(host="localhost", port=portInner, db=0)
|
||||
keys = redis_db.scan(0, match='*', count='1000000')
|
||||
|
||||
matches = pd.DataFrame(keys[1], columns=['pairs_key'])
|
||||
|
||||
# matches = load_zipped_pickle(join(DATA_PATH,'singleHunks'))
|
||||
matches['pairs_key'] = matches['pairs_key'].apply(lambda x: x.decode())
|
||||
matches['root'] = matches['pairs_key'].apply(lambda x: x.split('/')[0])
|
||||
matches['size'] = matches['pairs_key'].apply(lambda x: x.split('/')[1])
|
||||
matches['file'] = matches['pairs_key'].apply(lambda x: x.split('/')[2])
|
||||
matches['repo'] = matches['file'].apply(lambda x: x.split('_')[0])
|
||||
matches['commit'] = matches['file'].apply(lambda x: x.split('_')[1])
|
||||
|
||||
subjects = pd.read_csv(join(DATA_PATH, 'subjects.csv'))
|
||||
|
||||
def getBID(x):
|
||||
try:
|
||||
if x['repo'].endswith('.git'):
|
||||
return None
|
||||
subject = subjects.query("Repo == '{0}'".format(x['repo'])).Subject.tolist()[0]
|
||||
bids = commits.query(
|
||||
"commit.str.startswith('{0}') and project== '{1}'".format(x['commit'], subject)).bid.tolist()
|
||||
return bids[0]
|
||||
except Exception as e:
|
||||
logging.error(e)
|
||||
|
||||
matches = matches[~matches.repo.apply(
|
||||
lambda i: (i.startswith('commons-math') or i.startswith('commons-lang') or i.startswith(
|
||||
'closure-compiler') or i.startswith('joda-time') or i.startswith('mockito') or i.startswith(
|
||||
'jfreechart')))]
|
||||
matches['bid'] = matches.apply(lambda x: getBID(x), axis=1)
|
||||
|
||||
subjects
|
||||
# res = pd.merge(matches, brs, on=['bid'])
|
||||
save_zipped_pickle(matches, join(DATA_PATH, 'studyDataset.pickle'))
|
||||
studyBugReports = brs[brs.bid.isin(matches.bid.unique())]
|
||||
save_zipped_pickle(studyBugReports, join(DATA_PATH, 'studyBugReports.pickle'))
|
||||
if isfile(join(DATA_PATH, 'studyBR_DTM_index')):
|
||||
brIndexes = load_zipped_pickle(join(DATA_PATH, 'studyBR_DTM_index'))
|
||||
bugDTM = load_zipped_pickle(join(DATA_PATH, 'studyBR_DTM'))
|
||||
vectorDF = load_zipped_pickle(join(DATA_PATH, 'studyBR_vector'))
|
||||
matches = load_zipped_pickle(join(DATA_PATH, 'studyDataset.pickle'))
|
||||
else:
|
||||
studyBugReports['description'] = studyBugReports['description'].fillna("")
|
||||
studyBugReports['sumDesc'] = studyBugReports['summary'] + studyBugReports['description']
|
||||
# corpus['sumDesc'] = corpus['summary'] + corpus['desc']
|
||||
# from common.preprocessing import
|
||||
# result, aVector = getVectorAndDtm(corpus, 'summary')
|
||||
from common.preprocessing import calculateTfIdfNLList
|
||||
|
||||
corpusBug = studyBugReports['sumDesc'].values.tolist()
|
||||
from common.preprocessing import preprocessingNL
|
||||
|
||||
preCorpusBug = list(map(preprocessingNL, corpusBug))
|
||||
|
||||
v = calculateTfIdfNLList(preCorpusBug)
|
||||
bugDTM = v.transform(preCorpusBug)
|
||||
bugDTM
|
||||
save_zipped_pickle(bugDTM, join(DATA_PATH, 'studyBR_DTM'))
|
||||
brIndexes = studyBugReports['bid'].values.tolist()
|
||||
|
||||
save_zipped_pickle(brIndexes, join(DATA_PATH, 'studyBR_DTM_index'))
|
||||
# from sklearn.metrics.pairwise import cosine_similarity
|
||||
# cosine_similarity(bugDTM[11701], bugDTM[11111])
|
||||
vectorDF = pd.DataFrame(columns=['bid', 'dtm'])
|
||||
# idx = 0
|
||||
for idx, val in enumerate(brIndexes):
|
||||
vectorDF.loc[idx] = [val, bugDTM[idx]]
|
||||
vectorDF
|
||||
|
||||
save_zipped_pickle(vectorDF, join(DATA_PATH, 'studyBR_vector'))
|
||||
|
||||
matches
|
||||
if isfile(join(DATA_PATH, 'study_clusters')):
|
||||
clustersDF = load_zipped_pickle(join(DATA_PATH, 'study_clusters'))
|
||||
else:
|
||||
clustersDF = pd.DataFrame(columns=['cid', 'type', 'members'])
|
||||
idx = 0
|
||||
|
||||
def statsCore(cs, type):
|
||||
global idx
|
||||
|
||||
cs = [i for i in cs if not (i.startswith('commons-math') or i.startswith('commons-lang') or i.startswith(
|
||||
'closure-compiler') or i.startswith('joda-time') or i.startswith('mockito') or i.startswith(
|
||||
'jfreechart'))]
|
||||
# print('Cluster %s : member size %s' % (shape+"-"+size +"-"+cluster, len(cs)))
|
||||
if len(cs) > 0:
|
||||
if token is None:
|
||||
if action is None:
|
||||
t = shape + "-" + size + "-" + cluster
|
||||
|
||||
clustersDF.loc[idx] = [t, type, cs]
|
||||
idx = idx + 1
|
||||
else:
|
||||
t = shape + "-" + size + "-" + cluster + "-" + action # , len(cs)
|
||||
clustersDF.loc[idx] = [t, type, cs]
|
||||
idx = idx + 1
|
||||
else:
|
||||
# clusterSize = len(cs)
|
||||
# if clusterSize > 0:
|
||||
# clusterSize = len(set([re.split('.txt_[0-9]+', i)[0] for i in cs]))
|
||||
t = shape + "-" + size + "-" + cluster + "-" + action + "-" + token # , clusterSize
|
||||
clustersDF.loc[idx] = [t, type, cs]
|
||||
idx = idx + 1
|
||||
|
||||
for type in ['tokens', 'actions', 'shapes']:
|
||||
shapesPath = join(DATA_PATH, type)
|
||||
shapes = listdir(shapesPath)
|
||||
shapes = [f for f in shapes if isdir(join(shapesPath, f))]
|
||||
shape = size = cluster = action = token = None
|
||||
|
||||
for shape in shapes:
|
||||
if shape.startswith('.'):
|
||||
continue
|
||||
sizes = listdir(join(shapesPath, shape))
|
||||
|
||||
for size in sizes:
|
||||
if size.startswith('.'):
|
||||
continue
|
||||
clusters = listdir(join(shapesPath, shape, size))
|
||||
for cluster in clusters:
|
||||
if cluster.startswith('.'):
|
||||
continue
|
||||
cs = listdir(join(shapesPath, shape, size, cluster))
|
||||
|
||||
if shapesPath.endswith('shapes'):
|
||||
cs = listdir(join(shapesPath, shape, size, cluster))
|
||||
statsCore(cs, 'shapes')
|
||||
else:
|
||||
# level3
|
||||
for action in cs:
|
||||
if action.startswith('.'):
|
||||
continue
|
||||
tokens = listdir(join(shapesPath, shape, size, cluster, action))
|
||||
if shapesPath.endswith('actions'):
|
||||
statsCore(tokens, 'actions')
|
||||
else:
|
||||
for token in tokens:
|
||||
if token.startswith('.'):
|
||||
continue
|
||||
cs = listdir(join(shapesPath, shape, size, cluster, action, token))
|
||||
statsCore(cs, 'tokens')
|
||||
|
||||
clustersDF
|
||||
save_zipped_pickle(clustersDF, join(DATA_PATH, 'study_clusters'))
|
||||
clustersDF
|
||||
|
||||
# selected = clustersDF[clustersDF.type =='shapes']
|
||||
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
# cosine_similarity(bugDTM[11701], bugDTM[11111])
|
||||
def getSimilarity(x):
|
||||
try:
|
||||
if len(x) == 1:
|
||||
return [1]
|
||||
else:
|
||||
filenames = list(set([re.split('.txt_[0-9]+', i)[0] for i in x]))
|
||||
if len(filenames) == 1:
|
||||
return [1]
|
||||
else:
|
||||
bids2Compare = [matches[matches.file.str.startswith(fn)].bid.unique()[0] for fn in filenames]
|
||||
|
||||
pairs = list(itertools.combinations(bids2Compare, 2))
|
||||
pairs
|
||||
res = []
|
||||
for p in pairs:
|
||||
p
|
||||
simi = cosine_similarity(vectorDF[vectorDF.bid == p[0]].iloc[0].dtm,
|
||||
vectorDF[vectorDF.bid == p[1]].iloc[0].dtm)
|
||||
res.append(simi[0][0])
|
||||
return res
|
||||
except Exception as e:
|
||||
logging.error(e)
|
||||
|
||||
# import swifter
|
||||
clustersDF['simi'] = clustersDF.members.apply(lambda x: getSimilarity(x))
|
||||
save_zipped_pickle(clustersDF, join(DATA_PATH, 'study_clusters'))
|
||||
|
||||
clustersDF
|
||||
|
||||
shapes = clustersDF[clustersDF.type == 'shapes']
|
||||
actions = clustersDF[clustersDF.type == 'actions']
|
||||
tokens = clustersDF[clustersDF.type == 'tokens']
|
||||
|
||||
# shapes
|
||||
# yList = [list(itertools.chain.from_iterable(shapes.simi.values.tolist())),
|
||||
# list(itertools.chain.from_iterable(actions.simi.values.tolist())),
|
||||
# list(itertools.chain.from_iterable(tokens.simi.values.tolist()))]
|
||||
# colNames = ['shapes','actions','tokens']
|
||||
|
||||
ys = []
|
||||
cols = []
|
||||
means = []
|
||||
# plotBox(yList, colNames, 'bugReport' + '.pdf', True)
|
||||
for ds in [shapes, actions, tokens]:
|
||||
ds['ms'] = ds.members.str.len()
|
||||
ds.sort_values(by=['ms'], ascending=False, inplace=True)
|
||||
top10 = ds.head(20)
|
||||
|
||||
colNames = top10.cid.values.tolist()
|
||||
# colNames = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19]
|
||||
colNames = list(range(len(colNames)))
|
||||
yList = yList = top10.simi.values.tolist()
|
||||
# yList = [np.mean(i) for i in yList]
|
||||
# colNames.insert(0,'ALL')
|
||||
# yList.insert(0,list(itertools.chain.from_iterable(ds.simi.values.tolist())))
|
||||
mean = np.mean(list(itertools.chain.from_iterable(ds.simi.values.tolist())))
|
||||
type = ds.type.iloc[0]
|
||||
# from common.commons import plotBox
|
||||
# plotBox(yList,colNames,type+'.pdf',False)
|
||||
ys.append(yList)
|
||||
cols.append(colNames)
|
||||
means.append(mean)
|
||||
plotBox2(ys, cols, 'test.pdf', means, False)
|
||||
Reference in New Issue
Block a user