merge python scripts

2020-04-06 21:30:39 +02:00
parent 61a9612345
commit c5463f91f8
75 changed files with 95073 additions and 12 deletions
@@ -0,0 +1,233 @@
+from common.commons import *
+# Base directory for every pickle, the redis directory and the cluster folders
+# read or written below. Required: a missing variable raises KeyError on import.
+DATA_PATH = os.environ["DATA_PATH"]
+# Forwarded unchanged to startDB() when the redis instance is started.
+# Required: a missing variable raises KeyError on import.
+PROJECT_TYPE = os.environ["PROJECT_TYPE"]
+
+
+def bStats():
+    if isfile(join(DATA_PATH, 'studyBugReports.pickle')):
+        studyBugReports = load_zipped_pickle(join(DATA_PATH, 'studyBugReports.pickle'))
+    else:
+        brs = load_zipped_pickle(join(DATA_PATH, args.subject + "bugReportsComplete.pickle"))
+        commits = load_zipped_pickle(join(DATA_PATH, 'singleBR.pickle'))
+
+        dbDir = join(DATA_PATH, 'redis')
+
+        portInner = '6399'
+        startDB(dbDir, portInner, PROJECT_TYPE)
+
+        import redis
+
+        redis_db = redis.StrictRedis(host="localhost", port=portInner, db=0)
+        keys = redis_db.scan(0, match='*', count='1000000')
+
+        matches = pd.DataFrame(keys[1], columns=['pairs_key'])
+
+        # matches = load_zipped_pickle(join(DATA_PATH,'singleHunks'))
+        matches['pairs_key'] = matches['pairs_key'].apply(lambda x: x.decode())
+        matches['root'] = matches['pairs_key'].apply(lambda x: x.split('/')[0])
+        matches['size'] = matches['pairs_key'].apply(lambda x: x.split('/')[1])
+        matches['file'] = matches['pairs_key'].apply(lambda x: x.split('/')[2])
+        matches['repo'] = matches['file'].apply(lambda x: x.split('_')[0])
+        matches['commit'] = matches['file'].apply(lambda x: x.split('_')[1])
+
+        subjects = pd.read_csv(join(DATA_PATH, 'subjects.csv'))
+
+        def getBID(x):
+            try:
+                if x['repo'].endswith('.git'):
+                    return None
+                subject = subjects.query("Repo == '{0}'".format(x['repo'])).Subject.tolist()[0]
+                bids = commits.query(
+                    "commit.str.startswith('{0}') and project== '{1}'".format(x['commit'], subject)).bid.tolist()
+                return bids[0]
+            except Exception as e:
+                logging.error(e)
+
+        matches = matches[~matches.repo.apply(
+            lambda i: (i.startswith('commons-math') or i.startswith('commons-lang') or i.startswith(
+                'closure-compiler') or i.startswith('joda-time') or i.startswith('mockito') or i.startswith(
+                'jfreechart')))]
+        matches['bid'] = matches.apply(lambda x: getBID(x), axis=1)
+
+        subjects
+        # res = pd.merge(matches, brs, on=['bid'])
+        save_zipped_pickle(matches, join(DATA_PATH, 'studyDataset.pickle'))
+        studyBugReports = brs[brs.bid.isin(matches.bid.unique())]
+        save_zipped_pickle(studyBugReports, join(DATA_PATH, 'studyBugReports.pickle'))
+    if isfile(join(DATA_PATH, 'studyBR_DTM_index')):
+        brIndexes = load_zipped_pickle(join(DATA_PATH, 'studyBR_DTM_index'))
+        bugDTM = load_zipped_pickle(join(DATA_PATH, 'studyBR_DTM'))
+        vectorDF = load_zipped_pickle(join(DATA_PATH, 'studyBR_vector'))
+        matches = load_zipped_pickle(join(DATA_PATH, 'studyDataset.pickle'))
+    else:
+        studyBugReports['description'] = studyBugReports['description'].fillna("")
+        studyBugReports['sumDesc'] = studyBugReports['summary'] + studyBugReports['description']
+        # corpus['sumDesc'] = corpus['summary'] + corpus['desc']
+        # from common.preprocessing import
+        # result, aVector = getVectorAndDtm(corpus, 'summary')
+        from common.preprocessing import calculateTfIdfNLList
+
+        corpusBug = studyBugReports['sumDesc'].values.tolist()
+        from common.preprocessing import preprocessingNL
+
+        preCorpusBug = list(map(preprocessingNL, corpusBug))
+
+        v = calculateTfIdfNLList(preCorpusBug)
+        bugDTM = v.transform(preCorpusBug)
+        bugDTM
+        save_zipped_pickle(bugDTM, join(DATA_PATH, 'studyBR_DTM'))
+        brIndexes = studyBugReports['bid'].values.tolist()
+
+        save_zipped_pickle(brIndexes, join(DATA_PATH, 'studyBR_DTM_index'))
+        # from sklearn.metrics.pairwise import cosine_similarity
+        # cosine_similarity(bugDTM[11701], bugDTM[11111])
+        vectorDF = pd.DataFrame(columns=['bid', 'dtm'])
+        # idx = 0
+        for idx, val in enumerate(brIndexes):
+            vectorDF.loc[idx] = [val, bugDTM[idx]]
+        vectorDF
+
+        save_zipped_pickle(vectorDF, join(DATA_PATH, 'studyBR_vector'))
+
+    matches
+    if isfile(join(DATA_PATH, 'study_clusters')):
+        clustersDF = load_zipped_pickle(join(DATA_PATH, 'study_clusters'))
+    else:
+        clustersDF = pd.DataFrame(columns=['cid', 'type', 'members'])
+        idx = 0
+
+        def statsCore(cs, type):
+            global idx
+
+            cs = [i for i in cs if not (i.startswith('commons-math') or i.startswith('commons-lang') or i.startswith(
+                'closure-compiler') or i.startswith('joda-time') or i.startswith('mockito') or i.startswith(
+                'jfreechart'))]
+            # print('Cluster %s : member size %s' % (shape+"-"+size +"-"+cluster, len(cs)))
+            if len(cs) > 0:
+                if token is None:
+                    if action is None:
+                        t = shape + "-" + size + "-" + cluster
+
+                        clustersDF.loc[idx] = [t, type, cs]
+                        idx = idx + 1
+                    else:
+                        t = shape + "-" + size + "-" + cluster + "-" + action  # , len(cs)
+                        clustersDF.loc[idx] = [t, type, cs]
+                        idx = idx + 1
+                else:
+                    # clusterSize = len(cs)
+                    # if clusterSize > 0:
+                    #     clusterSize = len(set([re.split('.txt_[0-9]+', i)[0] for i in cs]))
+                    t = shape + "-" + size + "-" + cluster + "-" + action + "-" + token  # , clusterSize
+                    clustersDF.loc[idx] = [t, type, cs]
+                    idx = idx + 1
+
+        for type in ['tokens', 'actions', 'shapes']:
+            shapesPath = join(DATA_PATH, type)
+            shapes = listdir(shapesPath)
+            shapes = [f for f in shapes if isdir(join(shapesPath, f))]
+            shape = size = cluster = action = token = None
+
+            for shape in shapes:
+                if shape.startswith('.'):
+                    continue
+                sizes = listdir(join(shapesPath, shape))
+
+                for size in sizes:
+                    if size.startswith('.'):
+                        continue
+                    clusters = listdir(join(shapesPath, shape, size))
+                    for cluster in clusters:
+                        if cluster.startswith('.'):
+                            continue
+                        cs = listdir(join(shapesPath, shape, size, cluster))
+
+                        if shapesPath.endswith('shapes'):
+                            cs = listdir(join(shapesPath, shape, size, cluster))
+                            statsCore(cs, 'shapes')
+                        else:
+                            # level3
+                            for action in cs:
+                                if action.startswith('.'):
+                                    continue
+                                tokens = listdir(join(shapesPath, shape, size, cluster, action))
+                                if shapesPath.endswith('actions'):
+                                    statsCore(tokens, 'actions')
+                                else:
+                                    for token in tokens:
+                                        if token.startswith('.'):
+                                            continue
+                                        cs = listdir(join(shapesPath, shape, size, cluster, action, token))
+                                        statsCore(cs, 'tokens')
+
+        clustersDF
+        save_zipped_pickle(clustersDF, join(DATA_PATH, 'study_clusters'))
+        clustersDF
+
+        # selected = clustersDF[clustersDF.type =='shapes']
+
+        from sklearn.metrics.pairwise import cosine_similarity
+        # cosine_similarity(bugDTM[11701], bugDTM[11111])
+        def getSimilarity(x):
+            try:
+                if len(x) == 1:
+                    return [1]
+                else:
+                    filenames = list(set([re.split('.txt_[0-9]+', i)[0] for i in x]))
+                    if len(filenames) == 1:
+                        return [1]
+                    else:
+                        bids2Compare = [matches[matches.file.str.startswith(fn)].bid.unique()[0] for fn in filenames]
+
+                        pairs = list(itertools.combinations(bids2Compare, 2))
+                        pairs
+                        res = []
+                        for p in pairs:
+                            p
+                            simi = cosine_similarity(vectorDF[vectorDF.bid == p[0]].iloc[0].dtm,
+                                                     vectorDF[vectorDF.bid == p[1]].iloc[0].dtm)
+                            res.append(simi[0][0])
+                        return res
+            except Exception as e:
+                logging.error(e)
+
+        # import swifter
+        clustersDF['simi'] = clustersDF.members.apply(lambda x: getSimilarity(x))
+        save_zipped_pickle(clustersDF, join(DATA_PATH, 'study_clusters'))
+
+    clustersDF
+
+    shapes = clustersDF[clustersDF.type == 'shapes']
+    actions = clustersDF[clustersDF.type == 'actions']
+    tokens = clustersDF[clustersDF.type == 'tokens']
+
+    # shapes
+    # yList = [list(itertools.chain.from_iterable(shapes.simi.values.tolist())),
+    # list(itertools.chain.from_iterable(actions.simi.values.tolist())),
+    # list(itertools.chain.from_iterable(tokens.simi.values.tolist()))]
+    # colNames = ['shapes','actions','tokens']
+
+    ys = []
+    cols = []
+    means = []
+    # plotBox(yList, colNames, 'bugReport' + '.pdf', True)
+    for ds in [shapes, actions, tokens]:
+        ds['ms'] = ds.members.str.len()
+        ds.sort_values(by=['ms'], ascending=False, inplace=True)
+        top10 = ds.head(20)
+
+        colNames = top10.cid.values.tolist()
+        # colNames = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19]
+        colNames = list(range(len(colNames)))
+        yList = yList = top10.simi.values.tolist()
+        # yList = [np.mean(i) for i in yList]
+        # colNames.insert(0,'ALL')
+        # yList.insert(0,list(itertools.chain.from_iterable(ds.simi.values.tolist())))
+        mean = np.mean(list(itertools.chain.from_iterable(ds.simi.values.tolist())))
+        type = ds.type.iloc[0]
+        # from common.commons import plotBox
+        # plotBox(yList,colNames,type+'.pdf',False)
+        ys.append(yList)
+        cols.append(colNames)
+        means.append(mean)
+    plotBox2(ys, cols, 'test.pdf', means, False)