merge python scripts

2020-04-06 21:30:39 +02:00
parent 61a9612345
commit c5463f91f8
75 changed files with 95073 additions and 12 deletions
@@ -0,0 +1,306 @@
+
+import redis
+
+from common.commons import *
+
+# Runtime configuration, read from the process environment at import time.
+# A missing variable raises KeyError while this module is being imported.
+# DATA_PATH: not referenced by any live code visible here.
+DATA_PATH = os.environ["DATA_PATH"]
+# DATASET: root directory under which dumpFilesCore looks up
+# <project>/DiffEntries/<file> (note the lower-case variable name "dataset").
+DATASET = os.environ["dataset"]
+# jdk8: value of the JDK8 environment variable
+# (presumably a JDK 8 home directory - TODO confirm).
+jdk8 = os.environ["JDK8"]
+# def localPairCore(aTuple):
+#     redis_db = redis.StrictRedis(host="localhost", port=6380, db=1)
+#     idx, key = aTuple
+#
+#
+#
+#     val = redis_db.get(key)
+#
+#     res = val.decode().split(',')
+#     res.insert(0, key.decode().split('_')[1:])
+#     res.insert(0, key.decode())
+#     # matches.loc[idx] = res
+#     return res
+# First definition of `ast`: node-type names that look like Eclipse JDT AST node
+# kinds (presumably for Java sources - TODO confirm).
+# NOTE: `ast` is reassigned right after this list, before the edit-action patterns
+# are built, so this list does not contribute to movPattern / delPattern /
+# insPattern / updPattern.
+ast = ["AnonymousClassDeclaration", "ArrayAccess", "ArrayCreation", "ArrayInitializer", "ArrayType", "AssertStatement",
+       "Assignment", "Block", "BooleanLiteral", "BreakStatement", "CastExpression", "CatchClause", "CharacterLiteral",
+       "ClassInstanceCreation", "CompilationUnit", "ConditionalExpression", "ConstructorInvocation",
+       "ContinueStatement", "DoStatement", "EmptyStatement", "ExpressionStatement", "FieldAccess", "FieldDeclaration",
+       "ForStatement", "IfStatement", "ImportDeclaration", "InfixExpression", "Initializer", "Javadoc",
+       "LabeledStatement", "MethodDeclaration", "MethodInvocation", "NullLiteral", "NumberLiteral",
+       "PackageDeclaration", "ParenthesizedExpression", "PostfixExpression", "PrefixExpression", "PrimitiveType",
+       "QualifiedName", "ReturnStatement", "SimpleName", "SimpleType", "SingleVariableDeclaration", "StringLiteral",
+       "SuperConstructorInvocation", "SuperFieldAccess", "SuperMethodInvocation", "SwitchCase", "SwitchStatement",
+       "SynchronizedStatement", "ThisExpression", "ThrowStatement", "TryStatement", "TypeDeclaration",
+       "TypeDeclarationStatement", "TypeLiteral", "VariableDeclarationExpression", "VariableDeclarationFragment",
+       "VariableDeclarationStatement", "WhileStatement", "InstanceofExpression", "LineComment", "BlockComment",
+       "TagElement", "TextElement", "MemberRef", "MethodRef", "MethodRefParameter", "EnhancedForStatement",
+       "EnumDeclaration", "EnumConstantDeclaration", "TypeParameter", "ParameterizedType", "QualifiedType",
+       "WildcardType", "NormalAnnotation", "MarkerAnnotation", "SingleMemberAnnotation", "MemberValuePair",
+       "AnnotationTypeDeclaration", "AnnotationTypeMemberDeclaration", "Modifier", "UnionType", "Dimension",
+       "LambdaExpression", "IntersectionType", "NameQualifiedType", "CreationReference", "ExpressionMethodReference",
+       "SuperMethodReference", "TypeMethodReference", "MethodName", "Operator", "New", "Instanceof"]
+
+# Effective definition of `ast`: this assignment replaces whatever `ast` was bound
+# to before, and it is the list joined with '|' to form the node-type alternation
+# of the edit-action patterns defined below.
+# The names look like srcML element tags - TODO confirm.
+ast = ["unit","comment","name","type","condition","block","index","decltype","typename","atomic","assert","generic_selection","selector","association_list",
+       "association","expr_stmt","expr","decl_stmt","decl","range","break","continue","goto","label","typedef","asm","enum","ternary","elseif","while","lock",
+       "fixed","checked","unchecked","unsafe","do","switch","case","default","for","foreach","group","init","incr","function","function_decl","lambda","specifier",
+       "return","call","sizeof","parameter_list","param","krparameter_list","krparam","argument_list","argument","capture","struct","struct_decl","union","union_decl",
+       "class","class_decl","public","private","protected","namespace","using","try","catch","finally","throw","throws","noexcept","template","directive","file","number",
+       "include","define","undef","line","if","ifdef","ifndef","else","elif","endif","then","pragma","error","macro","value","import","constructor_decl","empty_stmt","escape","annotation","alignof","forever","extern"]
+
+# Regular-expression sources for the four edit-action line formats (MOV, DEL, INS,
+# UPD). "{0}" is filled with the '|'-joined node-type names from `ast`, giving one
+# capture group per node type and one per "(.*)" payload.
+movPattern = 'MOV ({0})@@(.*)@TO@ ({0})@@(.*)@AT@'.format('|'.join(ast))
+delPattern = 'DEL ({0})@@(.*)@AT@'.format('|'.join(ast))
+insPattern = 'INS ({0})@@(.*)@TO@ ({0})@@(.*)@AT@'.format('|'.join(ast))
+updPattern = 'UPD ({0})@@(.*)@TO@(.*)@AT@'.format('|'.join(ast))
+
+def loadPairMulti(root,clusterPath,level):
+    """Load the pair keys stored for ``root`` in Redis into a DataFrame.
+
+    Keys matching ``root + '-*'`` are read from db 2 of the Redis server on
+    localhost:6399. As parsed below, a key has the layout
+    ``<root>-<size>[-<action>[-<token>]]/<path1>/<path2>``.
+
+    Parameters:
+        root: key prefix (node type) whose pairs are loaded.
+        clusterPath: accepted for interface compatibility; not used here.
+        level: 'actions' adds an ``actions`` column, 'tokens' adds ``actions``
+            and ``tokens``; any other value adds neither.
+
+    Returns:
+        DataFrame with the columns ``pairs_key`` (decoded key), ``pairs``
+        (list of the '/'-separated parts after the header), ``tuples`` (the
+        same as a tuple), ``path1``, ``path2``, ``sizes`` and, depending on
+        ``level``, ``actions`` / ``tokens``.
+    """
+    logging.info(root)
+    port = 6399
+    redis_db = redis.StrictRedis(host="localhost", port=port, db=2)
+    # A single SCAN call is only one step of a cursor iteration and is not
+    # guaranteed to return every matching key, whatever COUNT hint is given.
+    # scan_iter keeps following the cursor until the server reports completion.
+    keys = list(redis_db.scan_iter(match=root+'-*', count=100000000))
+
+    matches = pd.DataFrame(keys,columns=['pairs_key'])
+    # Redis returns bytes; everything below works on str.
+    matches['pairs_key']=matches['pairs_key'].apply(lambda x:x.decode())
+    # Part after the root prefix, split on '/': element 0 is the
+    # "-size[-action[-token]]" header, the remaining elements are the pair.
+    matches['pairs']=matches['pairs_key'].apply(lambda x:x.split(root)[1].split('/')[1:])
+    # Tuples are what clusterCore feeds to networkx as the edge list.
+    matches['tuples'] = matches.pairs.apply(lambda x: tuple(x))
+    matches['path1']=matches['pairs'].apply(lambda x:x[0])
+    matches['path2']=matches['pairs'].apply(lambda x:x[1])
+    matches['sizes']=matches['pairs_key'].apply(lambda x:x.split(root)[1].split('/')[0].split('-')[1])
+    if level in ('actions', 'tokens'):
+        matches['actions']=matches['pairs_key'].apply(lambda x:x.split('/')[0].split('-')[2])
+    if level == 'tokens':
+        matches['tokens']=matches['pairs_key'].apply(lambda x:x.split('/')[0].split('-')[3])
+
+    return matches
+
+
+
+def getMapping(pathMapping,x):
+    """Record the two members of a pair row in ``pathMapping``.
+
+    ``x['pairs']`` must unpack into exactly two keys; the first is mapped to
+    ``x['path1']`` and the second to ``x['path2']``. ``pathMapping`` is
+    updated in place and nothing is returned.
+    """
+    firstKey, secondKey = x['pairs']
+    # Read both targets before writing so a missing column leaves the mapping untouched.
+    targets = (x['path1'], x['path2'])
+    for node, path in zip((firstKey, secondKey), targets):
+        pathMapping[node] = path
+
+
+
+
+
+def cluster(clusterPath,pairsPath, level,rootType):
+    """Cluster the stored pairs of every root found under ``pairsPath``.
+
+    For each non-hidden entry of ``pairsPath`` the pairs are loaded with
+    loadPairMulti and partitioned by size, then (for level 'actions') by
+    action, or (for level 'tokens') by action and token. Each partition is
+    handed to clusterCore. Any other level is partitioned by size only.
+
+    Parameters:
+        clusterPath: output directory; created if missing.
+        pairsPath: directory whose entries name the roots to process.
+        level: 'actions', 'tokens', or anything else for size-only grouping.
+        rootType: accepted for interface compatibility; not used by the body.
+
+    Errors are logged and swallowed (best effort): the first exception aborts
+    the remaining work but is not propagated to the caller.
+    """
+    try:
+        os.makedirs(clusterPath, exist_ok=True)
+        roots = [i for i in listdir(pairsPath) if not i.startswith('.')]
+        for root in roots:
+            matches = loadPairMulti(root,clusterPath,level)
+            for s in matches['sizes'].unique().tolist():
+                sizeMatch = matches[matches['sizes'] == s]
+
+                # Each nesting level filters into its own variable. Reusing one
+                # name would narrow the frame on the first iteration and leave
+                # every later action/token with an empty selection.
+                if level == 'actions':
+                    for action in sizeMatch['actions'].unique().tolist():
+                        actionMatch = sizeMatch[sizeMatch['actions'] == action]
+                        clusterCore(clusterPath,  level, actionMatch, pairsPath, root, s,action)
+                elif level == 'tokens':
+                    for action in sizeMatch['actions'].unique().tolist():
+                        actionMatch = sizeMatch[sizeMatch['actions'] == action]
+                        for token in actionMatch['tokens'].unique().tolist():
+                            tokenMatch = actionMatch[actionMatch['tokens'] == token]
+                            clusterCore(clusterPath, level, tokenMatch, pairsPath, root, s, action,token)
+                else:
+                    clusterCore(clusterPath,  level, sizeMatch, pairsPath, root, s,'')
+
+    except Exception as ex:
+        # Deliberate best-effort boundary: keep the caller alive, but record
+        # the traceback so the failure can be diagnosed.
+        logging.exception(ex)
+
+
+def clusterCore(clusterPath, level, match, pairsPath, root, s,action ,token=''):
+    """Group one partition of pairs into connected components and export them.
+
+    The pair tuples in ``match.tuples`` are used as the edge list of an
+    undirected graph; every connected component is one cluster. Each node is
+    translated to a dump-file name through the partition's ``.index`` file and
+    one work item per node is passed to dumpFilesCore via parallelRun.
+
+    Parameters:
+        clusterPath: output root, forwarded to dumpFilesCore.
+        level: selects the index file:
+            'actions' -> <pairsPath>/<root>/<s>/<action>.index
+            'shapes'  -> <pairsPath>/<root>/<s>.index
+            otherwise -> <pairsPath>/<root>/<s>/<action>/<token>.index
+        match: DataFrame with a ``tuples`` column of node pairs.
+        pairsPath, root, s, action, token: components of the index-file path;
+            also forwarded in each work item.
+
+    The index file is read as a header-less CSV whose first column is the
+    integer node id and whose second column is the dump-file name.
+    """
+    import networkx
+    g = networkx.Graph(match.tuples.values.tolist())
+
+    # connected_component_subgraphs was removed in networkx 2.4;
+    # connected_components yields the node set of each component directly.
+    # Only the node sets are needed here. Node order inside a cluster is not
+    # significant: it only affects the order of the work items.
+    clusters = []
+    for component in networkx.connected_components(g):
+        logging.info('Cluster size %d',len(component))
+        clusters.append(component)
+
+    if level == 'actions':
+        indexFile = join(pairsPath, root, s,action+'.index')
+    elif level == 'shapes':
+        indexFile = join(pairsPath, root, s + '.index')
+    else:
+        indexFile =join(pairsPath, root, s,action,token+'.index')
+    df = pd.read_csv(indexFile, header=None, usecols=[0, 1], index_col=[0])
+    # With header=None the second CSV column is labelled 1; its dict maps
+    # node id -> dump-file name.
+    dumpFileById = df.to_dict()[1]
+
+    workList = []
+    for idx, clus in enumerate(clusters):
+        logging.info('exporting cluster %s %s %s %d', root,s,action,idx)
+        for f in clus:
+            # Graph nodes are strings taken from the Redis key; the index is int.
+            dumpFile = dumpFileById[int(f)]
+            workList.append((dumpFile,root,level,clusterPath,s,action,token,idx))
+
+    parallelRun(dumpFilesCore,workList)
+
+
+def dumpFilesCore(t):
+    """Copy the diff entry behind one cluster member into its cluster directory.
+
+    Parameters:
+        t: work item ``(dumpFile, root, level, clusterPath, s, action, token,
+            idx)`` as built by clusterCore. ``dumpFile`` is split on '_': the
+            first part is the project name, the last part is dropped, and the
+            parts in between (re-joined with '_') form the file name.
+
+    The source ``<DATASET>/<project>/DiffEntries/<filename>`` is copied to
+    ``<clusterPath>/<root>/<s>/<idx>/<dumpFile>``; the target directory is
+    created if missing.
+
+    Raises:
+        Any exception from the copy is logged together with the work item and
+        re-raised unchanged.
+    """
+    try:
+        dumpFile, root, level, clusterPath, s, action, token, idx = t
+        split = dumpFile.split('_')
+        project = split[0]
+        filename = "_".join(split[1:-1])
+        filePath = join(DATASET, project, 'DiffEntries', filename)
+
+        # NOTE(review): the target directory does not include action/token,
+        # although clusterCore numbers clusters from 0 for every
+        # (root, s, action, token) partition. At the 'actions' and 'tokens'
+        # levels, clusters from different partitions therefore share a
+        # directory - confirm whether that is intended.
+        clusterSavePath = join(clusterPath, root, s, str(idx))
+        os.makedirs(clusterSavePath, exist_ok=True)
+        shutil.copy(filePath,join(clusterSavePath,dumpFile))
+
+    except Exception as ex:
+        logging.error(t)
+        logging.error(ex)
+        # Bare raise re-raises the active exception with its original traceback.
+        raise
+
+
+
+
+
+
+
+