from common.commons import *

# Environment-driven configuration. All of these must be set before import;
# a missing variable fails fast with KeyError here rather than mid-pipeline.
DATA_PATH = os.environ["DATA_PATH"]
SPINFER_PATH = os.environ["spinfer"]
SPINFER_INDEX_PATH = os.environ["dataset"]
COCCI_PATH = join(os.environ["coccinelle"], 'spatch')
DATASET = os.environ["dataset"]


def indexCore():
    """Write spinfer .index files, one per cluster, into SPINFER_INDEX_PATH.

    Walks DATA_PATH/shapes/<root>/<size>/<cluster>/ directories. Each cluster
    member name is assumed to look like '<project>_<file>.txt_<hunk>' — the
    hunk suffix is stripped and the project prefix split off (TODO confirm
    against the producer of the 'shapes' tree). Clusters with more than one
    distinct member get an index file listing, per member, a
    '<project>/prevFiles/prev_<file> <project>/revFiles/<file>' pair line.
    """
    print(SPINFER_INDEX_PATH)
    clusterPath = join(DATA_PATH, 'shapes')

    roots = listdir(clusterPath)
    # Skip hidden entries and stray pickles at the top level.
    roots = [r for r in roots if not (r.startswith('.') or r.endswith('.pickle'))]
    for root in roots:
        sizes = [s for s in listdir(join(clusterPath, root)) if not s.startswith('.')]
        for size in sizes:
            clusters = [c for c in listdir(join(clusterPath, root, size))
                        if not c.startswith('.')]
            for cluster in clusters:
                members = [m for m in listdir(join(clusterPath, root, size, cluster))
                           if not m.startswith('.')]
                # Drop the '.txt_<hunk>' suffix so multiple hunks of the same
                # file collapse to one member, then de-duplicate.
                members = [re.sub(r'\.txt_\d+', '', m) for m in members]
                members = list(set(members))

                if len(members) > 1:
                    lines = []
                    for member in members:
                        split = member.split('_')
                        pj = split[0]                 # project prefix
                        member = '_'.join(split[1:])  # original file name
                        line = (pj + '/prevFiles/prev_' + member + ' '
                                + pj + '/revFiles/' + member + '\n')
                        lines.append(line)
                    if len(lines) > 0:
                        indexName = root + "_" + size + '_' + cluster + '.index'
                        with open(join(SPINFER_INDEX_PATH, indexName), 'w',
                                  encoding='utf-8') as writeFile:
                            writeFile.write(''.join(lines))


def test():
    """Move every .index file that has no matching .cocci into indexNC/.

    Lets a later run see at a glance which indexes spinfer failed to turn
    into a semantic patch.
    """
    indexes = [i for i in listdir(SPINFER_INDEX_PATH) if i.endswith('.index')]
    coccis = listdir(join(SPINFER_INDEX_PATH, 'cocci'))
    if not os.path.exists(join(SPINFER_INDEX_PATH, 'indexNC')):
        os.mkdir(join(SPINFER_INDEX_PATH, 'indexNC'))
    for i in indexes:
        if re.sub(r'\.index', '.cocci', i) not in coccis:
            shutil.move(join(SPINFER_INDEX_PATH, i),
                        join(SPINFER_INDEX_PATH, 'indexNC', i))


def runSpinfer():
    """Run spinfer over every .index file to produce cocci/<name>.cocci.

    Indexes are processed smallest-first; files under 500 bytes go in one
    batch and the rest in a second batch (both run through parallelRun, so
    the split keeps long-running big inputs from starving small ones).
    Indexes that already have a .cocci output are skipped (resumable).
    """
    indexCore()
    indexes = [i for i in listdir(SPINFER_INDEX_PATH) if i.endswith('.index')]
    if not os.path.exists(join(SPINFER_INDEX_PATH, 'cocci')):
        os.mkdir(join(SPINFER_INDEX_PATH, 'cocci'))
    # spinfer commands below use paths relative to the index directory.
    os.chdir(SPINFER_INDEX_PATH)

    pairs = [(os.path.getsize(i), i) for i in indexes]
    pairs.sort(key=lambda s: s[0])  # smallest inputs first

    coccis = listdir(join(SPINFER_INDEX_PATH, 'cocci'))
    cmdList = []
    bigCmdList = []
    for size, idx in pairs:
        cocciName = re.sub(r'\.index', '.cocci', idx)
        if cocciName in coccis:
            continue  # already inferred on a previous run
        cmd = SPINFER_PATH + " --no-progress -f " + idx + " -o cocci/" + cocciName
        if size < 500:
            cmdList.append(cmd)
        else:
            bigCmdList.append(cmd)

    parallelRun(callSpinfer, cmdList)
    parallelRun(callSpinfer, bigCmdList)


def divideCoccis():
    """Split multi-rule .cocci files into one file per rule.

    A timestamped backup of the whole cocci/ directory is copied into
    DATA_PATH first. Rules are delimited by lines that are exactly '@@\\n';
    every second delimiter starts a new rule, so consecutive pairs of those
    positions give the slice boundaries. Files with no delimiters are
    removed; files that were split are removed after their parts are written.
    """
    import datetime
    shutil.copytree(join(SPINFER_INDEX_PATH, 'cocci'),
                    join(DATA_PATH,
                         'cocci' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')))

    coccis = os.listdir(join(SPINFER_INDEX_PATH, 'cocci'))
    for cocci in coccis:
        with open(join(SPINFER_INDEX_PATH, 'cocci', cocci), 'r') as iFile:
            idx = iFile.readlines()

        values = np.array(idx)
        points = np.where(values == '@@\n')
        points = list(itertools.chain.from_iterable(points))
        if len(points) == 0:
            # No rule delimiters at all: spinfer produced nothing usable.
            os.remove(join(SPINFER_INDEX_PATH, 'cocci', cocci))
            continue

        # Every second '@@' opens a rule header; pairwise gives the start
        # lines of consecutive rules.
        patches = list(pairwise(points[::2]))
        if len(patches) > 0:
            i = 0
            for t in patches:
                with open(join(SPINFER_INDEX_PATH, 'cocci', cocci + str(i)), 'w') as oFile:
                    oFile.writelines(idx[t[0]:t[1]])
                i = i + 1
            # Tail: from the last pair's second boundary to end of file.
            with open(join(SPINFER_INDEX_PATH, 'cocci', cocci + str(i)), 'w') as oFile:
                oFile.writelines(idx[t[1]:])
            os.remove(join(SPINFER_INDEX_PATH, 'cocci', cocci))


def removeDuplicates():
    """Delete .cocci files whose comment-stripped pattern duplicates another.

    Also records a DataFrame (cid, pattern, inferedFrom, iFiles) of all
    patterns to DATA_PATH/allCocciPatterns.pickle for later filtering.
    NOTE(review): the '// Infered from:' header is assumed present in every
    .cocci file — re.search returning None would raise AttributeError here.
    """
    # C-style block and line comments.
    commentPattern = r"(/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+/)|(//.*)"
    coccis = os.listdir(join(SPINFER_INDEX_PATH, 'cocci'))
    cocciPatterns = pd.DataFrame(columns=['cid', 'pattern', 'inferedFrom'])
    ind = 0
    for cocci in coccis:
        with open(join(SPINFER_INDEX_PATH, 'cocci', cocci), 'r') as iFile:
            idx = iFile.read()
        inferedFrom = re.search(r"// Infered from:(.*)\n", idx).groups()
        # BUGFIX: the original passed re.DOTALL (==16) as re.sub's positional
        # `count` argument, silently capping comment removal at 16 matches.
        # Unbounded substitution is intended (DOTALL itself would be wrong:
        # '//.*' under DOTALL would consume to end-of-string).
        pattern = re.sub(commentPattern, '', idx)
        cocciPatterns.loc[ind] = [cocci, pattern, inferedFrom]
        ind = ind + 1

    cocciPatterns['iFiles'] = cocciPatterns.inferedFrom.apply(lambda x: getInferred(x[0]))
    save_zipped_pickle(cocciPatterns, join(DATA_PATH, 'allCocciPatterns.pickle'))

    allPatterns = cocciPatterns.cid.values.tolist()
    uniquePatterns = cocciPatterns.drop_duplicates(subset=['pattern']).cid.values.tolist()
    toRemove = list(set(allPatterns).difference(uniquePatterns))
    print(toRemove)
    for p in toRemove:
        os.remove(join(SPINFER_INDEX_PATH, 'cocci', p))
    print(len(uniquePatterns))


def filterPatterns():
    """Return cids of patterns inferred from libtiff, php-src or cpython files."""
    cocciPatterns = load_zipped_pickle(join(DATA_PATH, 'allCocciPatterns.pickle'))
    cocciPatterns['filtered'] = cocciPatterns.iFiles.apply(
        lambda x: [i for i in x if (i.startswith('(libtiff')
                                    or i.startswith('(php-src')
                                    or i.startswith('(cpython'))])
    return cocciPatterns[cocciPatterns.filtered.str.len() > 0].cid.values.tolist()


def getInferred(x):
    """Return every parenthesised '(...)' substring of x (non-greedy)."""
    results = []
    for match in re.finditer(r"\(.*?\)", x):
        results.append(match.group())
    return results


def getNegLines(x):
    """Tokenize the removed ('-' prefixed) lines of a cocci/diff pattern.

    Collects the text after each leading '-' and runs it through the
    project's getTokensForPatterns tokenizer.
    """
    res = []
    for match in re.finditer(r"^-(.*)", x, re.MULTILINE):
        # Single capture group: the line content after the leading '-'.
        res.append(match.group(1))
    from common.preprocessing import getTokensForPatterns
    return getTokensForPatterns(res)


def patternOperations():
    """Augment the saved pattern DataFrame with per-pattern token lists."""
    cocciPatterns = load_zipped_pickle(join(DATA_PATH, 'allCocciPatterns.pickle'))
    cocciPatterns['patternTokens'] = cocciPatterns.pattern.apply(lambda x: getNegLines(x))
    save_zipped_pickle(cocciPatterns, join(DATA_PATH, 'allCocciPatterns.pickle'))


def filterCore(t):
    """Build one spatch work item from a (source path, spfile) pair.

    Returns (cmd, manybug, patchName, spfile, srcPath) where cmd invokes
    spatch with --patch, writing the diff to
    introclass/<manybug>/patches/<patchName><spfile>.txt.
    """
    src, spfile = t
    srcPath = src
    patchName = src.split('/')[-1]
    manybug = src.split('/')[-2]
    cmd = (COCCI_PATH + ' --sp-file ' + join(DATASET, 'cocci', spfile) + ' ' + srcPath
           + ' --patch -o' + join(DATA_PATH, "introclass", manybug, 'patches', patchName)
           + ' > ' + join(DATA_PATH, "introclass", manybug, 'patches',
                          patchName + spfile + '.txt'))
    return cmd, manybug, patchName, spfile, srcPath


def patchCoreIntro():
    """Apply every inferred semantic patch to every introclass source file.

    Recreates each bug's patches/ and patched/ directories, builds the full
    (source, spfile) cross product, turns each pair into a spatch command via
    filterCore (in parallel), then runs cocciCore2 over the work list.
    """
    manybugs = listdir(join(DATA_PATH, "introclass"))
    spfiles = listdir(join(DATASET, 'cocci'))
    filterList = []
    for manybug in manybugs:
        # Start each run from empty patches/ and patched/ directories.
        for sub in ('patches', 'patched'):
            target = join(DATA_PATH, "introclass", manybug, sub)
            if os.path.exists(target):
                shutil.rmtree(target)
            os.mkdir(target)

        files = get_filepaths(join(DATA_PATH, "introclass", manybug), '.c')
        sources = [i for i in files
                   if not (i.endswith('oracle.c.patch') or i.endswith('oracle.c'))]
        filterList.extend(list(itertools.product(sources, spfiles)))

    print(len(filterList))
    workList = parallelRunMerge(filterCore, filterList)
    workList = list(filter(None, workList))
    print(len(workList))
    parallelRun(cocciCore2, workList)


def patched():
    """Re-apply spfiles that previously produced a patch, emitting .c outputs.

    For each introclass source, derives the spfile names from the .txt files
    left in its patches/ directory and runs spatch with -o into
    introclass_patched/<manybug>/patches/.
    """
    manybugs = listdir(join(DATA_PATH, "introclass"))
    workList = []
    for manybug in manybugs:
        target = join(DATA_PATH, "introclass_patched", manybug, 'patches')
        if os.path.exists(target):
            shutil.rmtree(target)
        os.makedirs(target)

        files = get_filepaths(join(DATA_PATH, "introclass", manybug), '.c')
        sources = [i for i in files
                   if not (i.endswith('oracle.c.patch') or i.endswith('oracle.c'))]
        for src in sources:
            srcPath = src
            patchName = src.split('/')[-1]
            spfiles = listdir(join(DATA_PATH, "introclass", manybug, 'patches'))
            for spfile in spfiles:
                # Recover the spfile name from '<patchName><spfile>.txt'.
                spfile = spfile.replace(patchName, '').replace('.txt', '')
                cmd = (COCCI_PATH + ' --sp-file ' + join(DATASET, 'cocci', spfile)
                       + ' ' + srcPath + ' -o '
                       + join(DATA_PATH, "introclass_patched", manybug, 'patches',
                              patchName + spfile + '.c'))
                workList.append((cmd, manybug, patchName, spfile))
    parallelRun(cocciCore, workList)


def cocciCore2(t):
    """Run one spatch command; if it produced a non-empty diff, apply it.

    Empty diffs are deleted. Non-empty ones are fed to the system `patch`
    tool, writing the patched file into introclass/<manybug>/patched/.
    """
    cmd, manybug, patchName, spfile, srcPath = t
    logging.info(cmd)
    output, e = shellGitCheckout(cmd)
    diffPath = join(DATA_PATH, "introclass", manybug, 'patches',
                    patchName + spfile + '.txt')
    if os.path.getsize(diffPath) == 0:
        os.remove(diffPath)
    else:
        cmd = ('patch -d ' + join(DATA_PATH, "introclass", manybug)
               + ' -i ' + diffPath
               + ' -o ' + join(DATA_PATH, "introclass", manybug, 'patched',
                               patchName + spfile + '.c'))
        o, e = shellGitCheckout(cmd)


def cocciCore(t):
    """Run one spatch command and delete its output diff if it is empty."""
    cmd, manybug, patchName, spfile = t
    output, e = shellGitCheckout(cmd)
    diffPath = join(DATA_PATH, "introclass", manybug, 'patches',
                    patchName + spfile + '.txt')
    if os.path.getsize(diffPath) == 0:
        os.remove(diffPath)