diff --git a/python/commitCollector.py b/python/commitCollector.py
index d879f85..9e4338c 100644
--- a/python/commitCollector.py
+++ b/python/commitCollector.py
@@ -1,4 +1,6 @@
+import json
+import pandas as pd
 from common.commons import *
 
@@ -7,46 +9,54 @@ DATA_PATH = os.environ["DATA_PATH"]
 COMMIT_DFS = os.environ["COMMIT_DFS"]
 COMMIT_FOLDER = os.environ["COMMIT_FOLDER"]
 
-def getCommitFromRepo(f,gitrepo,branch):
-    cmd = 'git -C ' + f + ' checkout -f ' + branch
-    output, err = shellGitCheckout(cmd)
+def getCommitFromRepo(f: PathLike, gitrepo: str, branch: str):
+    """
+    Check out a branch and collect its commit metadata into a DataFrame.
+
+    :param f: Git repo directory
+    :param gitrepo: Repo name
+    :param branch: Branch name
+    :return: Commits DataFrame (one row per commit)
+    """
+    file = f'{gitrepo}.commits'
+    output, err = shellGitCheckout(f'git -C {f} checkout -f {branch}')
     m = re.search(branch, err)
     while not m:
         time.sleep(10)
         logging.info('Waiting for checkout')
 
-    cmd = 'git -C ' + f + " log --no-merges --pretty=format:'{\"commit\":\"%H\",\"commitDate\":\"%ci\",\"title\":\"%f\",\"committer\":\"%ce\"}' > " + gitrepo + '.commits'
-    output = shellCallTemplate(cmd,enc='latin1')
+    # Create commits file (one JSON object per line)
+    form = json.dumps({"commit": "%H", "commitDate": "%ci", "title": "%f", "committer": "%ce"})
+    shellCallTemplate(f"git -C {f} log --no-merges --pretty=format:'{form}' > {file}", enc='latin1')
 
-def makeDF(filename):
-    with open(filename,encoding='latin1') as f:
-        lines = f.readlines()
-        ls = [eval(f) for f in lines]
-        ds = pd.DataFrame.from_dict(ls)
-        ds['commitDate']= ds['commitDate'].apply(lambda x:pd.to_datetime(x))
+    # Collect commits: each log line is a standalone JSON object, so parse line by line
+    commits = [json.loads(line) for line in Path(file).read_text(encoding='latin1').splitlines() if line]
+
+    # Convert to DataFrame
+    ds = pd.DataFrame.from_dict(commits)
+    ds['commitDate'] = pd.to_datetime(ds['commitDate'])
     return ds
 
 
 def caseCollect(subject):
-
     if not os.path.exists(COMMIT_FOLDER):
         os.mkdir(COMMIT_FOLDER)
     if not os.path.exists(COMMIT_DFS):
         os.mkdir(COMMIT_DFS)
-
+
     subjects = pd.read_csv(join(DATA_PATH, 'subjects.csv'))
     if subject == 'ALL':
         tuples = subjects[['Repo', 'Branch']].values.tolist()
     else:
         # repos = subjects.query("Subject == '{0}'".format(subject)).Repo.tolist()
-        tuples = subjects.query("Subject == '{0}'".format(subject))[['Repo', 'Branch']].values.tolist()
+        tuples = subjects.query("Subject == '{0}'".format(subject))[
+            ['Repo', 'Branch']].values.tolist()
 
     for t in tuples:
-        repo,branch = t
+        repo, branch = t
         logging.info(repo)
-        getCommitFromRepo(join(REPO_PATH, repo), join(COMMIT_FOLDER, repo),branch)
+        getCommitFromRepo(join(REPO_PATH, repo), join(COMMIT_FOLDER, repo), branch)
 
     if subject == 'ALL':
         commits = listdir(COMMIT_FOLDER)
@@ -60,6 +70,7 @@ def caseCollect(subject):
         save_zipped_pickle(rDF, join(COMMIT_DFS, repoName + ".pickle"))
         # p.dump(rDF, open(join(COMMIT_DFS, repoName + ".pickle"), "wb"))
 
+
 def caseClone(subject):
     if not os.path.exists(REPO_PATH):
         os.mkdir(REPO_PATH)
@@ -74,4 +85,3 @@ def caseClone(subject):
     for gitrepo in gitrepos:
         cmd = 'git clone ' + gitrepo
         out = shellCallTemplate(cmd)
-
diff --git a/python/common/commons.py b/python/common/commons.py
index f0f15f5..843008f 100644
--- a/python/common/commons.py
+++ b/python/common/commons.py
@@ -1,7 +1,8 @@
-
 import logging
 import sys
 import gzip
+from typing import Union
+
 import numpy as np
 from tqdm import tqdm
 import shutil
@@ -28,16 +29,18 @@ import datetime
 import subprocess
 from pathlib import Path
 
-
+PathLike = Union[os.PathLike, str]
 sourceCodeColumns = ['packageName', 'className', 'methodNames', 'formalParameter',
-                     'methodInvocation', 'memberReference', 'documentation', 'literal', 'rawSource',
'hunks', - 'commitLogs', 'classNameExt'] + 'methodInvocation', 'memberReference', 'documentation', 'literal', 'rawSource', + 'hunks', + 'commitLogs', 'classNameExt'] def nap(): time.sleep(1) + def setLogg(): # logging.basicConfig(filename='app.log', filemode='w',level=logging.DEBUG) root = logging.getLogger() @@ -45,7 +48,8 @@ def setLogg(): ch = logging.StreamHandler(sys.stdout) ch.setLevel(logging.WARNING) - formatter = logging.Formatter('%(asctime)s - %(process)d - %(levelname)s - %(filename)s:%(funcName)s - %(message)s') + formatter = logging.Formatter( + '%(asctime)s - %(process)d - %(levelname)s - %(filename)s:%(funcName)s - %(message)s') ch.setFormatter(formatter) # ch.addFilter(lambda record: record.levelno <= logging.) root.addHandler(ch) @@ -64,6 +68,7 @@ def setLogg(): h2.setFormatter(formatter) root.addHandler(h2) + def setEnv(args): # env = args.env @@ -72,7 +77,6 @@ def setEnv(args): os.environ["ROOT_DIR"] = args.root sys.path.append(args.root) - import yaml # if os.uname().nodename != '': # with open(join(os.environ["ROOT_DIR"], os.uname().nodename + ".config.yml"), 'r') as ymlfile: @@ -113,16 +117,15 @@ def setEnv(args): # os.environ["JDK8"] = cfg['java']['8home'] # os.environ["D4JHOME"] = cfg['defects4j']['home'] - - os.environ["CODE_PATH"] = join(os.environ["ROOT_DIR"],'code/') + os.environ["CODE_PATH"] = join(os.environ["ROOT_DIR"], 'code/') # os.environ["DATA_PATH"] = join(os.environ["ROOT_DIR"],'data/') # os.environ["REPO_PATH"] = join(os.environ["DATA_PATH"], 'gitrepo/') - os.environ["COMMIT_DFS"]= join(os.environ["DATA_PATH"],'commitsDF/') - os.environ["SIMI_DIR"]= join(os.environ["DATA_PATH"],'simi/') + os.environ["COMMIT_DFS"] = join(os.environ["DATA_PATH"], 'commitsDF/') + os.environ["SIMI_DIR"] = join(os.environ["DATA_PATH"], 'simi/') os.environ["DTM_PATH"] = join(os.environ["DATA_PATH"], 'dtm/') os.environ["SIMI_SINGLE"] = join(os.environ["DATA_PATH"], 'simiSingle/') - os.environ["FEATURE_DIR"] = join(os.environ["DATA_PATH"],'features/') - + os.environ["FEATURE_DIR"] = join(os.environ["DATA_PATH"], 'features/') + os.environ["BUG_POINT"] = join(os.environ["DATA_PATH"], 'bugPoints/') os.environ["DEFECTS4J"] = join(os.environ["DATA_PATH"], 'defects4jdata/') @@ -139,10 +142,6 @@ def setEnv(args): os.environ["DATASET_DIR"] = join(os.environ["DATA_PATH"], 'datasets/') os.environ["REMOTE_PATH"] = '/Volumes/Samsung_T5/data' - - - - logging.info('ROOT_DIR : %s', os.environ["ROOT_DIR"]) logging.info('REPO_PATH : %s', os.environ["REPO_PATH"]) logging.info('CODE_PATH : %s', os.environ["CODE_PATH"]) @@ -159,15 +158,13 @@ def setEnv(args): logging.info('DATASET_DIR : %s', os.environ["DATASET_DIR"]) - def getRun(): import argparse parser = argparse.ArgumentParser(description='') # parser.add_argument('-subject', dest='subject', help='Environment') parser.add_argument('-root', dest='root', help='root folder') - parser.add_argument('-job',dest='job',help='job name') - parser.add_argument('-prop',dest='prop',help='property file') - + parser.add_argument('-job', dest='job', help='job name') + parser.add_argument('-prop', dest='prop', help='property file') args = parser.parse_args() @@ -177,10 +174,9 @@ def getRun(): return args - -def shellCallTemplate4jar(cmd,enc='utf-8'): +def shellCallTemplate4jar(cmd, enc='utf-8'): process = subprocess.Popen(cmd, - stdout=subprocess.PIPE,stderr=PIPE, shell=True,encoding=enc, + stdout=subprocess.PIPE, stderr=PIPE, shell=True, encoding=enc, universal_newlines=True) while True: @@ -195,10 +191,11 @@ def shellCallTemplate4jar(cmd,enc='utf-8'): 
print(output.strip()) break -def shellCallTemplate(cmd,enc='utf-8'): + +def shellCallTemplate(cmd, enc='utf-8'): try: logging.info(cmd) - with Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True,encoding=enc) as p: + with Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True, encoding=enc) as p: output, errors = p.communicate() # print(output) if errors: @@ -212,30 +209,32 @@ def shellCallTemplate(cmd,enc='utf-8'): logging.error(e) return output -def getChildMem(pid,children): +def getChildMem(pid, children): out = subprocess.Popen(['pgrep', '-P', str(pid)], stdout=subprocess.PIPE).communicate()[0].split(b'\n') child = out[0].decode() - if child !='': + if child != '': children.append(child) - getChildMem(child,children) + getChildMem(child, children) else: return children -def getAllChildMe(pid): +def getAllChildMe(pid): childrenProcess = [] - getChildMem(pid,childrenProcess) + getChildMem(pid, childrenProcess) # if child == '': - return sum(map(memory_usage_ps,childrenProcess)) + memory_usage_ps(pid) + return sum(map(memory_usage_ps, childrenProcess)) + memory_usage_ps(pid) # else: # return memory_usage_ps(child) + memory_usage_ps(pid) + + def memory_usage_ps(pid): import subprocess out = subprocess.Popen(['ps', 'v', '-p', str(pid)], - stdout=subprocess.PIPE).communicate()[0].split(b'\n') + stdout=subprocess.PIPE).communicate()[0].split(b'\n') vsz_index = out[0].split().index(b'RSS') if out[1].decode() != '': mem = float(out[1].split()[vsz_index]) / 1024 @@ -243,73 +242,74 @@ def memory_usage_ps(pid): mem = float(0) return mem -def raiseTime(cmd,timeout,my_timer): + +def raiseTime(cmd, timeout, my_timer): my_timer.cancel() raise TimeoutExpired(cmd, timeout) + def killP(pid): - out = subprocess.Popen(['kill', str(pid)], - stdout=subprocess.PIPE).communicate()[0].split(b'\n') + out = subprocess.Popen(['kill', str(pid)], stdout=subprocess.PIPE).communicate()[0].split(b'\n') out -def shellGitCheckout(cmd,timeout =600,enc='utf-8'): +def shellGitCheckout(cmd, timeout=600, enc='utf-8'): output = '' errors = '' # logging.debug(cmd) - with Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True,encoding=enc) as p: + with Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True, encoding=enc) as p: try: output, errors = p.communicate(timeout=timeout) # print(output) - logging.debug(cmd + '\t' +output) + logging.debug(cmd + '\t' + output) # logging.info(errors) if errors: raise CalledProcessError(errors, '-1') output except CalledProcessError as e: - logging.debug(cmd +'\t'+ errors) + logging.debug(cmd + '\t' + errors) except TimeoutExpired as t: p.terminate() p.communicate() # p.kill() - logging.warning(cmd +'\t'+str(t)) - return output,errors + logging.warning(cmd + '\t' + str(t)) + return output, errors -def callSpinfer(cmd,timeout =600,enc='utf-8'): + +def callSpinfer(cmd, timeout=600, enc='utf-8'): output = '' errors = '' # logging.debug(cmd) my_timer = None - with Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True,encoding=enc) as p: + with Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True, encoding=enc) as p: try: start = datetime.datetime.now() memusage = getAllChildMe(p.pid) # isExit = False - while(memusage != 0.0): + while (memusage != 0.0): end = datetime.datetime.now() elapsed = end - start - if(elapsed.seconds > timeout): - raise TimeoutExpired(cmd,timeout) + if (elapsed.seconds > timeout): + raise TimeoutExpired(cmd, timeout) memusage = getAllChildMe(p.pid) # print(str(p.pid) + " ; " + str(memusage)) if memusage > 2000: # isExit = True - raise TimeoutExpired(cmd,timeout) + raise TimeoutExpired(cmd, timeout) output, 
errors = p.communicate(timeout=timeout) # print(output) - logging.debug(cmd + '\t' +output) + logging.debug(cmd + '\t' + output) # logging.info(errors) if errors: raise CalledProcessError(errors, '-1') output except CalledProcessError as e: - logging.debug(cmd +'\t'+ errors) + logging.debug(cmd + '\t' + errors) except TimeoutExpired as t: # my_timer.cancel() - childrenProcess = [] getChildMem(p.pid, childrenProcess) [killP(i) for i in childrenProcess] @@ -317,30 +317,35 @@ def callSpinfer(cmd,timeout =600,enc='utf-8'): p.terminate() p.communicate() # p.kill() - logging.warning(cmd +'\t'+str(t)) - return output,errors + logging.warning(cmd + '\t' + str(t)) + return output, errors + def save_zipped_pickle(obj, filename, protocol=-1): with gzip.open(filename, 'wb') as f: p.dump(obj, f, protocol) + def load_zipped_pickle(filename): with gzip.open(filename, 'rb') as f: loaded_object = p.load(f) return loaded_object + def file2path(file): count = file.count(".") - 1 file = file.replace('.', '/', count) return file -def isFileInList(file,checkList): + +def isFileInList(file, checkList): for f in checkList: if f in file: return True return False # [i for i in ansFiles if 'org/fusesource/esb/itests/basic/fabric/EsbFeatureTest.java' in i] + def get_venn_sections(sets): """ Given a list of sets, return a new list of sets with all the possible @@ -366,7 +371,7 @@ def get_venn_sections(sets): bit_flags = [2 ** n for n in range(len(sets))] flags_zip_sets = [z for z in zip(bit_flags, sets)] - #combo_sets = [] + # combo_sets = [] combo_sets = dict() for bits in range(num_combinations - 1, 0, -1): include_sets = [s for flag, s in flags_zip_sets if bits & flag] @@ -374,17 +379,19 @@ def get_venn_sections(sets): combo = set.intersection(*include_sets) combo = set.difference(combo, *exclude_sets) tag = ''.join([str(int((bits & flag) > 0)) for flag in bit_flags]) - #combo_sets.append((tag, combo)) + # combo_sets.append((tag, combo)) combo_sets[tag] = combo return combo_sets + def pairwise(iterable): "s -> (s0,s1), (s1,s2), (s2, s3), ..." 
a, b = itertools.tee(iterable) next(b, None) return zip(a, b) -def RR_XGB(x,ao,column): + +def RR_XGB(x, ao, column): if x[ao] == 1: return (1.0 / (x[column])) elif pd.isnull(x[ao]): @@ -392,10 +399,11 @@ def RR_XGB(x,ao,column): else: return 0 -def parallelRunNo(coreFun,elements,*args): + +def parallelRunNo(coreFun, elements, *args): with concurrent.futures.ProcessPoolExecutor(max_workers=int(8)) as executor: try: - futures = {executor.submit(coreFun, l,*args): l for l in elements} + futures = {executor.submit(coreFun, l, *args): l for l in elements} kwargs = { 'total': len(futures), @@ -420,10 +428,10 @@ def parallelRunNo(coreFun,elements,*args): raise -def parallelRun(coreFun,elements,*args,max_workers=os.cpu_count()): +def parallelRun(coreFun, elements, *args, max_workers=os.cpu_count()): with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor: try: - futures = {executor.submit(coreFun, l,*args): l for l in elements} + futures = {executor.submit(coreFun, l, *args): l for l in elements} kwargs = { 'total': len(futures), @@ -445,11 +453,11 @@ def parallelRun(coreFun,elements,*args,max_workers=os.cpu_count()): raise -def parallelRunMerge(coreFun,elements,*args,max_workers=os.cpu_count()): +def parallelRunMerge(coreFun, elements, *args, max_workers=os.cpu_count()): dataL = [] with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor: try: - futures = {executor.submit(coreFun, l,*args): l for l in elements} + futures = {executor.submit(coreFun, l, *args): l for l in elements} kwargs = { 'total': len(futures), 'unit': 'files', @@ -473,12 +481,11 @@ def parallelRunMerge(coreFun,elements,*args,max_workers=os.cpu_count()): raise -def parallelRunMergeNew(coreFun,elements,*args,max_workers=os.cpu_count()): - +def parallelRunMergeNew(coreFun, elements, *args, max_workers=os.cpu_count()): res = [] with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor: try: - futures = {executor.submit(coreFun, l,*args): l for l in elements} + futures = {executor.submit(coreFun, l, *args): l for l in elements} kwargs = { 'total': len(futures), @@ -503,8 +510,8 @@ def parallelRunMergeNew(coreFun,elements,*args,max_workers=os.cpu_count()): aDF = pd.concat(res) return aDF -def get_filepaths(directory,extension): +def get_filepaths(directory, extension): file_paths = [] # List which will store all of the full filepaths.\n, exclude = '.git' # Walk the tree.\n, @@ -520,27 +527,27 @@ def get_filepaths(directory,extension): return file_paths # Self-explanatory.\n, + def get_class_weights(y): counter = Counter(y) majority = max(counter.values()) - return {cls: round(float(majority)/float(count), 2) for cls, count in counter.items()} + return {cls: round(float(majority) / float(count), 2) for cls, count in counter.items()} -def stopDB(dbDir,portInner): +def stopDB(dbDir, portInner): # cmd = "bash " + dbDir + "/" + "stopServer.sh " + " " + portInner; cmd = "redis-cli -p " + portInner + " shutdown save" o, e = shellGitCheckout(cmd) logging.info(o) - -def startDB(dbDir,portInner,projectType): - dbName = "dumps-"+projectType+".rdb" +def startDB(dbDir, portInner, projectType): + dbName = "dumps-" + projectType + ".rdb" # portInner = '6380' - cmd = "bash " + dbDir + "/" + "startServer.sh " + dbDir + " "+dbName+ " " + portInner; + cmd = "bash " + dbDir + "/" + "startServer.sh " + dbDir + " " + dbName + " " + portInner; o, e = shellGitCheckout(cmd) - ping = "redis-cli -p "+portInner+" ping" + ping = "redis-cli -p " + portInner + " ping" o, e = 
shellGitCheckout(ping) m = re.search('PONG', o) @@ -569,23 +576,23 @@ def unique_everseen(iterable, key=None): seen_add(k) yield element -def plotBox(yList,labels, fn, rotate=False,limit=True): + +def plotBox(yList, labels, fn, rotate=False, limit=True): import matplotlib matplotlib.use("TkAgg") import matplotlib.pyplot as plt - - fig = plt.figure() ax1 = fig.add_subplot(111) - meanpointsprops = dict(markeredgecolor ='blue',markerfacecolor= - 'blue') + meanpointsprops = dict(markeredgecolor='blue', markerfacecolor= + 'blue') - flierprops = dict(markeredgecolor ='black',markerfacecolor= - 'black',marker='.',markersize=2) - box = ax1.boxplot(yList, 0, flierprops=flierprops,widths=0.5, showmeans=False, vert=True,meanprops=meanpointsprops) + flierprops = dict(markeredgecolor='black', markerfacecolor= + 'black', marker='.', markersize=2) + box = ax1.boxplot(yList, 0, flierprops=flierprops, widths=0.5, showmeans=False, vert=True, + meanprops=meanpointsprops) for line in box['medians']: - x,y = line.get_xydata()[1] + x, y = line.get_xydata()[1] line.set(linewidth=3) line.set_color('blue') # plt.scatter(labels, yList, color='r') @@ -601,8 +608,8 @@ def plotBox(yList,labels, fn, rotate=False,limit=True): ax1.get_xaxis().set_ticklabels([]) # sns.boxplot(yList, ax=ax1) if limit: - ax1.set_ylim(top=1.1,bottom=0) - ax1.yaxis.set_ticks([0.0,1.0]) + ax1.set_ylim(top=1.1, bottom=0) + ax1.yaxis.set_ticks([0.0, 1.0]) else: ax1.set_yscale('log') ax1.set_xlabel('Cluster Member Size') @@ -616,33 +623,32 @@ def plotBox(yList,labels, fn, rotate=False,limit=True): fig.set_size_inches(7, 1, forward=True) fig.savefig(fn, dpi=100, bbox_inches='tight') - plt.show() -def plotBox2(ys,labels, fn,means, rotate=False,limit=True): - +def plotBox2(ys, labels, fn, means, rotate=False, limit=True): import matplotlib matplotlib.use("TkAgg") import matplotlib.pyplot as plt + fig, axes = plt.subplots(nrows=3, ncols=1) - fig,axes = plt.subplots(nrows=3,ncols=1) - - for ax1,yList,l,l2,mean in zip(axes.flat,ys,labels,['Shapes','Actions','Tokens'],means): + for ax1, yList, l, l2, mean in zip(axes.flat, ys, labels, ['Shapes', 'Actions', 'Tokens'], + means): # plt.setp(ax1.get_xticks(),visible=False) # ax1 = fig.add_subplot(111) - meanpointsprops = dict(markeredgecolor ='blue',markerfacecolor= - 'blue') + meanpointsprops = dict(markeredgecolor='blue', markerfacecolor= + 'blue') - flierprops = dict(markeredgecolor ='black',markerfacecolor= - 'black',marker='.',markersize=2) - box = ax1.boxplot(yList, 0, flierprops=flierprops,widths=0.5, showmeans=False, vert=True,meanprops=meanpointsprops) + flierprops = dict(markeredgecolor='black', markerfacecolor= + 'black', marker='.', markersize=2) + box = ax1.boxplot(yList, 0, flierprops=flierprops, widths=0.5, showmeans=False, vert=True, + meanprops=meanpointsprops) - ax1.axhline(linewidth=2, color='r',y=mean) + ax1.axhline(linewidth=2, color='r', y=mean) for line in box['medians']: - x,y = line.get_xydata()[1] + x, y = line.get_xydata()[1] line.set(linewidth=3) line.set_color('blue') # plt.scatter(labels, yList, color='r') @@ -659,14 +665,14 @@ def plotBox2(ys,labels, fn,means, rotate=False,limit=True): # ax1.get_xaxis().set_ticks([]) # sns.boxplot(yList, ax=ax1) if limit: - if l2 !='Tokens': - ax1.set_ylim(top=1,bottom=0) + if l2 != 'Tokens': + ax1.set_ylim(top=1, bottom=0) else: ax1.set_ylim(top=1.1, bottom=0) - ax1.yaxis.set_ticks([0.0,mean,0.5,1.0]) - ax1.yaxis.set_ticklabels([0,'',0.5,1]) + ax1.yaxis.set_ticks([0.0, mean, 0.5, 1.0]) + ax1.yaxis.set_ticklabels([0, '', 0.5, 1]) 
ax1.tick_params(direction='out', length=6, width=2, axis='y', - grid_color='r', grid_alpha=0.5) + grid_color='r', grid_alpha=0.5) else: # ax1.set_yscale('log') @@ -675,7 +681,7 @@ def plotBox2(ys,labels, fn,means, rotate=False,limit=True): ax1.set_aspect('auto') ax1.set_ylabel(l2) - labels = ['C-'+str(i+1) for i in labels[0]] + labels = ['C-' + str(i + 1) for i in labels[0]] ax1.set_xticklabels(labels) ax1.set_xticklabels(labels, rotation=45, ha='right') # plt.setp(ax1.get_xticks(), visible=True) @@ -687,16 +693,14 @@ def plotBox2(ys,labels, fn,means, rotate=False,limit=True): plt.subplots_adjust(wspace=0, hspace=0.05) fig = plt.gcf() - # fig.tight_layout() fig.set_size_inches(7, 7, forward=True) fig.savefig(fn, dpi=100, bbox_inches='tight') - plt.show() -def plotScatter(s1,s2,vs,label,limits,type): +def plotScatter(s1, s2, vs, label, limits, type): import matplotlib matplotlib.use("TkAgg") import matplotlib.pyplot as plt @@ -719,8 +723,8 @@ def plotScatter(s1,s2,vs,label,limits,type): stepsize = 1 ax.xaxis.set_ticks(np.arange(0, end, stepsize)) ax.yaxis.set_ticks(np.arange(0, end, stepsize)) - x = np.linspace(start, end, limits+1) - y = np.linspace(start, end, limits+1) + x = np.linspace(start, end, limits + 1) + y = np.linspace(start, end, limits + 1) ax.fill_between(x, y, end, facecolor='b', alpha=0.3) # plt.plot(np.linspace(0, 1, 10), np.linspace(0, 1, 10), lw=1) ax.spines['top'].set_visible(True) @@ -744,14 +748,17 @@ def plotScatter(s1,s2,vs,label,limits,type): tight_bbox=True ) + import threading + + class BackgroundTask(object): """ Threading example class The run() method will be started and it will run in the background until the application exits. """ - def __init__(self, model,PATH, interval=1): + def __init__(self, model, PATH, interval=1): """ Constructor :type interval: int :param interval: Check interval, in seconds @@ -761,10 +768,10 @@ class BackgroundTask(object): self.path = PATH thread = threading.Thread(target=self.run, args=()) - thread.daemon = True # Daemonize thread - thread.start() # Start the execution + thread.daemon = True # Daemonize thread + thread.start() # Start the execution def run(self): """ Method that runs forever """ self.model.save_model(self.path, - num_iteration=self.model.best_iteration) \ No newline at end of file + num_iteration=self.model.best_iteration) diff --git a/python/javaDS.py b/python/javaDS.py index de6bb82..3a6b52f 100644 --- a/python/javaDS.py +++ b/python/javaDS.py @@ -1,46 +1,70 @@ +from pandas import DataFrame + from common.commons import * from commitCollector import * -from python.settings import * +from settings import * from otherDatasets import markBugFixingPatches + DATASET_PATH = REPO_PATH DATASET = os.environ["dataset"] PROJECT_LIST = os.environ["PROJECT_LIST"] -def createDS(): - pjList = PROJECT_LIST.split(',') +def load_commits(repo: str, git_url: str, branch: str) -> DataFrame: + """ + Load commits of a repo + + :param repo: Repo name (e.g. "fuse") + :param git_url: Git clone url (e.g. "https://github.com/jboss-fuse/fuse.git") + :param branch: Git branch (e.g. 
"6.3.0.redhat") + :return: Commits DataFrame + """ + commits_pickle = Path(join(COMMIT_DFS, f'{repo}-fix.pickle.gz')) + + # Load existing commits + if commits_pickle.is_file(): + return pd.read_pickle(commits_pickle) + + # Clone new commits + shellCallTemplate('git config --global http.postBuffer 157286400') + shellCallTemplate(f'git -C {DATASET_PATH} clone {git_url}') + logging.info(f'Git repo cloned: {repo}') + + commits = getCommitFromRepo(join(REPO_PATH, repo), join(COMMIT_DFS, repo), branch) + commits = markBugFixingPatches(commits, repo) + commits.to_pickle(commits_pickle) + + return commits + + +def createDS(project_list: str = PROJECT_LIST): + """ + + :param project_list: Comma-separated list of git project names (projects must exist in dataset.csv) + :return: + """ + pjList: list[str] = project_list.split(',') + + # Ensure directories exist if not os.path.exists(DATASET_PATH): os.mkdir(DATASET_PATH) if not os.path.exists(COMMIT_DFS): os.mkdir(COMMIT_DFS) - subjects = pd.read_csv(join(ROOT_DIR, 'data', 'dataset.csv')) - + # Find project repo urls in dataset.csv + subjects: DataFrame = pd.read_csv(join(ROOT_DIR, 'data', 'dataset.csv')) if pjList == ['ALL']: tuples = subjects[['Repo', 'GitRepo', 'Branch']].values.tolist() else: - # repos = subjects.query("Subject == '{0}'".format(subject)).Repo.tolist() tuples = subjects[subjects.Repo.isin(pjList)][['Repo', 'GitRepo', 'Branch']].values.tolist() - for t in tuples: - repo, src, branch = t - logging.info(repo) - if isfile(join(COMMIT_DFS, repo + 'Fix.pickle')): - commits = load_zipped_pickle(join(COMMIT_DFS, repo + 'Fix.pickle')) - else: - cmd = 'git config --global http.postBuffer 157286400' - shellCallTemplate(cmd) - cmd = 'git -C ' + DATASET_PATH + ' clone ' + src - shellCallTemplate(cmd) - logging.info(repo) - getCommitFromRepo(join(REPO_PATH, repo), join(COMMIT_DFS, repo), branch) - rDF = makeDF(join(COMMIT_DFS, repo + '.commits')) - save_zipped_pickle(rDF, join(COMMIT_DFS, repo + ".pickle")) - # return rDF - commits = rDF - commits = markBugFixingPatches(commits, repo) + # Loop through repos + for repo, src, branch in tuples: + logging.info(f'Processing {repo}') + commits = load_commits(repo, src, branch) + commits = commits[commits.files.apply(lambda x: np.any([i == 'M' for i in x.values()]))] # keep only commits that are changing c files (.c) commits = commits[commits.files.apply(lambda x: np.all([i.endswith('.java') for i in x.keys()]))] diff --git a/python/otherDatasets.py b/python/otherDatasets.py index 1aecb49..993439c 100644 --- a/python/otherDatasets.py +++ b/python/otherDatasets.py @@ -1,24 +1,27 @@ +from pandas import DataFrame + from common.commons import * + DATA_PATH = os.environ["DATA_PATH"] COMMIT_DFS = os.environ["COMMIT_DFS"] # DATASET_PATH = '/Users/anilkoyuncu/projects/datasets' -DATASET_PATH = os.environ["REPO_PATH"] +DATASET_PATH = Path(os.environ["REPO_PATH"]) DATASET = os.environ["dataset"] ROOT = os.environ["ROOT_DIR"] PROJECT_LIST = os.environ["PROJECT_LIST"] + def filetype_fileter(filename): # return filename.endswith(u'.java') and not bool(re.search('test.*\/', filename)) return filename.endswith(u'.c') or filename.endswith(u'.h') - -def checkoutFiles(sha,shaOld, filePath,type, repo=None): +def checkoutFiles(sha, shaOld, filePath, type, repo=None): try: # folderDiff = join(DATA_PATH, 'gumInput',repoName, 'DiffEntries') folderDiff = join(type, 'DiffEntries') folderPrev = join(type, 'prevFiles') - folderRev = join( type, 'revFiles') + folderRev = join(type, 'revFiles') if not os.path.exists(folderDiff): 
os.mkdir(folderDiff) @@ -31,14 +34,13 @@ def checkoutFiles(sha,shaOld, filePath,type, repo=None): # if repo is None: # repo = join(REPO_PATH,repoName) - - savePath = filePath.replace('/','#') + savePath = filePath.replace('/', '#') if not isfile(folderDiff + '/' + sha + '_' + shaOld + '_' + savePath + '.txt'): cmd = 'git -C ' + repo + ' diff -U ' + shaOld + ':' + filePath + '..' + sha + ':' + filePath # + '> ' + folderDiff + '/' + sha + '_' + shaOld + '_' + savePath.replace('.java','.txt') - output,errors = shellGitCheckout(cmd,enc='latin1') + output, errors = shellGitCheckout(cmd, enc='latin1') if errors: # print(errors) raise FileNotFoundError @@ -58,31 +60,30 @@ def checkoutFiles(sha,shaOld, filePath,type, repo=None): 'w') as writeFile: writeFile.writelines(diffFile) - - - cmd = 'git -C ' + repo + ' show ' + sha + ':' + filePath + '> ' + folderRev + '/' + sha + '_' + shaOld + '_' +savePath + cmd = 'git -C ' + repo + ' show ' + sha + ':' + filePath + '> ' + folderRev + '/' + sha + '_' + shaOld + '_' + savePath if errors: # print(errors) raise FileNotFoundError - o,errors= shellGitCheckout(cmd,enc='latin1') - cmd = 'git -C ' + repo + ' show ' + shaOld + ':' + filePath + '> ' + folderPrev + '/' + 'prev_'+sha + '_' + shaOld + '_' +savePath + o, errors = shellGitCheckout(cmd, enc='latin1') + cmd = 'git -C ' + repo + ' show ' + shaOld + ':' + filePath + '> ' + folderPrev + '/' + 'prev_' + sha + '_' + shaOld + '_' + savePath if errors: # print(errors) raise FileNotFoundError - o,errors = shellGitCheckout(cmd,enc='latin1') + o, errors = shellGitCheckout(cmd, enc='latin1') if errors: # print(errors) raise FileNotFoundError except FileNotFoundError as fnfe: - if isfile(folderRev + '/' + sha + '_' + shaOld + '_' +savePath): - os.remove(folderRev + '/' + sha + '_' + shaOld + '_' +savePath) - if isfile(folderPrev + '/' + 'prev_'+sha + '_' + shaOld + '_' +savePath): - os.remove(folderPrev + '/' + 'prev_'+sha + '_' + shaOld + '_' +savePath) - if isfile(folderDiff + '/' + sha + '_' + shaOld + '_' + savePath.replace('.java','.txt')): - os.remove(folderDiff + '/' + sha + '_' + shaOld + '_' + savePath.replace('.java','.txt')) + if isfile(folderRev + '/' + sha + '_' + shaOld + '_' + savePath): + os.remove(folderRev + '/' + sha + '_' + shaOld + '_' + savePath) + if isfile(folderPrev + '/' + 'prev_' + sha + '_' + shaOld + '_' + savePath): + os.remove(folderPrev + '/' + 'prev_' + sha + '_' + shaOld + '_' + savePath) + if isfile(folderDiff + '/' + sha + '_' + shaOld + '_' + savePath.replace('.java', '.txt')): + os.remove( + folderDiff + '/' + sha + '_' + shaOld + '_' + savePath.replace('.java', '.txt')) # print(fnfe) # raise Exception(fnfe) except Exception as e: @@ -90,14 +91,14 @@ def checkoutFiles(sha,shaOld, filePath,type, repo=None): raise Exception(e) -def prepareFiles(t,dsName): +def prepareFiles(t, dsName): try: - sha,files = t + sha, files = t shaOld = sha + '^' # repo = '/Users/anil.koyuncu/projects/linux' # repo = join(REPO_PATH,repoName) - gumInputRepo = join(DATASET,dsName) + gumInputRepo = join(DATASET, dsName) if not os.path.exists(join(gumInputRepo)): os.makedirs(gumInputRepo) @@ -118,35 +119,30 @@ def prepareFiles(t,dsName): # return nonTest = [] - for k,v in files.items(): + for k, v in files.items(): if v == 'M': nonTest.append(k) # if k.endswith('.c') or k.endswith(u'.h'): # nonTest.append(k) # nonTest = [f for f in files.keys() if f.endswith('.c') or f.endswith(u'.h')] - cmd = 'git -C ' + join(DATASET_PATH,dsName) + ' rev-parse --short=6 ' + shaOld + out, err = shellGitCheckout(f'git -C 
{DATASET_PATH / dsName} rev-parse --short=6 {shaOld}', enc='latin1') + shaOld = out.strip() - output, errors = shellGitCheckout(cmd, enc='latin1') - shaOld = output.strip() - - cmd = 'git -C ' + join(DATASET_PATH,dsName) + ' rev-parse --short=6 ' + sha - output, errors = shellGitCheckout(cmd, enc='latin1') - sha = output.strip() + cmd = 'git -C ' + join(DATASET_PATH, dsName) + ' rev-parse --short=6 ' + sha + out, err = shellGitCheckout(cmd, enc='latin1') + sha = out.strip() if isinstance(nonTest, list): for file in nonTest: - checkoutFiles(sha,shaOld, file,gumInputRepo,join(DATASET_PATH,dsName)) - - + checkoutFiles(sha, shaOld, file, gumInputRepo, join(DATASET_PATH, dsName)) except Exception as e: print(e) -def checkCommitLog(x,dsName): - # repo = '/Users/anil.koyuncu/projects/linux' - cmd= 'git -C ' + join(DATASET_PATH,dsName) + ' show ' + x + " --pretty=\"format:\" --name-status -M100%" +def checkCommitLog(x, dsName): + cmd = 'git -C ' + join(DATASET_PATH, dsName) + ' show ' + x + " --pretty=\"format:\" --name-status -M100%" out, err = shellGitCheckout(cmd, enc='latin1') log = {} @@ -156,19 +152,21 @@ def checkCommitLog(x,dsName): ftype = line[:1] log[fname] = ftype log - df = pd.DataFrame(data=[[log, x]], columns=['files', 'commit']) + df = pd.DataFrame(data=[[log, x]], columns=['files', 'commit']) return df -def getCommitLog(x,dsName): + +def getCommitLog(x, dsName): # repo = '/Users/anil.koyuncu/projects/linux' # commit, repo = x - cmd = 'git -C ' + join(DATASET_PATH,dsName) + '/ ' + " show --pretty=format:'%B' --no-patch " + x + cmd = 'git -C ' + join(DATASET_PATH, + dsName) + '/ ' + " show --pretty=format:'%B' --no-patch " + x output = shellCallTemplate(cmd, 'latin-1') # matches = re.finditer(r"\bfix[a-zA-Z]*", output,re.I) - matches = re.finditer(r"\bfix[a-zA-Z]*|\bbug[a-zA-Z]*", output,re.I) + matches = re.finditer(r"\bfix[a-zA-Z]*|\bbug[a-zA-Z]*", output, re.I) match = list(matches) fixes = [] if len(match) >= 1: @@ -183,32 +181,32 @@ def getCommitLog(x,dsName): # for m in match: # links.append(m.group()) - df = pd.DataFrame(data=[[fixes, output,x]], columns=['fixes','log','commit']) + df = pd.DataFrame(data=[[fixes, output, x]], columns=['fixes', 'log', 'commit']) # df = df.T # df.columns = ['log', 'commit'] return df - - output + def collectBugFixPatches(dsName): commits = getAllCommits(dsName) # remove commits that are only deleting or adding files commits = commits[commits.files.apply(lambda x: np.any([i == 'M' for i in x.values()]))] # keep only commits that are changing c files (.c) commits = commits[commits.files.apply(lambda x: np.all([i.endswith('.c') for i in x.keys()]))] - #not a revert commit + # not a revert commit # commits = commits[~commits.log.apply(lambda x: x.startswith('Revert'))] # commits = commits[commits.files.apply(lambda x: len(x) == 1)] # commits['cocci'] = commits.log.apply(lambda x: True if re.search('cocci|coccinelle', x) else False) # coccis = commits[commits.cocci].commit.values.tolist() if dsName == 'linux': - commits['cocci'] = commits.log.apply(lambda x: True if re.search('cocci|coccinelle', x) else False) + commits['cocci'] = commits.log.apply( + lambda x: True if re.search('cocci|coccinelle', x) else False) fixes = commits[commits.cocci].commit.values.tolist() else: - fixes = commits[commits.fixes.str.len()!=0].commit.values.tolist() + fixes = commits[commits.fixes.str.len() != 0].commit.values.tolist() # links = commits[commits.links.str.len()!=0].commit.values.tolist() # bugs = set(fixes).union(links).union(coccis) @@ -217,11 +215,11 @@ def 
collectBugFixPatches(dsName): print(len(commits)) # for s in a.commit.values.tolist(): - parallelRun(prepareFiles,commits[['commit','files']].values.tolist(),dsName) - # prepareFiles(s) + parallelRun(prepareFiles, commits[['commit', 'files']].values.tolist(), dsName) + # prepareFiles(s) -def markBugFixingPatches(commits,dsName): +def markBugFixingPatches(commits, dsName): # from pandarallel import pandarallel # # pandarallel.initialize() @@ -229,8 +227,8 @@ def markBugFixingPatches(commits,dsName): # commits f = parallelRunMergeNew(checkCommitLog, commits['commit'].values.tolist(), dsName) - res = pd.merge(commits, f, on=['commit']) - commits=res + res: DataFrame = pd.merge(commits, f, on=['commit']) + commits = res # # # commits['isC'] = commits.files.apply(lambda x:np.any([i.endswith('.c') or i.endswith('.h') for i in x.keys() ])) # commits['isC'] = commits.files.apply(lambda x:np.all([i.endswith('.c') for i in x.keys() ])) @@ -238,65 +236,65 @@ def markBugFixingPatches(commits,dsName): # commits = commits[commits.isC == True] # commits.commit.parallel_apply(getCommitLog) - f = parallelRunMergeNew(getCommitLog, commits['commit'].values.tolist(),dsName) + f = parallelRunMergeNew(getCommitLog, commits['commit'].values.tolist(), dsName) res = pd.merge(commits, f, on=['commit']) - - save_zipped_pickle(res, join(COMMIT_DFS, dsName+'Fix' + ".pickle")) return res def getAllCommits(datasetName): - if isfile(join(COMMIT_DFS,datasetName+'Fix.pickle')): - return load_zipped_pickle(join(COMMIT_DFS,datasetName+'Fix.pickle')) + if isfile(join(COMMIT_DFS, datasetName + 'Fix.pickle')): + return load_zipped_pickle(join(COMMIT_DFS, datasetName + 'Fix.pickle')) else: - if isfile(join(COMMIT_DFS,datasetName+'.pickle')): - commits = load_zipped_pickle(join(COMMIT_DFS,datasetName+'.pickle')) + if isfile(join(COMMIT_DFS, datasetName + '.pickle')): + commits = load_zipped_pickle(join(COMMIT_DFS, datasetName + '.pickle')) else: if not os.path.exists(COMMIT_DFS): os.mkdir(COMMIT_DFS) - - cmd = 'git -C ' + join(DATASET_PATH,datasetName) + " log --no-merges --pretty=format:'{\"commit\":\"%H\",\"commitDate\":\"%ci\",\"title\":\"%f\",\"committer\":\"%ce\"}' > " + join(COMMIT_DFS,datasetName + '.commits') + cmd = 'git -C ' + join(DATASET_PATH, + datasetName) + " log --no-merges --pretty=format:'{\"commit\":\"%H\",\"commitDate\":\"%ci\",\"title\":\"%f\",\"committer\":\"%ce\"}' > " + join( + COMMIT_DFS, datasetName + '.commits') output = shellCallTemplate(cmd, enc='latin1') from commitCollector import makeDF - rDF = makeDF(join(COMMIT_DFS,datasetName + '.commits')) + rDF = makeDF(join(COMMIT_DFS, datasetName + '.commits')) save_zipped_pickle(rDF, join(COMMIT_DFS, datasetName + ".pickle")) # return rDF commits = rDF - return markBugFixingPatches(commits,datasetName) + return markBugFixingPatches(commits, datasetName) def core(): - datasets = pd.read_csv(join(ROOT,'data', 'datasets.csv')) + datasets = pd.read_csv(join(ROOT, 'data', 'datasets.csv')) # repoList = ['FFmpeg','curl','nginx','openssl','redis','tmux','vlc'] pjList = PROJECT_LIST.split(',') if not os.path.exists(DATASET_PATH): os.mkdir(DATASET_PATH) - for repo,src in datasets.values.tolist(): - if(pjList != ['ALL']): + for repo, src in datasets.values.tolist(): + if (pjList != ['ALL']): if repo in pjList: - print(repo) - cmd = 'git config --global http.postBuffer 157286400' - shellCallTemplate(cmd) - cmd = 'git -C ' + DATASET_PATH + ' clone ' + src - shellCallTemplate(cmd) - logging.info(repo) - collectBugFixPatches(repo) + print(repo) + cmd = 'git config 
--global http.postBuffer 157286400' + shellCallTemplate(cmd) + cmd = 'git -C ' + DATASET_PATH + ' clone ' + src + shellCallTemplate(cmd) + logging.info(repo) + collectBugFixPatches(repo) else: cmd = 'git -C ' + DATASET_PATH + ' clone ' + src shellCallTemplate(cmd) logging.info(repo) collectBugFixPatches(repo) -def codeflaws(): - cf = listdir(join(DATASET_PATH,'codeflaws')) - type = join(DATASET,'codeflaws') +def codeflaws(): + cf = listdir(join(DATASET_PATH, 'codeflaws')) + + type = join(DATASET, 'codeflaws') folderDiff = join(type, 'DiffEntries') folderPrev = join(type, 'prevFiles') folderRev = join(type, 'revFiles') @@ -308,9 +306,9 @@ def codeflaws(): if not os.path.exists(folderRev): os.makedirs(folderRev) - cfBugs = [i for i in cf if os.path.isdir(join(DATASET_PATH,'codeflaws',i))] + cfBugs = [i for i in cf if os.path.isdir(join(DATASET_PATH, 'codeflaws', i))] for cfBug in cfBugs: - bugs = [i for i in listdir(join(DATASET_PATH,'codeflaws',cfBug)) if i.endswith('.c')] + bugs = [i for i in listdir(join(DATASET_PATH, 'codeflaws', cfBug)) if i.endswith('.c')] bugs.sort() if len(bugs) == 2: s1 = bugs[0].replace('.c', '').split('-') @@ -318,12 +316,15 @@ def codeflaws(): prev = s1[-1] rev = s2[-1] bugName = '-'.join(s1[: -1]) - shutil.copy(join(DATASET_PATH,'codeflaws',cfBug,bugs[0]),join(folderPrev,"prev_"+bugName+"-"+prev+"-"+rev+'.c')) - shutil.copy(join(DATASET_PATH,'codeflaws',cfBug,bugs[1]),join(folderRev,bugName+"-"+prev+"-"+rev+'.c')) - cmd = 'diff -u ' + join(DATASET_PATH,'codeflaws',cfBug,bugs[0]) + ' ' + join(DATASET_PATH,'codeflaws',cfBug,bugs[1])+ ' > ' + join(folderDiff,bugName+"-"+prev+"-"+rev+'.c.txt') + shutil.copy(join(DATASET_PATH, 'codeflaws', cfBug, bugs[0]), + join(folderPrev, "prev_" + bugName + "-" + prev + "-" + rev + '.c')) + shutil.copy(join(DATASET_PATH, 'codeflaws', cfBug, bugs[1]), + join(folderRev, bugName + "-" + prev + "-" + rev + '.c')) + cmd = 'diff -u ' + join(DATASET_PATH, 'codeflaws', cfBug, bugs[0]) + ' ' + join( + DATASET_PATH, 'codeflaws', cfBug, bugs[1]) + ' > ' + join(folderDiff, + bugName + "-" + prev + "-" + rev + '.c.txt') logging.info(cmd) output, e = shellGitCheckout(cmd) logging.info(output) else: print() -
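For reference, a minimal standalone sketch of the commit-collection technique the patch adopts in getCommitFromRepo: a JSON-object-per-line git log parsed into a pandas DataFrame. This is an illustration only, not part of the patch; collect_commits, repo_dir, and the example paths are hypothetical, the branch is passed straight to git log instead of being checked out first, and subprocess is used in place of the repository's shell helpers to keep the sketch self-contained.

import json
import subprocess

import pandas as pd


def collect_commits(repo_dir: str, branch: str) -> pd.DataFrame:
    # Ask git for one JSON object per commit, newline-separated (same format string as the patch)
    fmt = json.dumps({"commit": "%H", "commitDate": "%ci", "title": "%f", "committer": "%ce"})
    log = subprocess.run(
        ["git", "-C", repo_dir, "log", branch, "--no-merges", f"--pretty=format:{fmt}"],
        capture_output=True, text=True, check=True).stdout
    # Each line is a standalone JSON object, so parse line by line rather than as one document
    rows = [json.loads(line) for line in log.splitlines() if line.strip()]
    df = pd.DataFrame(rows)
    # Normalise to UTC so commits with differing committer offsets parse into one datetime column
    df["commitDate"] = pd.to_datetime(df["commitDate"], utc=True)
    return df


# Hypothetical usage:
# df = collect_commits("/tmp/fuse", "master")
# print(df.sort_values("commitDate").tail())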