[O] Reformat, optimize, add comments
+27
-17
@@ -1,4 +1,6 @@
import json

import pandas as pd

from common.commons import *
@@ -7,46 +9,54 @@ DATA_PATH = os.environ["DATA_PATH"]
COMMIT_DFS = os.environ["COMMIT_DFS"]
COMMIT_FOLDER = os.environ["COMMIT_FOLDER"]

def getCommitFromRepo(f,gitrepo,branch):
cmd = 'git -C ' + f + ' checkout -f ' + branch

output, err = shellGitCheckout(cmd)
def getCommitFromRepo(f: PathLike, gitrepo: str, branch: str):
"""

:param f: Git repo directory
:param gitrepo: Repo name
:param branch: Branch name
:return: None
"""
file = f'{gitrepo}.commits'
output, err = shellGitCheckout(f'git -C {f} checkout -f {branch}')
m = re.search(branch, err)

while not m:
time.sleep(10)
logging.info('Waiting for checkout')
cmd = 'git -C ' + f + " log --no-merges --pretty=format:'{\"commit\":\"%H\",\"commitDate\":\"%ci\",\"title\":\"%f\",\"committer\":\"%ce\"}' > " + gitrepo + '.commits'
output = shellCallTemplate(cmd,enc='latin1')

# Create commits file
form = json.dumps({"commit": "%H", "commitDate": "%ci", "title": "%f", "committer": "%ce"})
shellCallTemplate(f"git -C {f} log --no-merges --pretty=format:'{form}' > {file}", enc='latin1')

def makeDF(filename):
with open(filename,encoding='latin1') as f:
lines = f.readlines()
ls = [eval(f) for f in lines]
ds = pd.DataFrame.from_dict(ls)
ds['commitDate']= ds['commitDate'].apply(lambda x:pd.to_datetime(x))
# Collect commits
commits = json.loads(f'[{Path(file).read_text()}]')

# Convert to DataFrame
ds = pd.DataFrame.from_dict(commits)
ds['commitDate'] = pd.to_datetime(ds['commitDate'])
return ds
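Reviewer note: the rewritten makeDF treats the commits file as one JSON object per line, as emitted by the git log pretty-format above. A minimal standalone sketch of that parsing pattern (file name and sample records are hypothetical; parsing line by line avoids relying on the bracket-wrapping trick):

import json
from pathlib import Path
import pandas as pd

# Hypothetical commits file, one JSON object per line as git log produces it.
Path('demo.commits').write_text(
    '{"commit":"abc123","commitDate":"2020-01-01 10:00:00 +0100","title":"Fix-NPE","committer":"dev@example.com"}\n'
    '{"commit":"def456","commitDate":"2020-01-02 11:00:00 +0100","title":"Refactor","committer":"dev@example.com"}'
)
# Parse each line independently, then build the DataFrame.
records = [json.loads(line) for line in Path('demo.commits').read_text().splitlines()]
ds = pd.DataFrame.from_dict(records)
ds['commitDate'] = pd.to_datetime(ds['commitDate'])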
def caseCollect(subject):

if not os.path.exists(COMMIT_FOLDER):
os.mkdir(COMMIT_FOLDER)
if not os.path.exists(COMMIT_DFS):
os.mkdir(COMMIT_DFS)

subjects = pd.read_csv(join(DATA_PATH, 'subjects.csv'))
if subject == 'ALL':
tuples = subjects[['Repo', 'Branch']].values.tolist()
else:
# repos = subjects.query("Subject == '{0}'".format(subject)).Repo.tolist()
tuples = subjects.query("Subject == '{0}'".format(subject))[['Repo', 'Branch']].values.tolist()
tuples = subjects.query("Subject == '{0}'".format(subject))[
['Repo', 'Branch']].values.tolist()

for t in tuples:
repo,branch = t
repo, branch = t
logging.info(repo)
getCommitFromRepo(join(REPO_PATH, repo), join(COMMIT_FOLDER, repo),branch)
getCommitFromRepo(join(REPO_PATH, repo), join(COMMIT_FOLDER, repo), branch)

if subject == 'ALL':
commits = listdir(COMMIT_FOLDER)
@@ -60,6 +70,7 @@ def caseCollect(subject):
save_zipped_pickle(rDF, join(COMMIT_DFS, repoName + ".pickle"))
# p.dump(rDF, open(join(COMMIT_DFS, repoName + ".pickle"), "wb"))

def caseClone(subject):
if not os.path.exists(REPO_PATH):
os.mkdir(REPO_PATH)
@@ -74,4 +85,3 @@ def caseClone(subject):
for gitrepo in gitrepos:
cmd = 'git clone ' + gitrepo
out = shellCallTemplate(cmd)
+119
-112
@@ -1,7 +1,8 @@
import logging
import sys
import gzip
from typing import Union

import numpy as np
from tqdm import tqdm
import shutil
@@ -28,16 +29,18 @@ import datetime
import subprocess
from pathlib import Path

PathLike = Union[os.PathLike, str]

sourceCodeColumns = ['packageName', 'className', 'methodNames', 'formalParameter',
'methodInvocation', 'memberReference', 'documentation', 'literal', 'rawSource', 'hunks',
'commitLogs', 'classNameExt']
'methodInvocation', 'memberReference', 'documentation', 'literal', 'rawSource',
'hunks',
'commitLogs', 'classNameExt']

def nap():
time.sleep(1)

def setLogg():
# logging.basicConfig(filename='app.log', filemode='w',level=logging.DEBUG)
root = logging.getLogger()
@@ -45,7 +48,8 @@ def setLogg():

ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.WARNING)
formatter = logging.Formatter('%(asctime)s - %(process)d - %(levelname)s - %(filename)s:%(funcName)s - %(message)s')
formatter = logging.Formatter(
'%(asctime)s - %(process)d - %(levelname)s - %(filename)s:%(funcName)s - %(message)s')
ch.setFormatter(formatter)
# ch.addFilter(lambda record: record.levelno <= logging.)
root.addHandler(ch)
@@ -64,6 +68,7 @@ def setLogg():
h2.setFormatter(formatter)
root.addHandler(h2)

def setEnv(args):
# env = args.env

@@ -72,7 +77,6 @@ def setEnv(args):
os.environ["ROOT_DIR"] = args.root
sys.path.append(args.root)

import yaml
# if os.uname().nodename != '':
# with open(join(os.environ["ROOT_DIR"], os.uname().nodename + ".config.yml"), 'r') as ymlfile:
@@ -113,16 +117,15 @@ def setEnv(args):
# os.environ["JDK8"] = cfg['java']['8home']
# os.environ["D4JHOME"] = cfg['defects4j']['home']

os.environ["CODE_PATH"] = join(os.environ["ROOT_DIR"],'code/')
os.environ["CODE_PATH"] = join(os.environ["ROOT_DIR"], 'code/')
# os.environ["DATA_PATH"] = join(os.environ["ROOT_DIR"],'data/')
# os.environ["REPO_PATH"] = join(os.environ["DATA_PATH"], 'gitrepo/')
os.environ["COMMIT_DFS"]= join(os.environ["DATA_PATH"],'commitsDF/')
os.environ["SIMI_DIR"]= join(os.environ["DATA_PATH"],'simi/')
os.environ["COMMIT_DFS"] = join(os.environ["DATA_PATH"], 'commitsDF/')
os.environ["SIMI_DIR"] = join(os.environ["DATA_PATH"], 'simi/')
os.environ["DTM_PATH"] = join(os.environ["DATA_PATH"], 'dtm/')
os.environ["SIMI_SINGLE"] = join(os.environ["DATA_PATH"], 'simiSingle/')
os.environ["FEATURE_DIR"] = join(os.environ["DATA_PATH"],'features/')

os.environ["FEATURE_DIR"] = join(os.environ["DATA_PATH"], 'features/')

os.environ["BUG_POINT"] = join(os.environ["DATA_PATH"], 'bugPoints/')
os.environ["DEFECTS4J"] = join(os.environ["DATA_PATH"], 'defects4jdata/')

@@ -139,10 +142,6 @@ def setEnv(args):
os.environ["DATASET_DIR"] = join(os.environ["DATA_PATH"], 'datasets/')
os.environ["REMOTE_PATH"] = '/Volumes/Samsung_T5/data'

logging.info('ROOT_DIR : %s', os.environ["ROOT_DIR"])
logging.info('REPO_PATH : %s', os.environ["REPO_PATH"])
logging.info('CODE_PATH : %s', os.environ["CODE_PATH"])
@@ -159,15 +158,13 @@ def setEnv(args):
logging.info('DATASET_DIR : %s', os.environ["DATASET_DIR"])

def getRun():
import argparse
parser = argparse.ArgumentParser(description='')
# parser.add_argument('-subject', dest='subject', help='Environment')
parser.add_argument('-root', dest='root', help='root folder')
parser.add_argument('-job',dest='job',help='job name')
parser.add_argument('-prop',dest='prop',help='property file')

parser.add_argument('-job', dest='job', help='job name')
parser.add_argument('-prop', dest='prop', help='property file')

args = parser.parse_args()

@@ -177,10 +174,9 @@ def getRun():
return args

def shellCallTemplate4jar(cmd,enc='utf-8'):
def shellCallTemplate4jar(cmd, enc='utf-8'):
process = subprocess.Popen(cmd,
stdout=subprocess.PIPE,stderr=PIPE, shell=True,encoding=enc,
stdout=subprocess.PIPE, stderr=PIPE, shell=True, encoding=enc,
universal_newlines=True)

while True:
@@ -195,10 +191,11 @@ def shellCallTemplate4jar(cmd,enc='utf-8'):
print(output.strip())
break

def shellCallTemplate(cmd,enc='utf-8'):

def shellCallTemplate(cmd, enc='utf-8'):
try:
logging.info(cmd)
with Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True,encoding=enc) as p:
with Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True, encoding=enc) as p:
output, errors = p.communicate()
# print(output)
if errors:
@@ -212,30 +209,32 @@ def shellCallTemplate(cmd,enc='utf-8'):
logging.error(e)
return output

def getChildMem(pid,children):

def getChildMem(pid, children):
out = subprocess.Popen(['pgrep', '-P', str(pid)],
stdout=subprocess.PIPE).communicate()[0].split(b'\n')
child = out[0].decode()
if child !='':
if child != '':
children.append(child)
getChildMem(child,children)
getChildMem(child, children)
else:
return children

def getAllChildMe(pid):

def getAllChildMe(pid):
childrenProcess = []
getChildMem(pid,childrenProcess)
getChildMem(pid, childrenProcess)

# if child == '':
return sum(map(memory_usage_ps,childrenProcess)) + memory_usage_ps(pid)
return sum(map(memory_usage_ps, childrenProcess)) + memory_usage_ps(pid)
# else:
# return memory_usage_ps(child) + memory_usage_ps(pid)

def memory_usage_ps(pid):
import subprocess
out = subprocess.Popen(['ps', 'v', '-p', str(pid)],
stdout=subprocess.PIPE).communicate()[0].split(b'\n')
stdout=subprocess.PIPE).communicate()[0].split(b'\n')
vsz_index = out[0].split().index(b'RSS')
if out[1].decode() != '':
mem = float(out[1].split()[vsz_index]) / 1024
@@ -243,73 +242,74 @@ def memory_usage_ps(pid):
mem = float(0)
return mem
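Reviewer note: memory_usage_ps reads the RSS column out of `ps v` output. A hedged standalone sketch of the same probe against the current process (assumes a Unix-style `ps` on PATH):

import os
import subprocess

# Query ps for the current process and locate the RSS column via the header row.
out = subprocess.Popen(['ps', 'v', '-p', str(os.getpid())],
                       stdout=subprocess.PIPE).communicate()[0].split(b'\n')
rss_index = out[0].split().index(b'RSS')
rss_mb = float(out[1].split()[rss_index]) / 1024  # KiB -> MiB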
def raiseTime(cmd,timeout,my_timer):

def raiseTime(cmd, timeout, my_timer):
my_timer.cancel()
raise TimeoutExpired(cmd, timeout)

def killP(pid):
out = subprocess.Popen(['kill', str(pid)],
stdout=subprocess.PIPE).communicate()[0].split(b'\n')
out = subprocess.Popen(['kill', str(pid)], stdout=subprocess.PIPE).communicate()[0].split(b'\n')
out

def shellGitCheckout(cmd,timeout =600,enc='utf-8'):
def shellGitCheckout(cmd, timeout=600, enc='utf-8'):
output = ''
errors = ''
# logging.debug(cmd)

with Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True,encoding=enc) as p:
with Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True, encoding=enc) as p:
try:
output, errors = p.communicate(timeout=timeout)
# print(output)
logging.debug(cmd + '\t' +output)
logging.debug(cmd + '\t' + output)
# logging.info(errors)
if errors:
raise CalledProcessError(errors, '-1')
output
except CalledProcessError as e:
logging.debug(cmd +'\t'+ errors)
logging.debug(cmd + '\t' + errors)
except TimeoutExpired as t:
p.terminate()
p.communicate()
# p.kill()
logging.warning(cmd +'\t'+str(t))
return output,errors
logging.warning(cmd + '\t' + str(t))
return output, errors
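Reviewer note: usage is straightforward; a hypothetical call (repo path assumed to exist), with stderr returned rather than raised:

# Run a git command with an explicit timeout; errors come back in `err`.
out, err = shellGitCheckout('git -C /tmp/somerepo status', timeout=60)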
def callSpinfer(cmd,timeout =600,enc='utf-8'):

def callSpinfer(cmd, timeout=600, enc='utf-8'):
output = ''
errors = ''
# logging.debug(cmd)
my_timer = None
with Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True,encoding=enc) as p:
with Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True, encoding=enc) as p:
try:
start = datetime.datetime.now()
memusage = getAllChildMe(p.pid)
# isExit = False
while(memusage != 0.0):
while (memusage != 0.0):
end = datetime.datetime.now()
elapsed = end - start
if(elapsed.seconds > timeout):
raise TimeoutExpired(cmd,timeout)
if (elapsed.seconds > timeout):
raise TimeoutExpired(cmd, timeout)
memusage = getAllChildMe(p.pid)
# print(str(p.pid) + " ; " + str(memusage))
if memusage > 2000:
# isExit = True
raise TimeoutExpired(cmd,timeout)
raise TimeoutExpired(cmd, timeout)

output, errors = p.communicate(timeout=timeout)
# print(output)
logging.debug(cmd + '\t' +output)
logging.debug(cmd + '\t' + output)
# logging.info(errors)
if errors:
raise CalledProcessError(errors, '-1')
output
except CalledProcessError as e:
logging.debug(cmd +'\t'+ errors)
logging.debug(cmd + '\t' + errors)
except TimeoutExpired as t:
# my_timer.cancel()

childrenProcess = []
getChildMem(p.pid, childrenProcess)
[killP(i) for i in childrenProcess]
@@ -317,30 +317,35 @@ def callSpinfer(cmd,timeout =600,enc='utf-8'):
p.terminate()
p.communicate()
# p.kill()
logging.warning(cmd +'\t'+str(t))
return output,errors
logging.warning(cmd + '\t' + str(t))
return output, errors

def save_zipped_pickle(obj, filename, protocol=-1):
with gzip.open(filename, 'wb') as f:
p.dump(obj, f, protocol)

def load_zipped_pickle(filename):
with gzip.open(filename, 'rb') as f:
loaded_object = p.load(f)
return loaded_object
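Reviewer note: a quick round-trip sketch for the two gzip-pickle helpers (assumes they are importable from common.commons; the DataFrame content is hypothetical):

import pandas as pd

df = pd.DataFrame({'commit': ['abc123'], 'committer': ['dev@example.com']})
save_zipped_pickle(df, 'demo.pickle')        # gzip-compressed pickle on disk
restored = load_zipped_pickle('demo.pickle')
assert restored.equals(df)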
def file2path(file):
count = file.count(".") - 1
file = file.replace('.', '/', count)
return file

def isFileInList(file,checkList):

def isFileInList(file, checkList):
for f in checkList:
if f in file:
return True
return False
# [i for i in ansFiles if 'org/fusesource/esb/itests/basic/fabric/EsbFeatureTest.java' in i]

def get_venn_sections(sets):
"""
Given a list of sets, return a new list of sets with all the possible
@@ -366,7 +371,7 @@ def get_venn_sections(sets):
bit_flags = [2 ** n for n in range(len(sets))]
flags_zip_sets = [z for z in zip(bit_flags, sets)]

#combo_sets = []
# combo_sets = []
combo_sets = dict()
for bits in range(num_combinations - 1, 0, -1):
include_sets = [s for flag, s in flags_zip_sets if bits & flag]
@@ -374,17 +379,19 @@ def get_venn_sections(sets):
combo = set.intersection(*include_sets)
combo = set.difference(combo, *exclude_sets)
tag = ''.join([str(int((bits & flag) > 0)) for flag in bit_flags])
#combo_sets.append((tag, combo))
# combo_sets.append((tag, combo))
combo_sets[tag] = combo
return combo_sets
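Reviewer note: to illustrate the bit-flag tagging, a small two-set example (expected output inferred from the code above, so treat it as an assumption):

a = {1, 2, 3}
b = {2, 3, 4}
# Each tag carries one bit per input set; '11' marks the region inside both.
get_venn_sections([a, b])
# -> {'11': {2, 3}, '01': {4}, '10': {1}}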
def pairwise(iterable):
"s -> (s0,s1), (s1,s2), (s2, s3), ..."
a, b = itertools.tee(iterable)
next(b, None)
return zip(a, b)
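Concretely, matching the docstring:

list(pairwise([1, 2, 3, 4]))  # [(1, 2), (2, 3), (3, 4)]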
def RR_XGB(x,ao,column):

def RR_XGB(x, ao, column):
if x[ao] == 1:
return (1.0 / (x[column]))
elif pd.isnull(x[ao]):
@@ -392,10 +399,11 @@ def RR_XGB(x,ao,column):
else:
return 0

def parallelRunNo(coreFun,elements,*args):

def parallelRunNo(coreFun, elements, *args):
with concurrent.futures.ProcessPoolExecutor(max_workers=int(8)) as executor:
try:
futures = {executor.submit(coreFun, l,*args): l for l in elements}
futures = {executor.submit(coreFun, l, *args): l for l in elements}

kwargs = {
'total': len(futures),
@@ -420,10 +428,10 @@ def parallelRunNo(coreFun,elements,*args):
raise

def parallelRun(coreFun,elements,*args,max_workers=os.cpu_count()):
def parallelRun(coreFun, elements, *args, max_workers=os.cpu_count()):
with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
try:
futures = {executor.submit(coreFun, l,*args): l for l in elements}
futures = {executor.submit(coreFun, l, *args): l for l in elements}

kwargs = {
'total': len(futures),
@@ -445,11 +453,11 @@ def parallelRun(coreFun,elements,*args,max_workers=os.cpu_count()):
raise

def parallelRunMerge(coreFun,elements,*args,max_workers=os.cpu_count()):
def parallelRunMerge(coreFun, elements, *args, max_workers=os.cpu_count()):
dataL = []
with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
try:
futures = {executor.submit(coreFun, l,*args): l for l in elements}
futures = {executor.submit(coreFun, l, *args): l for l in elements}
kwargs = {
'total': len(futures),
'unit': 'files',
@@ -473,12 +481,11 @@ def parallelRunMerge(coreFun,elements,*args,max_workers=os.cpu_count()):
raise

def parallelRunMergeNew(coreFun,elements,*args,max_workers=os.cpu_count()):

def parallelRunMergeNew(coreFun, elements, *args, max_workers=os.cpu_count()):
res = []
with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
try:
futures = {executor.submit(coreFun, l,*args): l for l in elements}
futures = {executor.submit(coreFun, l, *args): l for l in elements}

kwargs = {
'total': len(futures),
@@ -503,8 +510,8 @@ def parallelRunMergeNew(coreFun,elements,*args,max_workers=os.cpu_count()):
aDF = pd.concat(res)
return aDF
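Reviewer note: all four parallelRun* variants share the same submit-then-drain pattern around ProcessPoolExecutor and tqdm. A minimal self-contained sketch of that pattern (the worker function is hypothetical; the __main__ guard matters for process pools on platforms that spawn):

import concurrent.futures
from tqdm import tqdm

def square(n):
    return n * n

if __name__ == '__main__':
    with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
        # Map each future back to the element that produced it, as the helpers do.
        futures = {executor.submit(square, n): n for n in range(10)}
        results = [f.result() for f in
                   tqdm(concurrent.futures.as_completed(futures), total=len(futures))]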
def get_filepaths(directory,extension):

def get_filepaths(directory, extension):
file_paths = []  # List which will store all of the full filepaths.\n,
exclude = '.git'
# Walk the tree.\n,
@@ -520,27 +527,27 @@ def get_filepaths(directory,extension):

return file_paths  # Self-explanatory.\n,

def get_class_weights(y):
counter = Counter(y)
majority = max(counter.values())
return {cls: round(float(majority)/float(count), 2) for cls, count in counter.items()}
return {cls: round(float(majority) / float(count), 2) for cls, count in counter.items()}
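For example, on an imbalanced label vector the majority class gets weight 1.0 and rarer classes are scaled up proportionally:

# Class 0 appears 3x, class 1 once; the majority count is 3.
get_class_weights([0, 0, 0, 1])  # {0: 1.0, 1: 3.0}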
def stopDB(dbDir,portInner):
def stopDB(dbDir, portInner):
# cmd = "bash " + dbDir + "/" + "stopServer.sh " + " " + portInner;
cmd = "redis-cli -p " + portInner + " shutdown save"
o, e = shellGitCheckout(cmd)
logging.info(o)

def startDB(dbDir,portInner,projectType):
dbName = "dumps-"+projectType+".rdb"
def startDB(dbDir, portInner, projectType):
dbName = "dumps-" + projectType + ".rdb"
# portInner = '6380'
cmd = "bash " + dbDir + "/" + "startServer.sh " + dbDir + " "+dbName+ " " + portInner;
cmd = "bash " + dbDir + "/" + "startServer.sh " + dbDir + " " + dbName + " " + portInner;

o, e = shellGitCheckout(cmd)
ping = "redis-cli -p "+portInner+" ping"
ping = "redis-cli -p " + portInner + " ping"
o, e = shellGitCheckout(ping)
m = re.search('PONG', o)

@@ -569,23 +576,23 @@ def unique_everseen(iterable, key=None):
seen_add(k)
yield element

def plotBox(yList,labels, fn, rotate=False,limit=True):

def plotBox(yList, labels, fn, rotate=False, limit=True):
import matplotlib
matplotlib.use("TkAgg")
import matplotlib.pyplot as plt

fig = plt.figure()
ax1 = fig.add_subplot(111)
meanpointsprops = dict(markeredgecolor ='blue',markerfacecolor=
'blue')
meanpointsprops = dict(markeredgecolor='blue', markerfacecolor=
'blue')

flierprops = dict(markeredgecolor ='black',markerfacecolor=
'black',marker='.',markersize=2)
box = ax1.boxplot(yList, 0, flierprops=flierprops,widths=0.5, showmeans=False, vert=True,meanprops=meanpointsprops)
flierprops = dict(markeredgecolor='black', markerfacecolor=
'black', marker='.', markersize=2)
box = ax1.boxplot(yList, 0, flierprops=flierprops, widths=0.5, showmeans=False, vert=True,
meanprops=meanpointsprops)
for line in box['medians']:
x,y = line.get_xydata()[1]
x, y = line.get_xydata()[1]
line.set(linewidth=3)
line.set_color('blue')
# plt.scatter(labels, yList, color='r')
@@ -601,8 +608,8 @@ def plotBox(yList,labels, fn, rotate=False,limit=True):
ax1.get_xaxis().set_ticklabels([])
# sns.boxplot(yList, ax=ax1)
if limit:
ax1.set_ylim(top=1.1,bottom=0)
ax1.yaxis.set_ticks([0.0,1.0])
ax1.set_ylim(top=1.1, bottom=0)
ax1.yaxis.set_ticks([0.0, 1.0])
else:
ax1.set_yscale('log')
ax1.set_xlabel('Cluster Member Size')
@@ -616,33 +623,32 @@ def plotBox(yList,labels, fn, rotate=False,limit=True):
fig.set_size_inches(7, 1, forward=True)
fig.savefig(fn, dpi=100, bbox_inches='tight')

plt.show()

def plotBox2(ys,labels, fn,means, rotate=False,limit=True):

def plotBox2(ys, labels, fn, means, rotate=False, limit=True):
import matplotlib
matplotlib.use("TkAgg")
import matplotlib.pyplot as plt

fig, axes = plt.subplots(nrows=3, ncols=1)

fig,axes = plt.subplots(nrows=3,ncols=1)

for ax1,yList,l,l2,mean in zip(axes.flat,ys,labels,['Shapes','Actions','Tokens'],means):
for ax1, yList, l, l2, mean in zip(axes.flat, ys, labels, ['Shapes', 'Actions', 'Tokens'],
means):
# plt.setp(ax1.get_xticks(),visible=False)
# ax1 = fig.add_subplot(111)
meanpointsprops = dict(markeredgecolor ='blue',markerfacecolor=
'blue')
meanpointsprops = dict(markeredgecolor='blue', markerfacecolor=
'blue')

flierprops = dict(markeredgecolor ='black',markerfacecolor=
'black',marker='.',markersize=2)
box = ax1.boxplot(yList, 0, flierprops=flierprops,widths=0.5, showmeans=False, vert=True,meanprops=meanpointsprops)
flierprops = dict(markeredgecolor='black', markerfacecolor=
'black', marker='.', markersize=2)
box = ax1.boxplot(yList, 0, flierprops=flierprops, widths=0.5, showmeans=False, vert=True,
meanprops=meanpointsprops)

ax1.axhline(linewidth=2, color='r',y=mean)
ax1.axhline(linewidth=2, color='r', y=mean)

for line in box['medians']:
x,y = line.get_xydata()[1]
x, y = line.get_xydata()[1]
line.set(linewidth=3)
line.set_color('blue')
# plt.scatter(labels, yList, color='r')
@@ -659,14 +665,14 @@ def plotBox2(ys,labels, fn,means, rotate=False,limit=True):
# ax1.get_xaxis().set_ticks([])
# sns.boxplot(yList, ax=ax1)
if limit:
if l2 !='Tokens':
ax1.set_ylim(top=1,bottom=0)
if l2 != 'Tokens':
ax1.set_ylim(top=1, bottom=0)
else:
ax1.set_ylim(top=1.1, bottom=0)
ax1.yaxis.set_ticks([0.0,mean,0.5,1.0])
ax1.yaxis.set_ticklabels([0,'',0.5,1])
ax1.yaxis.set_ticks([0.0, mean, 0.5, 1.0])
ax1.yaxis.set_ticklabels([0, '', 0.5, 1])
ax1.tick_params(direction='out', length=6, width=2, axis='y',
grid_color='r', grid_alpha=0.5)
grid_color='r', grid_alpha=0.5)

else:
# ax1.set_yscale('log')
@@ -675,7 +681,7 @@ def plotBox2(ys,labels, fn,means, rotate=False,limit=True):
ax1.set_aspect('auto')

ax1.set_ylabel(l2)
labels = ['C-'+str(i+1) for i in labels[0]]
labels = ['C-' + str(i + 1) for i in labels[0]]
ax1.set_xticklabels(labels)
ax1.set_xticklabels(labels, rotation=45, ha='right')
# plt.setp(ax1.get_xticks(), visible=True)
@@ -687,16 +693,14 @@ def plotBox2(ys,labels, fn,means, rotate=False,limit=True):
plt.subplots_adjust(wspace=0, hspace=0.05)
fig = plt.gcf()

# fig.tight_layout()
fig.set_size_inches(7, 7, forward=True)
fig.savefig(fn, dpi=100, bbox_inches='tight')

plt.show()

def plotScatter(s1,s2,vs,label,limits,type):
def plotScatter(s1, s2, vs, label, limits, type):
import matplotlib
matplotlib.use("TkAgg")
import matplotlib.pyplot as plt
@@ -719,8 +723,8 @@ def plotScatter(s1,s2,vs,label,limits,type):
stepsize = 1
ax.xaxis.set_ticks(np.arange(0, end, stepsize))
ax.yaxis.set_ticks(np.arange(0, end, stepsize))
x = np.linspace(start, end, limits+1)
y = np.linspace(start, end, limits+1)
x = np.linspace(start, end, limits + 1)
y = np.linspace(start, end, limits + 1)
ax.fill_between(x, y, end, facecolor='b', alpha=0.3)
# plt.plot(np.linspace(0, 1, 10), np.linspace(0, 1, 10), lw=1)
ax.spines['top'].set_visible(True)
@@ -744,14 +748,17 @@ def plotScatter(s1,s2,vs,label,limits,type):
tight_bbox=True
)

import threading

class BackgroundTask(object):
""" Threading example class
The run() method will be started and it will run in the background
until the application exits.
"""

def __init__(self, model,PATH, interval=1):
def __init__(self, model, PATH, interval=1):
""" Constructor
:type interval: int
:param interval: Check interval, in seconds
@@ -761,10 +768,10 @@ class BackgroundTask(object):
self.path = PATH

thread = threading.Thread(target=self.run, args=())
thread.daemon = True  # Daemonize thread
thread.start()  # Start the execution
thread.daemon = True  # Daemonize thread
thread.start()  # Start the execution

def run(self):
""" Method that runs forever """
self.model.save_model(self.path,
num_iteration=self.model.best_iteration)
num_iteration=self.model.best_iteration)
+47
-23
@@ -1,46 +1,70 @@
from pandas import DataFrame

from common.commons import *
from commitCollector import *
from python.settings import *
from settings import *

from otherDatasets import markBugFixingPatches

DATASET_PATH = REPO_PATH
DATASET = os.environ["dataset"]
PROJECT_LIST = os.environ["PROJECT_LIST"]

def createDS():
pjList = PROJECT_LIST.split(',')
def load_commits(repo: str, git_url: str, branch: str) -> DataFrame:
"""
Load commits of a repo

:param repo: Repo name (e.g. "fuse")
:param git_url: Git clone url (e.g. "https://github.com/jboss-fuse/fuse.git")
:param branch: Git branch (e.g. "6.3.0.redhat")
:return: Commits DataFrame
"""
commits_pickle = Path(join(COMMIT_DFS, f'{repo}-fix.pickle.gz'))

# Load existing commits
if commits_pickle.is_file():
return pd.read_pickle(commits_pickle)

# Clone new commits
shellCallTemplate('git config --global http.postBuffer 157286400')
shellCallTemplate(f'git -C {DATASET_PATH} clone {git_url}')
logging.info(f'Git repo cloned: {repo}')

commits = getCommitFromRepo(join(REPO_PATH, repo), join(COMMIT_DFS, repo), branch)
commits = markBugFixingPatches(commits, repo)
commits.to_pickle(commits_pickle)

return commits
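Reviewer note: a call using the docstring's own example values (network access and the environment variables above are assumed to be set up):

commits = load_commits('fuse', 'https://github.com/jboss-fuse/fuse.git', '6.3.0.redhat')
print(len(commits), 'commits loaded')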
def createDS(project_list: str = PROJECT_LIST):
"""

:param project_list: Comma-separated list of git project names (projects must exist in dataset.csv)
:return:
"""
pjList: list[str] = project_list.split(',')

# Ensure directories exist
if not os.path.exists(DATASET_PATH):
os.mkdir(DATASET_PATH)
if not os.path.exists(COMMIT_DFS):
os.mkdir(COMMIT_DFS)

subjects = pd.read_csv(join(ROOT_DIR, 'data', 'dataset.csv'))

# Find project repo urls in dataset.csv
subjects: DataFrame = pd.read_csv(join(ROOT_DIR, 'data', 'dataset.csv'))
if pjList == ['ALL']:
tuples = subjects[['Repo', 'GitRepo', 'Branch']].values.tolist()
else:
# repos = subjects.query("Subject == '{0}'".format(subject)).Repo.tolist()
tuples = subjects[subjects.Repo.isin(pjList)][['Repo', 'GitRepo', 'Branch']].values.tolist()

for t in tuples:
repo, src, branch = t
logging.info(repo)
if isfile(join(COMMIT_DFS, repo + 'Fix.pickle')):
commits = load_zipped_pickle(join(COMMIT_DFS, repo + 'Fix.pickle'))
else:
cmd = 'git config --global http.postBuffer 157286400'
shellCallTemplate(cmd)
cmd = 'git -C ' + DATASET_PATH + ' clone ' + src
shellCallTemplate(cmd)
logging.info(repo)
getCommitFromRepo(join(REPO_PATH, repo), join(COMMIT_DFS, repo), branch)
rDF = makeDF(join(COMMIT_DFS, repo + '.commits'))
save_zipped_pickle(rDF, join(COMMIT_DFS, repo + ".pickle"))
# return rDF
commits = rDF
commits = markBugFixingPatches(commits, repo)
# Loop through repos
for repo, src, branch in tuples:
logging.info(f'Processing {repo}')
commits = load_commits(repo, src, branch)

commits = commits[commits.files.apply(lambda x: np.any([i == 'M' for i in x.values()]))]
# keep only commits that are changing c files (.c)
commits = commits[commits.files.apply(lambda x: np.all([i.endswith('.java') for i in x.keys()]))]
+82
-81
@@ -1,24 +1,27 @@
from pandas import DataFrame

from common.commons import *

DATA_PATH = os.environ["DATA_PATH"]
COMMIT_DFS = os.environ["COMMIT_DFS"]
# DATASET_PATH = '/Users/anilkoyuncu/projects/datasets'
DATASET_PATH = os.environ["REPO_PATH"]
DATASET_PATH = Path(os.environ["REPO_PATH"])
DATASET = os.environ["dataset"]
ROOT = os.environ["ROOT_DIR"]
PROJECT_LIST = os.environ["PROJECT_LIST"]

def filetype_fileter(filename):
# return filename.endswith(u'.java') and not bool(re.search('test.*\/', filename))
return filename.endswith(u'.c') or filename.endswith(u'.h')

def checkoutFiles(sha,shaOld, filePath,type, repo=None):
def checkoutFiles(sha, shaOld, filePath, type, repo=None):
try:
# folderDiff = join(DATA_PATH, 'gumInput',repoName, 'DiffEntries')
folderDiff = join(type, 'DiffEntries')
folderPrev = join(type, 'prevFiles')
folderRev = join( type, 'revFiles')
folderRev = join(type, 'revFiles')
if not os.path.exists(folderDiff):
os.mkdir(folderDiff)

@@ -31,14 +34,13 @@ def checkoutFiles(sha,shaOld, filePath,type, repo=None):
# if repo is None:
# repo = join(REPO_PATH,repoName)

savePath = filePath.replace('/','#')
savePath = filePath.replace('/', '#')

if not isfile(folderDiff + '/' + sha + '_' + shaOld + '_' + savePath + '.txt'):

cmd = 'git -C ' + repo + ' diff -U ' + shaOld + ':' + filePath + '..' + sha + ':' + filePath  # + '> ' + folderDiff + '/' + sha + '_' + shaOld + '_' + savePath.replace('.java','.txt')

output,errors = shellGitCheckout(cmd,enc='latin1')
output, errors = shellGitCheckout(cmd, enc='latin1')
if errors:
# print(errors)
raise FileNotFoundError
@@ -58,31 +60,30 @@ def checkoutFiles(sha,shaOld, filePath,type, repo=None):
'w') as writeFile:
writeFile.writelines(diffFile)

cmd = 'git -C ' + repo + ' show ' + sha + ':' + filePath + '> ' + folderRev + '/' + sha + '_' + shaOld + '_' +savePath
cmd = 'git -C ' + repo + ' show ' + sha + ':' + filePath + '> ' + folderRev + '/' + sha + '_' + shaOld + '_' + savePath

if errors:
# print(errors)
raise FileNotFoundError
o,errors= shellGitCheckout(cmd,enc='latin1')
cmd = 'git -C ' + repo + ' show ' + shaOld + ':' + filePath + '> ' + folderPrev + '/' + 'prev_'+sha + '_' + shaOld + '_' +savePath
o, errors = shellGitCheckout(cmd, enc='latin1')
cmd = 'git -C ' + repo + ' show ' + shaOld + ':' + filePath + '> ' + folderPrev + '/' + 'prev_' + sha + '_' + shaOld + '_' + savePath
if errors:
# print(errors)
raise FileNotFoundError

o,errors = shellGitCheckout(cmd,enc='latin1')
o, errors = shellGitCheckout(cmd, enc='latin1')
if errors:
# print(errors)
raise FileNotFoundError

except FileNotFoundError as fnfe:
if isfile(folderRev + '/' + sha + '_' + shaOld + '_' +savePath):
os.remove(folderRev + '/' + sha + '_' + shaOld + '_' +savePath)
if isfile(folderPrev + '/' + 'prev_'+sha + '_' + shaOld + '_' +savePath):
os.remove(folderPrev + '/' + 'prev_'+sha + '_' + shaOld + '_' +savePath)
if isfile(folderDiff + '/' + sha + '_' + shaOld + '_' + savePath.replace('.java','.txt')):
os.remove(folderDiff + '/' + sha + '_' + shaOld + '_' + savePath.replace('.java','.txt'))
if isfile(folderRev + '/' + sha + '_' + shaOld + '_' + savePath):
os.remove(folderRev + '/' + sha + '_' + shaOld + '_' + savePath)
if isfile(folderPrev + '/' + 'prev_' + sha + '_' + shaOld + '_' + savePath):
os.remove(folderPrev + '/' + 'prev_' + sha + '_' + shaOld + '_' + savePath)
if isfile(folderDiff + '/' + sha + '_' + shaOld + '_' + savePath.replace('.java', '.txt')):
os.remove(
folderDiff + '/' + sha + '_' + shaOld + '_' + savePath.replace('.java', '.txt'))
# print(fnfe)
# raise Exception(fnfe)
except Exception as e:
@@ -90,14 +91,14 @@ def checkoutFiles(sha,shaOld, filePath,type, repo=None):
raise Exception(e)

def prepareFiles(t,dsName):
def prepareFiles(t, dsName):
try:
sha,files = t
sha, files = t

shaOld = sha + '^'
# repo = '/Users/anil.koyuncu/projects/linux'
# repo = join(REPO_PATH,repoName)
gumInputRepo = join(DATASET,dsName)
gumInputRepo = join(DATASET, dsName)
if not os.path.exists(join(gumInputRepo)):
os.makedirs(gumInputRepo)

@@ -118,35 +119,30 @@ def prepareFiles(t,dsName):
# return

nonTest = []
for k,v in files.items():
for k, v in files.items():
if v == 'M':
nonTest.append(k)
# if k.endswith('.c') or k.endswith(u'.h'):
# nonTest.append(k)
# nonTest = [f for f in files.keys() if f.endswith('.c') or f.endswith(u'.h')]

cmd = 'git -C ' + join(DATASET_PATH,dsName) + ' rev-parse --short=6 ' + shaOld
out, err = shellGitCheckout(f'git -C {DATASET_PATH / dsName} rev-parse --short=6 {shaOld}', enc='latin1')
shaOld = out.strip()

output, errors = shellGitCheckout(cmd, enc='latin1')
shaOld = output.strip()

cmd = 'git -C ' + join(DATASET_PATH,dsName) + ' rev-parse --short=6 ' + sha
output, errors = shellGitCheckout(cmd, enc='latin1')
sha = output.strip()
cmd = 'git -C ' + join(DATASET_PATH, dsName) + ' rev-parse --short=6 ' + sha
out, err = shellGitCheckout(cmd, enc='latin1')
sha = out.strip()

if isinstance(nonTest, list):
for file in nonTest:
checkoutFiles(sha,shaOld, file,gumInputRepo,join(DATASET_PATH,dsName))

checkoutFiles(sha, shaOld, file, gumInputRepo, join(DATASET_PATH, dsName))

except Exception as e:
print(e)

def checkCommitLog(x,dsName):
# repo = '/Users/anil.koyuncu/projects/linux'
cmd= 'git -C ' + join(DATASET_PATH,dsName) + ' show ' + x + " --pretty=\"format:\" --name-status -M100%"
def checkCommitLog(x, dsName):
cmd = 'git -C ' + join(DATASET_PATH, dsName) + ' show ' + x + " --pretty=\"format:\" --name-status -M100%"

out, err = shellGitCheckout(cmd, enc='latin1')
log = {}
@@ -156,19 +152,21 @@ def checkCommitLog(x,dsName):
ftype = line[:1]
log[fname] = ftype
log
df = pd.DataFrame(data=[[log, x]], columns=['files', 'commit'])
df = pd.DataFrame(data=[[log, x]], columns=['files', 'commit'])
return df
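Reviewer note: the loop above turns `git show --name-status` output into a {file: status} mapping. A hedged standalone sketch on canned output (real output is tab-separated; the exact splitting of the omitted hunk lines is an assumption):

out = "M\tsrc/main.c\nA\tdocs/readme.md"
log = {}
for line in out.splitlines():
    if not line:
        continue
    ftype, fname = line.split('\t', 1)  # status letter, then path
    log[fname] = ftype
# log == {'src/main.c': 'M', 'docs/readme.md': 'A'}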
def getCommitLog(x,dsName):

def getCommitLog(x, dsName):
# repo = '/Users/anil.koyuncu/projects/linux'
# commit, repo = x

cmd = 'git -C ' + join(DATASET_PATH,dsName) + '/ ' + " show --pretty=format:'%B' --no-patch " + x
cmd = 'git -C ' + join(DATASET_PATH,
dsName) + '/ ' + " show --pretty=format:'%B' --no-patch " + x

output = shellCallTemplate(cmd, 'latin-1')

# matches = re.finditer(r"\bfix[a-zA-Z]*", output,re.I)
matches = re.finditer(r"\bfix[a-zA-Z]*|\bbug[a-zA-Z]*", output,re.I)
matches = re.finditer(r"\bfix[a-zA-Z]*|\bbug[a-zA-Z]*", output, re.I)
match = list(matches)
fixes = []
if len(match) >= 1:
@@ -183,32 +181,32 @@ def getCommitLog(x,dsName):
# for m in match:
# links.append(m.group())

df = pd.DataFrame(data=[[fixes, output,x]], columns=['fixes','log','commit'])
df = pd.DataFrame(data=[[fixes, output, x]], columns=['fixes', 'log', 'commit'])
# df = df.T
# df.columns = ['log', 'commit']

return df
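The fix/bug regex is case-insensitive and matches whole-word prefixes; for instance:

import re
msg = "Bugfix: fixes a crash reported in bugzilla"
[m.group() for m in re.finditer(r"\bfix[a-zA-Z]*|\bbug[a-zA-Z]*", msg, re.I)]
# -> ['Bugfix', 'fixes', 'bugzilla']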
output

def collectBugFixPatches(dsName):
commits = getAllCommits(dsName)
# remove commits that are only deleting or adding files
commits = commits[commits.files.apply(lambda x: np.any([i == 'M' for i in x.values()]))]
# keep only commits that are changing c files (.c)
commits = commits[commits.files.apply(lambda x: np.all([i.endswith('.c') for i in x.keys()]))]
#not a revert commit
# not a revert commit
# commits = commits[~commits.log.apply(lambda x: x.startswith('Revert'))]
# commits = commits[commits.files.apply(lambda x: len(x) == 1)]
# commits['cocci'] = commits.log.apply(lambda x: True if re.search('cocci|coccinelle', x) else False)
# coccis = commits[commits.cocci].commit.values.tolist()
if dsName == 'linux':
commits['cocci'] = commits.log.apply(lambda x: True if re.search('cocci|coccinelle', x) else False)
commits['cocci'] = commits.log.apply(
lambda x: True if re.search('cocci|coccinelle', x) else False)
fixes = commits[commits.cocci].commit.values.tolist()
else:
fixes = commits[commits.fixes.str.len()!=0].commit.values.tolist()
fixes = commits[commits.fixes.str.len() != 0].commit.values.tolist()
# links = commits[commits.links.str.len()!=0].commit.values.tolist()

# bugs = set(fixes).union(links).union(coccis)
@@ -217,11 +215,11 @@ def collectBugFixPatches(dsName):
print(len(commits))
# for s in a.commit.values.tolist():

parallelRun(prepareFiles,commits[['commit','files']].values.tolist(),dsName)
# prepareFiles(s)
parallelRun(prepareFiles, commits[['commit', 'files']].values.tolist(), dsName)
# prepareFiles(s)

def markBugFixingPatches(commits,dsName):
def markBugFixingPatches(commits, dsName):
# from pandarallel import pandarallel
#
# pandarallel.initialize()
@@ -229,8 +227,8 @@ def markBugFixingPatches(commits,dsName):
# commits

f = parallelRunMergeNew(checkCommitLog, commits['commit'].values.tolist(), dsName)
res = pd.merge(commits, f, on=['commit'])
commits=res
res: DataFrame = pd.merge(commits, f, on=['commit'])
commits = res
#
# # commits['isC'] = commits.files.apply(lambda x:np.any([i.endswith('.c') or i.endswith('.h') for i in x.keys() ]))
# commits['isC'] = commits.files.apply(lambda x:np.all([i.endswith('.c') for i in x.keys() ]))
@@ -238,65 +236,65 @@ def markBugFixingPatches(commits,dsName):
# commits = commits[commits.isC == True]

# commits.commit.parallel_apply(getCommitLog)
f = parallelRunMergeNew(getCommitLog, commits['commit'].values.tolist(),dsName)
f = parallelRunMergeNew(getCommitLog, commits['commit'].values.tolist(), dsName)

res = pd.merge(commits, f, on=['commit'])

save_zipped_pickle(res, join(COMMIT_DFS, dsName+'Fix' + ".pickle"))
return res

def getAllCommits(datasetName):
if isfile(join(COMMIT_DFS,datasetName+'Fix.pickle')):
return load_zipped_pickle(join(COMMIT_DFS,datasetName+'Fix.pickle'))
if isfile(join(COMMIT_DFS, datasetName + 'Fix.pickle')):
return load_zipped_pickle(join(COMMIT_DFS, datasetName + 'Fix.pickle'))
else:

if isfile(join(COMMIT_DFS,datasetName+'.pickle')):
commits = load_zipped_pickle(join(COMMIT_DFS,datasetName+'.pickle'))
if isfile(join(COMMIT_DFS, datasetName + '.pickle')):
commits = load_zipped_pickle(join(COMMIT_DFS, datasetName + '.pickle'))
else:
if not os.path.exists(COMMIT_DFS):
os.mkdir(COMMIT_DFS)

cmd = 'git -C ' + join(DATASET_PATH,datasetName) + " log --no-merges --pretty=format:'{\"commit\":\"%H\",\"commitDate\":\"%ci\",\"title\":\"%f\",\"committer\":\"%ce\"}' > " + join(COMMIT_DFS,datasetName + '.commits')
cmd = 'git -C ' + join(DATASET_PATH,
datasetName) + " log --no-merges --pretty=format:'{\"commit\":\"%H\",\"commitDate\":\"%ci\",\"title\":\"%f\",\"committer\":\"%ce\"}' > " + join(
COMMIT_DFS, datasetName + '.commits')
output = shellCallTemplate(cmd, enc='latin1')

from commitCollector import makeDF
rDF = makeDF(join(COMMIT_DFS,datasetName + '.commits'))
rDF = makeDF(join(COMMIT_DFS, datasetName + '.commits'))
save_zipped_pickle(rDF, join(COMMIT_DFS, datasetName + ".pickle"))
# return rDF
commits = rDF
return markBugFixingPatches(commits,datasetName)
return markBugFixingPatches(commits, datasetName)

def core():
datasets = pd.read_csv(join(ROOT,'data', 'datasets.csv'))
datasets = pd.read_csv(join(ROOT, 'data', 'datasets.csv'))
# repoList = ['FFmpeg','curl','nginx','openssl','redis','tmux','vlc']

pjList = PROJECT_LIST.split(',')
if not os.path.exists(DATASET_PATH):
os.mkdir(DATASET_PATH)

for repo,src in datasets.values.tolist():
if(pjList != ['ALL']):
for repo, src in datasets.values.tolist():
if (pjList != ['ALL']):
if repo in pjList:
print(repo)
cmd = 'git config --global http.postBuffer 157286400'
shellCallTemplate(cmd)
cmd = 'git -C ' + DATASET_PATH + ' clone ' + src
shellCallTemplate(cmd)
logging.info(repo)
collectBugFixPatches(repo)
print(repo)
cmd = 'git config --global http.postBuffer 157286400'
shellCallTemplate(cmd)
cmd = 'git -C ' + DATASET_PATH + ' clone ' + src
shellCallTemplate(cmd)
logging.info(repo)
collectBugFixPatches(repo)
else:
cmd = 'git -C ' + DATASET_PATH + ' clone ' + src
shellCallTemplate(cmd)
logging.info(repo)
collectBugFixPatches(repo)

def codeflaws():
cf = listdir(join(DATASET_PATH,'codeflaws'))

type = join(DATASET,'codeflaws')
def codeflaws():
cf = listdir(join(DATASET_PATH, 'codeflaws'))

type = join(DATASET, 'codeflaws')
folderDiff = join(type, 'DiffEntries')
folderPrev = join(type, 'prevFiles')
folderRev = join(type, 'revFiles')
@@ -308,9 +306,9 @@ def codeflaws():

if not os.path.exists(folderRev):
os.makedirs(folderRev)
cfBugs = [i for i in cf if os.path.isdir(join(DATASET_PATH,'codeflaws',i))]
cfBugs = [i for i in cf if os.path.isdir(join(DATASET_PATH, 'codeflaws', i))]
for cfBug in cfBugs:
bugs = [i for i in listdir(join(DATASET_PATH,'codeflaws',cfBug)) if i.endswith('.c')]
bugs = [i for i in listdir(join(DATASET_PATH, 'codeflaws', cfBug)) if i.endswith('.c')]
bugs.sort()
if len(bugs) == 2:
s1 = bugs[0].replace('.c', '').split('-')
@@ -318,12 +316,15 @@ def codeflaws():
prev = s1[-1]
rev = s2[-1]
bugName = '-'.join(s1[: -1])
shutil.copy(join(DATASET_PATH,'codeflaws',cfBug,bugs[0]),join(folderPrev,"prev_"+bugName+"-"+prev+"-"+rev+'.c'))
shutil.copy(join(DATASET_PATH,'codeflaws',cfBug,bugs[1]),join(folderRev,bugName+"-"+prev+"-"+rev+'.c'))
cmd = 'diff -u ' + join(DATASET_PATH,'codeflaws',cfBug,bugs[0]) + ' ' + join(DATASET_PATH,'codeflaws',cfBug,bugs[1])+ ' > ' + join(folderDiff,bugName+"-"+prev+"-"+rev+'.c.txt')
shutil.copy(join(DATASET_PATH, 'codeflaws', cfBug, bugs[0]),
join(folderPrev, "prev_" + bugName + "-" + prev + "-" + rev + '.c'))
shutil.copy(join(DATASET_PATH, 'codeflaws', cfBug, bugs[1]),
join(folderRev, bugName + "-" + prev + "-" + rev + '.c'))
cmd = 'diff -u ' + join(DATASET_PATH, 'codeflaws', cfBug, bugs[0]) + ' ' + join(
DATASET_PATH, 'codeflaws', cfBug, bugs[1]) + ' > ' + join(folderDiff,
bugName + "-" + prev + "-" + rev + '.c.txt')
logging.info(cmd)
output, e = shellGitCheckout(cmd)
logging.info(output)
else:
print()