from common.commons import *

# All locations are injected through environment variables so the same script
# runs unchanged on different machines / CI setups.
DATA_PATH = os.environ["DATA_PATH"]        # root for intermediate data artifacts
COMMIT_DFS = os.environ["COMMIT_DFS"]      # directory holding pickled commit DataFrames
DATASET_PATH = os.environ["REPO_PATH"]     # directory containing the cloned git repositories
DATASET = os.environ["dataset"]            # output root for checked-out files / diffs
ROOT = os.environ["ROOT_DIR"]              # project root (location of data/datasets.csv)
def filetype_fileter(filename):
    """Return True when *filename* is a C source or header file (.c / .h)."""
    return filename.endswith(('.c', '.h'))
def checkoutFiles(sha, shaOld, filePath, type, repo=None):
    """Materialize the diff, revised file and previous file for one commit/file pair.

    Runs ``git diff`` / ``git show`` against *repo* for *filePath* between
    *shaOld* and *sha*, writing three artifacts under the *type* directory:

      - ``DiffEntries/<sha>_<shaOld>_<flatpath>.txt`` : the diff hunks
      - ``revFiles/<sha>_<shaOld>_<flatpath>``        : file content at *sha*
      - ``prevFiles/prev_<sha>_<shaOld>_<flatpath>``  : file content at *shaOld*

    On any git error the partially written artifacts are removed again.
    Returns None in all cases.
    """
    try:
        folderDiff = join(type, 'DiffEntries')
        folderPrev = join(type, 'prevFiles')
        folderRev = join(type, 'revFiles')
        for folder in (folderDiff, folderPrev, folderRev):
            if not os.path.exists(folder):
                os.mkdir(folder)

        # '/' cannot appear in a file name, so flatten the repo-relative path.
        savePath = filePath.replace('/', '#')
        diffPath = folderDiff + '/' + sha + '_' + shaOld + '_' + savePath + '.txt'

        if not isfile(diffPath):
            cmd = ('git -C ' + repo + ' diff -U ' + shaOld + ':' + filePath
                   + '..' + sha + ':' + filePath)
            output, errors = shellGitCheckout(cmd, enc='latin1')
            if errors:
                raise FileNotFoundError

            # Keep everything from the first hunk header onwards.
            match = re.search(r"@@\s\-\d+,*\d*\s\+\d+,*\d*\s@@ ?(.*\n)*", output)
            if not match:
                return
            matched = match.group()
            if len(re.findall(r'@@\s\-\d+,*\d*\s\+\d+,*\d*\s@@', matched)) == 0:
                return

            diffFile = shaOld + '\n' + matched.replace(' @@ ', ' @@\n')
            with open(diffPath, 'w') as writeFile:
                writeFile.writelines(diffFile)

            # Revised version of the file (content at sha).
            cmd = ('git -C ' + repo + ' show ' + sha + ':' + filePath + '> '
                   + folderRev + '/' + sha + '_' + shaOld + '_' + savePath)
            o, errors = shellGitCheckout(cmd, enc='latin1')
            # BUGFIX: check errors AFTER each command runs; the original tested
            # the previous command's (already-cleared) errors before executing.
            if errors:
                raise FileNotFoundError

            # Previous version of the file (content at shaOld).
            cmd = ('git -C ' + repo + ' show ' + shaOld + ':' + filePath + '> '
                   + folderPrev + '/' + 'prev_' + sha + '_' + shaOld + '_' + savePath)
            o, errors = shellGitCheckout(cmd, enc='latin1')
            if errors:
                raise FileNotFoundError

    except FileNotFoundError:
        # Best-effort cleanup of whatever was written before the failure.
        if isfile(folderRev + '/' + sha + '_' + shaOld + '_' + savePath):
            os.remove(folderRev + '/' + sha + '_' + shaOld + '_' + savePath)
        if isfile(folderPrev + '/' + 'prev_' + sha + '_' + shaOld + '_' + savePath):
            os.remove(folderPrev + '/' + 'prev_' + sha + '_' + shaOld + '_' + savePath)
        # BUGFIX: the diff is written as savePath + '.txt'; the original removed
        # savePath.replace('.java', '.txt'), which never matched for .c/.h files,
        # so a stale/partial diff file was left behind.
        if isfile(diffPath):
            os.remove(diffPath)
    except Exception as e:
        raise Exception(e)
def prepareFiles(t, dsName):
    """Check out diff/prev/rev artifacts for every modified C file of one commit.

    *t* is a ``(sha, files)`` pair where *files* maps file path -> git status
    letter (as produced by checkCommitLog). Only files with status 'M' ending
    in .c or .h are processed. Exceptions are printed and swallowed so one bad
    commit does not abort a parallel run.
    """
    try:
        sha, files = t
        shaOld = sha + '^'   # parent commit

        gumInputRepo = join(DATASET, dsName)
        if not os.path.exists(gumInputRepo):
            os.makedirs(gumInputRepo)

        # Modified C sources/headers only (adds/deletes carry no diff pair).
        nonTest = [f for f, status in files.items()
                   if status == 'M' and (f.endswith('.c') or f.endswith('.h'))]

        repo = join(DATASET_PATH, dsName)

        # Normalize both revisions to short (6-char) hashes for file naming.
        cmd = 'git -C ' + repo + ' rev-parse --short=6 ' + shaOld
        output, errors = shellGitCheckout(cmd, enc='latin1')
        shaOld = output.strip()

        cmd = 'git -C ' + repo + ' rev-parse --short=6 ' + sha
        output, errors = shellGitCheckout(cmd, enc='latin1')
        sha = output.strip()

        for file in nonTest:
            checkoutFiles(sha, shaOld, file, gumInputRepo, repo)

    except Exception as e:
        # Deliberate best-effort: report and continue with the next commit.
        print(e)
def checkCommitLog(x, dsName):
    """Return a one-row DataFrame mapping commit *x* to its changed files.

    Runs ``git show --name-status`` (with -M100% so only exact renames are
    detected) and parses each output line as ``<status-letter>\\t<path>``.
    Columns: 'files' (dict path -> status letter) and 'commit'.
    """
    cmd = ('git -C ' + join(DATASET_PATH, dsName) + ' show ' + x
           + " --pretty=\"format:\" --name-status -M100%")

    out, err = shellGitCheckout(cmd, enc='latin1')

    log = {}
    lines = out.strip().split('\n')
    for line in lines:
        fname = line[2:].strip()   # path after the status letter and tab
        ftype = line[:1]           # single-letter status (M/A/D/...)
        log[fname] = ftype
    # NOTE: removed a stray no-op expression statement (`log`) that did nothing.

    df = pd.DataFrame(data=[[log, x]], columns=['files', 'commit'])
    return df
def getCommitLog(x, dsName):
    """Return a one-row DataFrame with the commit message of *x* and any
    fix*/bug* keywords found in it (columns: 'fixes', 'log', 'commit')."""
    cmd = ('git -C ' + join(DATASET_PATH, dsName) + '/ '
           + " show --pretty=format:'%B' --no-patch " + x)

    output = shellCallTemplate(cmd, 'latin-1')

    # Case-insensitive scan for words starting with "fix" or "bug".
    keyword_hits = re.finditer(r"\bfix[a-zA-Z]*|\bbug[a-zA-Z]*", output, re.I)
    fixes = [hit.group() for hit in keyword_hits]

    return pd.DataFrame(data=[[fixes, output, x]],
                        columns=['fixes', 'log', 'commit'])
# NOTE: removed a stray module-level `output` expression that raised NameError
# as soon as this module was imported.
def collectBugFixPatches(dsName):
    """Select bug-fixing commits of *dsName* that modify only .c files and
    materialize their diff/prev/rev artifacts in parallel."""
    commits = getAllCommits(dsName)

    # Drop commits that only add or delete files: we need a modification.
    modifies_something = commits.files.apply(
        lambda fs: any(status == 'M' for status in fs.values()))
    commits = commits[modifies_something]

    # Keep commits whose changed files are exclusively .c sources.
    only_c_files = commits.files.apply(
        lambda fs: all(name.endswith('.c') for name in fs.keys()))
    commits = commits[only_c_files]

    # A commit counts as a fix when its message contained fix*/bug* keywords.
    fix_shas = commits[commits.fixes.str.len() != 0].commit.values.tolist()
    commits = commits[commits.commit.isin(fix_shas)]
    print(len(commits))

    parallelRun(prepareFiles, commits[['commit', 'files']].values.tolist(), dsName)
def markBugFixingPatches(commits, dsName):
    """Augment *commits* with per-commit file lists and fix-keyword columns,
    pickle the result as <dsName>Fix.pickle and return it."""
    # Attach the changed-files dict of every commit.
    files_df = parallelRunMergeNew(checkCommitLog, commits['commit'].values.tolist(), dsName)
    commits = pd.merge(commits, files_df, on=['commit'])

    # Attach the commit message and extracted fix/bug keywords.
    logs_df = parallelRunMergeNew(getCommitLog, commits['commit'].values.tolist(), dsName)
    res = pd.merge(commits, logs_df, on=['commit'])

    save_zipped_pickle(res, join(COMMIT_DFS, dsName + 'Fix' + ".pickle"))
    return res
def getAllCommits(datasetName):
    """Return the annotated commit DataFrame for *datasetName*, building and
    caching it (as pickles under COMMIT_DFS) on first use."""
    fix_pickle = join(COMMIT_DFS, datasetName + 'Fix.pickle')
    if isfile(fix_pickle):
        # Fully annotated result already cached.
        return load_zipped_pickle(fix_pickle)

    raw_pickle = join(COMMIT_DFS, datasetName + '.pickle')
    if isfile(raw_pickle):
        commits = load_zipped_pickle(raw_pickle)
    else:
        if not os.path.exists(COMMIT_DFS):
            os.mkdir(COMMIT_DFS)

        # Dump the raw log as one JSON object per commit, then parse it.
        cmd = 'git -C ' + join(DATASET_PATH, datasetName) + " log --no-merges --pretty=format:'{\"commit\":\"%H\",\"commitDate\":\"%ci\",\"title\":\"%f\",\"committer\":\"%ce\"}' > " + join(COMMIT_DFS, datasetName + '.commits')
        output = shellCallTemplate(cmd, enc='latin1')

        from commitCollector import makeDF
        commits = makeDF(join(COMMIT_DFS, datasetName + '.commits'))
        save_zipped_pickle(commits, raw_pickle)

    return markBugFixingPatches(commits, datasetName)
def core():
    """Clone every repo listed in data/datasets.csv and collect its bug-fix patches.

    The CSV is expected to have two columns: repo name and clone URL.
    """
    datasets = pd.read_csv(join(ROOT, 'data', 'datasets.csv'))
    # NOTE: removed unused locals (a listdir() scan and a repoList) that were
    # computed but never read.
    for repo, src in datasets.values.tolist():
        # Cloning an already-present repo is a no-op (git just errors out).
        cmd = 'git -C ' + DATASET_PATH + ' clone ' + src
        shellCallTemplate(cmd)
        logging.info(repo)
        collectBugFixPatches(repo)
def codeflaws():
    """Import the codeflaws benchmark into the DiffEntries/prevFiles/revFiles layout.

    Each codeflaws bug directory is expected to contain exactly two .c files
    (buggy and fixed version, named <bug>-<id1>.c / <bug>-<id2>.c in sort
    order). The buggy file goes to prevFiles, the fixed one to revFiles, and a
    unified diff of the pair to DiffEntries.
    """
    cf = listdir(join(DATASET_PATH, 'codeflaws'))

    # Renamed from `type` to avoid shadowing the builtin.
    out_root = join(DATASET, 'codeflaws')
    folderDiff = join(out_root, 'DiffEntries')
    folderPrev = join(out_root, 'prevFiles')
    folderRev = join(out_root, 'revFiles')
    for folder in (folderDiff, folderPrev, folderRev):
        os.makedirs(folder, exist_ok=True)

    cfBugs = [i for i in cf if os.path.isdir(join(DATASET_PATH, 'codeflaws', i))]
    for cfBug in cfBugs:
        bug_dir = join(DATASET_PATH, 'codeflaws', cfBug)
        bugs = sorted(i for i in listdir(bug_dir) if i.endswith('.c'))
        if len(bugs) != 2:
            # BUGFIX: was a bare print() that gave no diagnostic at all.
            logging.warning('Skipping codeflaws entry %s: expected 2 .c files, found %d',
                            cfBug, len(bugs))
            continue

        s1 = bugs[0].replace('.c', '').split('-')
        s2 = bugs[1].replace('.c', '').split('-')
        prev = s1[-1]             # version id of the buggy file
        rev = s2[-1]              # version id of the fixed file
        bugName = '-'.join(s1[:-1])
        pair = bugName + "-" + prev + "-" + rev

        shutil.copy(join(bug_dir, bugs[0]), join(folderPrev, "prev_" + pair + '.c'))
        shutil.copy(join(bug_dir, bugs[1]), join(folderRev, pair + '.c'))

        cmd = ('diff -u ' + join(bug_dir, bugs[0]) + ' ' + join(bug_dir, bugs[1])
               + ' > ' + join(folderDiff, pair + '.c.txt'))
        logging.info(cmd)
        output, e = shellGitCheckout(cmd)
        logging.info(output)