Files
fixminer_source/python/linuxDataset.py
2020-04-06 21:30:39 +02:00

254 lines
8.6 KiB
Python

from common.commons import *
DATA_PATH = os.environ["DATA_PATH"]
COMMIT_DFS = os.environ["COMMIT_DFS"]
DATASET_PATH = '/Users/anil.koyuncu/projects/datasets'
repo = join(DATASET_PATH,'linux')
def filetype_fileter(filename):
    """Return True when *filename* is a C source (.c) or header (.h) file."""
    # str.endswith accepts a tuple, testing both extensions in one call.
    return filename.endswith((u'.c', u'.h'))
def checkoutFiles(sha, shaOld, filePath, type, repo=None):
    """Extract one file's diff hunks and its pre/post-change contents for a commit pair.

    Writes three artifacts under DATA_PATH/<type>/ (the '/' in the file path is
    flattened to '#' so it can be used in a filename):
      DiffEntries/<sha>_<shaOld>_<path>.txt   -- the hunk section of `git diff`
      revFiles/<sha>_<shaOld>_<path>          -- file content at `sha`
      prevFiles/prev_<sha>_<shaOld>_<path>    -- file content at `shaOld`

    Args:
        sha: (short) SHA of the fixing commit.
        shaOld: (short) SHA of the parent commit.
        filePath: repository-relative path of the changed file.
        type: subdirectory of DATA_PATH to write under (e.g. 'gumInputLinux').
        repo: path to the git working tree to run git commands in.

    On any git failure every partially written artifact is removed so a later
    run will retry this file; other errors are re-raised.
    """
    folderDiff = join(DATA_PATH, type, 'DiffEntries')
    folderPrev = join(DATA_PATH, type, 'prevFiles')
    folderRev = join(DATA_PATH, type, 'revFiles')
    savePath = filePath.replace('/', '#')
    try:
        for folder in (folderDiff, folderPrev, folderRev):
            if not os.path.exists(folder):
                os.mkdir(folder)
        diffTarget = folderDiff + '/' + sha + '_' + shaOld + '_' + savePath + '.txt'
        if not isfile(diffTarget):
            cmd = ('git -C ' + repo + ' diff -U ' + shaOld + ':' + filePath
                   + '..' + sha + ':' + filePath)
            output, errors = shellGitCheckout(cmd, 'latin1')
            if errors:
                raise FileNotFoundError
            # Keep only the hunk section ("@@ -a,b +c,d @@ ..." onwards).
            regex = r"@@\s\-\d+,*\d*\s\+\d+,*\d*\s@@ ?(.*\n)*"
            match = re.search(regex, output)
            if not match:
                return
            matched = match.group()
            numberOfHunks = re.findall(r'@@\s\-\d+,*\d*\s\+\d+,*\d*\s@@', matched)
            if len(numberOfHunks) == 0:
                return
            # Prepend the parent SHA and put each hunk header on its own line.
            diffFile = shaOld + '\n' + matched.replace(' @@ ', ' @@\n')
            with open(diffTarget, 'w') as writeFile:
                writeFile.writelines(diffFile)
            # Dump the post-change revision of the file.
            cmd = ('git -C ' + repo + ' show ' + sha + ':' + filePath + '> '
                   + folderRev + '/' + sha + '_' + shaOld + '_' + savePath)
            o, errors = shellGitCheckout(cmd, 'latin1')
            # BUG FIX: the original tested `errors` BEFORE running this command,
            # so it was checking the stale result of the previous git call.
            if errors:
                raise FileNotFoundError
            # Dump the pre-change (parent) revision of the file.
            cmd = ('git -C ' + repo + ' show ' + shaOld + ':' + filePath + '> '
                   + folderPrev + '/' + 'prev_' + sha + '_' + shaOld + '_' + savePath)
            o, errors = shellGitCheckout(cmd, 'latin1')
            if errors:
                raise FileNotFoundError
    except FileNotFoundError:
        # Remove any partially written artifacts. BUG FIX: the original removed
        # `savePath.replace('.java','.txt')` for the diff entry, which never
        # matches the '.c'/'.h' names this dataset uses, so the diff file was
        # written with '+ ".txt"' but never cleaned up on failure.
        for leftover in (
                folderRev + '/' + sha + '_' + shaOld + '_' + savePath,
                folderPrev + '/' + 'prev_' + sha + '_' + shaOld + '_' + savePath,
                folderDiff + '/' + sha + '_' + shaOld + '_' + savePath + '.txt'):
            if isfile(leftover):
                os.remove(leftover)
    except Exception as e:
        raise Exception(e)
def prepareFiles(t):
    """Check out the pre/post revisions of every modified C file in one commit.

    Args:
        t: a (sha, files) pair where `files` maps filename -> git status letter
           (as produced by checkCommitLog).

    Resolves 6-character short SHAs for the commit and its parent, then runs
    checkoutFiles() for each file with status 'M' and a .c/.h extension.
    Errors are printed and swallowed so a parallel run keeps going.
    """
    try:
        sha, files = t
        shaOld = sha + '^'
        gumInputRepo = join(DATA_PATH, 'gumInputLinux')
        if not os.path.exists(join(gumInputRepo)):
            os.makedirs(gumInputRepo)
        # Only modified (status 'M') C sources/headers are of interest.
        nonTest = [name for name, status in files.items()
                   if status == 'M' and (name.endswith('.c') or name.endswith(u'.h'))]
        # Shorten both SHAs to 6 characters for compact artifact file names.
        output, errors = shellGitCheckout(
            'git -C ' + repo + ' rev-parse --short=6 ' + shaOld, 'latin1')
        shaOld = output.strip()
        output, errors = shellGitCheckout(
            'git -C ' + repo + ' rev-parse --short=6 ' + sha, 'latin1')
        sha = output.strip()
        if isinstance(nonTest, list):
            for file in nonTest:
                checkoutFiles(sha, shaOld, file, 'gumInputLinux', repo)
    except Exception as e:
        print(e)
def checkCommitLog(x):
    """Return {filename: status-letter} for the files touched by commit *x*.

    Runs `git show --name-status -M100%` (renames only detected at 100%
    similarity) and parses each output line of the form "<STATUS>\t<path>".

    Args:
        x: a commit SHA in the module-level `repo`.

    Returns:
        dict mapping file path to its one-letter git status (M/A/D/...).
    """
    cmd = 'git -C ' + repo + ' show ' + x + " --pretty=\"format:\" --name-status -M100%"
    out, err = shellGitCheckout(cmd, enc='latin1')
    log = {}
    for line in out.strip().split('\n'):
        # line[:1] is the status letter, line[2:] the path after the tab.
        # NOTE(review): assumes a single-letter status; a rename below 100%
        # similarity would emit e.g. "R086" and break this slicing -- but
        # -M100% restricts renames to exact matches. TODO confirm.
        fname = line[2:].strip()
        ftype = line[:1]
        log[fname] = ftype
    # BUG FIX: removed the dead bare-expression statement `log` that the
    # original left before the return (a leftover debugging no-op).
    return log
def getCommitLog(x):
    """Collect commit *x*'s full message plus any 'Fixes:' tags and Bugzilla links.

    Args:
        x: a commit SHA in the module-level `repo`.

    Returns:
        A one-row pandas DataFrame with columns ['links', 'fixes', 'log',
        'commit'], where `fixes` is the list of 'Fixes: <sha>' tags and
        `links` the list of show_bug.cgi URLs found in the message.
    """
    cmd = 'git -C ' + repo + '/ ' + " show --pretty=format:'%B' --no-patch " + x
    output = shellCallTemplate(cmd, 'latin-1')
    # 'Fixes: <sha>' tags identify the commit this patch repairs.
    fixes = [m.group() for m in re.finditer('Fixes: [0-9a-f]{6,40}', output)]
    # Bugzilla 'show_bug.cgi?id=NNN' URLs referenced in the message.
    links = [m.group() for m in
             re.finditer('http[s]?:\/\/.*show_bug\.cgi\?id=[0-9]*', output)]
    # BUG FIX: removed the unreachable bare `output` statement the original
    # left after the return.
    df = pd.DataFrame(data=[[links, fixes, output, x]],
                      columns=['links', 'fixes', 'log', 'commit'])
    return df
def collectBugFixPatches():
    """Select bug-fixing commits and check out their changed file pairs.

    A commit qualifies when it modifies at least one file, every changed file
    is a .c source, and its log carries at least one 'Fixes:' tag. The
    qualifying (commit, files) pairs are handed to prepareFiles() in parallel.
    """
    commits = getAllCommits()
    # Drop commits that only add or delete files (no 'M' status at all).
    commits = commits[commits.files.apply(
        lambda x: np.any([i == 'M' for i in x.values()]))]
    # Keep commits whose changed files are exclusively C sources (.c).
    commits = commits[commits.files.apply(
        lambda x: np.all([i.endswith('.c') for i in x.keys()]))]
    # Bug fixes are identified by a non-empty 'fixes' tag list.
    fixes = commits[commits.fixes.str.len() != 0].commit.values.tolist()
    bugs = set(fixes)
    a = commits[commits.commit.isin(bugs)]
    print(len(a))
    parallelRun(prepareFiles, a[['commit', 'files']].values.tolist())
def markBugFixingPatches(commits):
    """Enrich C-touching commits with message metadata and persist the result.

    Args:
        commits: DataFrame of commits with at least 'commit' and 'isC' columns.

    Fetches each commit's log (plus Fixes/Bugzilla metadata) in parallel via
    getCommitLog, merges it onto the commit rows, and saves the enriched
    frame as COMMIT_DFS/linuxFix.pickle.
    """
    commits = commits[commits.isC == True]
    # One row per commit: links/fixes/log columns from the commit message.
    logsDF = parallelRunMergeNew(getCommitLog, commits['commit'].values.tolist())
    enriched = pd.merge(commits, logsDF, on=['commit'])
    save_zipped_pickle(enriched, join(COMMIT_DFS, 'linuxFix' + ".pickle"))
def getAllCommits():
    """Return the enriched Linux commit DataFrame, building caches on demand.

    Cache chain:
      1. COMMIT_DFS/linuxFix.pickle (fully enriched) -- returned directly.
      2. COMMIT_DFS/linux.pickle (raw commit list)   -- enriched via
         markBugFixingPatches, then the saved result is returned.
      3. Neither cache -- `git log` is dumped to DATA_PATH/linux.commits,
         parsed into a DataFrame, saved as linux.pickle, and returned raw.
    """
    if isfile(join(COMMIT_DFS, 'linuxFix.pickle')):
        return load_zipped_pickle(join(COMMIT_DFS, 'linuxFix.pickle'))
    if isfile(join(COMMIT_DFS, 'linux.pickle')):
        commits = load_zipped_pickle(join(COMMIT_DFS, 'linux.pickle'))
    else:
        if not os.path.exists(COMMIT_DFS):
            os.mkdir(COMMIT_DFS)
        # One JSON object per commit, redirected into DATA_PATH/linux.commits.
        cmd = 'git -C ' + repo + " log --no-merges --pretty=format:'{\"commit\":\"%H\",\"commitDate\":\"%ci\",\"title\":\"%f\",\"committer\":\"%ce\"}' > " + join(DATA_PATH, 'linux' + '.commits')
        output = shellCallTemplate(cmd, enc='latin1')
        from commitCollector import makeDF
        rDF = makeDF(join(DATA_PATH, 'linux' + '.commits'))
        save_zipped_pickle(rDF, join(COMMIT_DFS, 'linux' + ".pickle"))
        return rDF
    markBugFixingPatches(commits)
    # BUG FIX: the original fell off the end here and returned None even
    # though markBugFixingPatches() had just saved linuxFix.pickle; load and
    # return that saved result so callers always get a DataFrame.
    return load_zipped_pickle(join(COMMIT_DFS, 'linuxFix.pickle'))