Files
fixminer_source/python/filterBugFixingCommits.py
2020-04-06 21:30:39 +02:00

157 lines
5.6 KiB
Python

from common.commons import *
# Environment-driven configuration: every path below must be exported
# before this module is imported, or the import fails with KeyError.
ROOT_DIR = os.environ["ROOT_DIR"]          # project root (not referenced in this file's visible code)
REPO_PATH = os.environ["REPO_PATH"]        # parent directory of the cloned git repositories (see markFix/getBugIds)
DATA_PATH = os.environ["DATA_PATH"]        # holds subjects.csv (Subject -> Repo/Branch mapping)
COMMIT_DFS = os.environ["COMMIT_DFS"]      # per-repo commit DataFrames, stored as '<repo>.pickle'
BUG_POINT = os.environ["BUG_POINT"]        # output dir for per-bug '<bugID>.pickle' files (see getLast)
COMMIT_FOLDER = os.environ["COMMIT_FOLDER"]  # presumably per-commit artifacts — not used in this view; TODO confirm
def getLast(bugID):
    """Persist the commit point preceding the fix commit of *bugID*.

    Loads the commit DataFrame of the repo owning ``bugID`` (format
    ``SUBJECT-123``), finds the earliest commit marked as fixing that bug,
    and saves one commit dated strictly before it to
    ``BUG_POINT/<bugID>.pickle``. A no-op when that pickle already exists.

    :param bugID: bug identifier string, e.g. ``'LANG-42'``.
    :returns: None (result is written to disk).
    """
    if isfile(join(BUG_POINT, bugID + ".pickle")):
        return  # already computed for this bug
    subject = bugID.split('-')[0]
    subjects = pd.read_csv(join(DATA_PATH, 'subjects.csv'))
    repo = subjects.query("Subject == '{0}'".format(subject)).iloc[0].Repo
    sourceDF = load_zipped_pickle(join(COMMIT_DFS, repo + '.pickle'))
    # Normalize fix ids for the equality test below; leave None untouched.
    sourceDF['fix'] = sourceDF['fix'].apply(lambda x: x.strip() if x is not None else x)
    aDf = sourceDF[sourceDF.fix == bugID]
    # BUG FIX: original read `len(aDf > 0)`, which compares the whole frame
    # element-wise; the intended test is whether any row matched the bug id.
    if len(aDf) > 0:
        # Earliest commit that claims to fix this bug.
        dateCheck = aDf.sort_values('commitDate').iloc[0].commitDate
        # .copy() so the column assignment below doesn't hit a view
        # (pandas SettingWithCopy warning on the original code).
        filtered = sourceDF.query("commitDate < '{0}'".format(dateCheck)).copy()
        filtered['dateCheck'] = dateCheck
        # NOTE(review): head(1) takes the first pre-fix commit in the frame's
        # existing order, not necessarily the latest one — preserved as-is.
        filtered = filtered.head(1)
        save_zipped_pickle(filtered, join(BUG_POINT, bugID + ".pickle"))
def markFix(subject, repoName):
    """Check out *repoName*'s configured branch and tag bug-fixing commits.

    Reads the branch for *subject* from ``subjects.csv``, forces a git
    checkout (retrying until stderr confirms the branch), then adds a
    ``fix`` column to the repo's commit DataFrame by scanning each commit
    message for a ``SUBJECT-123`` bug id via :func:`getBugIds`.

    :param subject: issue-tracker subject key, e.g. ``'LANG'``.
    :param repoName: repository directory name under ``REPO_PATH``.
    :returns: the commit DataFrame with the new ``fix`` column.
    """
    subjects = pd.read_csv(join(DATA_PATH, 'subjects.csv'))
    branch = subjects.query("Subject == '{0}'".format(subject))['Branch'].values.tolist()[0]
    cmd = 'git -C ' + join(REPO_PATH, repoName) + ' checkout -f ' + branch
    output, err = shellGitCheckout(cmd, enc='latin1')
    # BUG FIX: the original loop never re-ran the checkout or re-evaluated
    # the match, so a first failed checkout spun forever. Retry until git's
    # stderr mentions the branch name (checkout confirmation goes to stderr).
    while not re.search(branch, err):
        time.sleep(10)
        logging.info('Waiting for checkout')
        output, err = shellGitCheckout(cmd, enc='latin1')
    aDF = load_zipped_pickle(join(COMMIT_DFS, repoName + '.pickle'))
    aDF['fix'] = aDF['commit'].apply(lambda x: getBugIds(x, subject, repoName))
    return aDF
def getBugIds(x, subject, repoName):
    """Extract a ``SUBJECT-123`` style bug id from commit *x*'s message.

    Runs ``git show --quiet`` for the commit and searches the output for a
    bug id preceded by a separator character (whitespace, ``:[(#-/``).

    :param x: commit hash.
    :param subject: issue-tracker subject key, e.g. ``'LANG'``.
    :param repoName: repository directory name under ``REPO_PATH``.
    :returns: the matched bug id (as written in the message) or None.
    """
    # BUG FIX: original used `REPO_PATH+repoName` with no path separator;
    # use join() for consistency with markFix.
    cmd = 'git -C ' + join(REPO_PATH, repoName) + ' show --quiet ' + x
    output = shellCallTemplate(cmd)
    # Raw string throughout: the original's non-raw "\-" suffix was an
    # invalid string escape (DeprecationWarning); '-' needs no escaping here.
    pattern = r"[\s:\[\(#\-/](" + subject + r"-[0-9]+)"
    match = re.search(pattern, output, re.IGNORECASE)
    if not match:
        return None
    if len(match.groups()) == 1:
        return match.group(1)
    # BUG FIX: the original left `matched` unbound on this branch (a bare
    # `matched` expression), raising NameError at return. The pattern has
    # exactly one group, so this is unreachable in practice; fail soft.
    logging.error('too many match groups')
    return None
# def getLasts(subjec,predict = False):
# if not os.path.exists(BUG_POINT):
# os.mkdir(BUG_POINT)
#
# if predict:
# bugIDS = load_zipped_pickle(join(CODE_PATH, subjec+'BugReportsExport.pickle'))
# logging.info("Extracting bug points for prediction")
# selectedIds = bugIDS.bugID.unique().tolist()
# else:
# subjects = pd.read_csv(join(CODE_PATH, 'subjects.csv'))
# repo = subjects.query("Subject == '{0}'".format(subjec)).iloc[0].Repo
#
# sourceDF = load_zipped_pickle(join(COMMIT_DFS, repo + '.pickle'))
#
# selectedIds = sourceDF.fix.unique().tolist()
# selectedIds = [i for i in selectedIds if i is not None]
# if subjec != 'ALL':
# selectedIds = [i for i in selectedIds if i.startswith(subjec)]
#
# with concurrent.futures.ProcessPoolExecutor() as executor:
# try:
# futures = {executor.submit(getLast, bugID ): bugID for bugID in selectedIds }
# for future in concurrent.futures.as_completed(futures):
# url = futures[future]
# try:
# data = future.result()
#
# except Exception as exc:
# logging.error('%r generated an exception: %s' % (url, exc))
# raise
# kwargs = {
# 'total': len(futures),
# 'unit': 'files',
# 'unit_scale': True,
# 'leave': False
# }
# # Print out the progress as tasks complete
# for f in tqdm(concurrent.futures.as_completed(futures), **kwargs):
# pass
# except Exception as e:
# logging.error(e)
# executor.shutdown()
# raise
def caseFix(subjec):
    """Mark bug-fixing commits for one subject (or ``'ALL'``) in parallel.

    Fans :func:`markFix` out over a process pool, keeps only commits whose
    message matched a bug id, restricts to bug ids referenced by exactly
    one commit, and overwrites each repo's commit pickle with that subset.

    :param subjec: subject key to process, or ``'ALL'`` for every subject.
    :raises: re-raises the first worker exception after logging it.
    """
    # Lift git's rename limit so diffs in large repos still detect renames.
    cmd = 'git config --global diff.renamelimit 0'
    shellCallTemplate(cmd)
    logging.info("Marking fixes")
    subjects = pd.read_csv(join(DATA_PATH, 'subjects.csv'))
    if subjec != 'ALL':
        subjects = subjects.query("Subject == '{0}'".format(subjec))
    with concurrent.futures.ProcessPoolExecutor() as executor:
        try:
            futures = {
                executor.submit(markFix, subject,
                                subjects.query("Subject == '{0}'".format(subject)).iloc[0].Repo): subject
                for subject in subjects.Subject.tolist()}
            for future in concurrent.futures.as_completed(futures):
                url = futures[future]  # the subject key for this future
                try:
                    data = future.result()
                    # Drop commits with no bug id, then normalize the ids.
                    data = data[~data.fix.isna()]
                    data.fix = data.fix.apply(lambda x: x.strip().upper())
                    # Bug ids referenced by exactly one commit.
                    # NOTE(review): the 'index' column name from reset_index
                    # changed in newer pandas — verify against pinned version.
                    singleFix = data.fix.value_counts().loc[lambda x: x == 1].reset_index(name='count')['index']
                    singleCommits = data[data.fix.isin(singleFix)]
                    save_zipped_pickle(singleCommits, join(COMMIT_DFS, subjects.query("Subject == '{0}'".format(url)).iloc[0].Repo + ".pickle"))
                except Exception as exc:
                    logging.error('%r generated an exception: %s' % (url, exc))
                    raise
            kwargs = {
                'total': len(futures),
                'unit': 'subject',
                'unit_scale': True,
                'leave': False
            }
            # Print out the progress as tasks complete (futures are already
            # consumed above, so this drains instantly — preserved as-is).
            for f in tqdm(concurrent.futures.as_completed(futures), **kwargs):
                pass
        except Exception as e:
            # BUG FIX: the log call was commented out, so the failure was
            # re-raised without any record of which error aborted the run.
            logging.error(e)
            executor.shutdown()
            raise