merge python scripts
This commit is contained in:
@@ -0,0 +1,210 @@
|
||||
from common.commons import *
|
||||
|
||||
# CODE_PATH = os.environ["CODE_PATH"]
|
||||
DATA_PATH = os.environ["DATA_PATH"]
|
||||
BUG_REPORT_PATH = os.environ["BUG_REPORT"]
|
||||
COMMIT_DFS = os.environ["COMMIT_DFS"]
|
||||
|
||||
import bs4 as bs
|
||||
|
||||
import re
|
||||
def parseCore(br):
|
||||
columns = ['bugReport', 'summary', 'description','created','updated','resolved','reporterDN','reporterEmail','hasAttachment','attachmentTime','hasPR','commentsCount']
|
||||
bugReport = pd.DataFrame(columns=columns)
|
||||
ind = 0
|
||||
if isfile(join(BUG_REPORT_PATH, br)):
|
||||
|
||||
with open(join(BUG_REPORT_PATH, br), 'rb') as f:
|
||||
the_page = p.load(f)
|
||||
soup = bs.BeautifulSoup(the_page, "html.parser")
|
||||
|
||||
if not (br.startswith('show')):
|
||||
|
||||
type = soup.find('span', {'id': 'type-val'})
|
||||
status = soup.find('span', {'id': 'status-val'})
|
||||
|
||||
if type is None:
|
||||
# print(br)
|
||||
return;
|
||||
if (type.text.strip() == 'Bug' or type.text.strip() =='Defect'):
|
||||
if (status.text.strip() == 'Resolved' or status.text.strip() == 'Closed'):
|
||||
|
||||
summary = soup.find('h1', {'id': 'summary-val'}).text
|
||||
desc = soup.find('div', {'id': 'description-val'})
|
||||
|
||||
created= soup.find('span', {'data-name': 'Created'}).time['datetime']
|
||||
updated = soup.find('span', {'data-name': 'Updated'}).time['datetime']
|
||||
resolved = soup.find('span', {'data-name': 'Resolved'}).time['datetime']
|
||||
|
||||
# jboss reporter is different
|
||||
reporterField = soup.find('span',{'id':'reporter-val'})
|
||||
try:
|
||||
reporterInfo= reporterField.findAll('span', {'class': 'user-hover'})[0]['data-user']
|
||||
reporterDict = eval(reporterInfo)
|
||||
reporterDN = reporterDict['displayName']
|
||||
reporterEmail = reporterDict['emailAddress']
|
||||
except KeyError as e:
|
||||
|
||||
e
|
||||
reporterDN = reporterField.text.strip()
|
||||
reporterEmail = None
|
||||
except Exception as e:
|
||||
e
|
||||
reporterDN = reporterField.text.strip()
|
||||
reporterEmail = None
|
||||
|
||||
attachment = soup.find('dd', {'class': 'attachment-date'})
|
||||
isAttachment = False
|
||||
isPR = False
|
||||
attachmentTime = None
|
||||
if attachment is not None and len(attachment) > 0:
|
||||
attachmentTime = attachment.parent.time['datetime']
|
||||
isAttachment = True
|
||||
|
||||
hasPullReq = soup.find('strong', {'title': 'Git Pull Request'})
|
||||
if hasPullReq is not None:
|
||||
haveAttachment = hasPullReq.parent.findAll('a')
|
||||
if(haveAttachment is not None and len(haveAttachment)>0):
|
||||
isPR = True
|
||||
else:
|
||||
linkPullRequest = soup.find('ul', {'id': 'issuedetails'}).find('a', {'title': 'PullRequest'})
|
||||
if(linkPullRequest is not None and len(linkPullRequest)> 0):
|
||||
isPR = True
|
||||
|
||||
commentsCount = len(re.findall(r'\"comment-[0-9]+', soup.getText()))
|
||||
if desc is not None:
|
||||
# bugReport.loc[ind] = [br, summary, desc.text,created,updated,resolved,reporterDN,reporterEmail,isAttachment,attachmentTime,isPR,commentsCount]
|
||||
return [br, summary, desc.text, created, updated, resolved, reporterDN,
|
||||
reporterEmail, isAttachment, attachmentTime, isPR, commentsCount]
|
||||
# ind += 1
|
||||
else:
|
||||
# bugReport.loc[ind] = [br, summary, None, created, updated, resolved, reporterDN, reporterEmail,
|
||||
# isAttachment,attachmentTime,isPR,commentsCount]
|
||||
return [br, summary, None, created, updated, resolved, reporterDN, reporterEmail,
|
||||
isAttachment,attachmentTime,isPR,commentsCount]
|
||||
# ind += 1
|
||||
# else:
|
||||
# print(type.text.strip())
|
||||
else:
|
||||
importance = (
|
||||
soup.find('a', {'href': 'page.cgi?id=fields.html#importance'}).parent.parent.parent.td.text.strip())
|
||||
status = soup.find('span', {'id': 'static_bug_status'}).text.strip()
|
||||
|
||||
m1 = re.search('enhancement', importance, re.IGNORECASE)
|
||||
# if not m:
|
||||
# print(br)
|
||||
if not m1:
|
||||
m = re.search('FIXED|DUPLICATE', status)
|
||||
if m:
|
||||
summary = soup.find('span', {'id': 'short_desc_nonedit_display'}).text
|
||||
comment0 = soup.find('div', {'id': 'c0'})
|
||||
if comment0.find('a').text == 'Description':
|
||||
desc = comment0.find('pre', {'class': 'bz_comment_text'}).text
|
||||
bugReport.loc[ind] = [br, summary, desc]
|
||||
ind += 1
|
||||
# desc
|
||||
# return bugReport
|
||||
|
||||
def parallelRun(coreFun,elements,*args):
|
||||
with concurrent.futures.ProcessPoolExecutor() as executor:
|
||||
try:
|
||||
dataL = []
|
||||
futures = {executor.submit(coreFun, l,*args): l for l in elements}
|
||||
for future in concurrent.futures.as_completed(futures):
|
||||
url = futures[future]
|
||||
try:
|
||||
data = future.result()
|
||||
dataL.append(data)
|
||||
|
||||
except Exception as exc:
|
||||
logging.error('%r generated an exception: %s' % (url, exc))
|
||||
raise
|
||||
|
||||
kwargs = {
|
||||
'total': len(futures),
|
||||
'unit': 'files',
|
||||
'unit_scale': True,
|
||||
'leave': False
|
||||
}
|
||||
# Print out the progress as tasks complete
|
||||
for f in tqdm(concurrent.futures.as_completed(futures), **kwargs):
|
||||
pass
|
||||
newData = pd.concat(dataL)
|
||||
return newData
|
||||
except Exception as e:
|
||||
executor.shutdown()
|
||||
raise
|
||||
|
||||
def getCommitter(x):
|
||||
subject = x.split('-')[0]
|
||||
subjects = pd.read_csv(join(DATA_PATH, 'subjects.csv'))
|
||||
repo = subjects.query("Subject == '{0}'".format(subject)).Repo.tolist()[0]
|
||||
commits = load_zipped_pickle(join(COMMIT_DFS, repo+'.pickle'))
|
||||
commits
|
||||
committer = commits.query("fix =='{0}'".format(x)).committer.tolist()
|
||||
# print(len(committer))
|
||||
return committer
|
||||
|
||||
|
||||
|
||||
def step1(subject):
|
||||
|
||||
if subject == 'ALL':
|
||||
bids = os.listdir(BUG_REPORT_PATH)
|
||||
else:
|
||||
bids = [f for f in os.listdir(BUG_REPORT_PATH) if f.startswith(subject)]
|
||||
bids = [i for i in bids if not i.startswith('.')]
|
||||
|
||||
# bids = bids[:100]
|
||||
|
||||
|
||||
|
||||
dataL = parallelRunMerge(parseCore,bids)
|
||||
logging.info('Finish parsing')
|
||||
# list(filter(None.__ne__, dataL))
|
||||
br = pd.DataFrame(
|
||||
columns=['bugReport', 'summary', 'description', 'created', 'updated', 'resolved', 'reporterDN', 'reporterEmail',
|
||||
'hasAttachment', 'attachmentTime', 'hasPR', 'commentsCount'], data=list(filter(None.__ne__, dataL)))
|
||||
# br = pd.concat(dataL)
|
||||
logging.info('Finish parsing')
|
||||
br['project'] = br.bugReport.apply(lambda x: x.split('-')[0])
|
||||
br['bid'] = br.bugReport.apply(lambda x: x.split('.')[0])
|
||||
br['committerEmail'] = br.bid.apply(lambda x: getCommitter(x))
|
||||
|
||||
br['sameEmail'] = br.apply(lambda x:x['reporterEmail'] in x['committerEmail'],axis=1)
|
||||
subjects = pd.read_csv(join(DATA_PATH, 'subjects.csv'))
|
||||
subjects['Group']=subjects.Group.apply(lambda x:x.replace('Commons','Apache').replace('Wildfly','Jboss').upper())
|
||||
save_zipped_pickle(br,join(DATA_PATH,subject+"bugReportsComplete.pickle"))
|
||||
|
||||
|
||||
def step3(subject):
|
||||
subjects = pd.read_csv(join(DATA_PATH, 'subjects.csv'))
|
||||
subjects['Group']=subjects.Group.apply(lambda x:x.replace('Commons','Apache').replace('Wildfly','Jboss').upper())
|
||||
|
||||
stats = load_zipped_pickle(join(DATA_PATH, subject + "bugReportsComplete.pickle"))
|
||||
|
||||
stats = stats[['bugReport', 'sameEmail', 'hasAttachment','attachmentTime','hasPR', 'created','codeElements','stackTraces','summaryHints','descHints','commentsCount']]
|
||||
stats['project'] =stats.bugReport.apply(lambda x:x.split('-')[0])
|
||||
stats['project'] = stats.project.apply(lambda x: subjects.query("Subject == '{0}'".format(x)).Group.tolist()[0]+'-'+x)
|
||||
stats['created'] = stats['created'].apply(lambda x: pd.to_datetime(x))
|
||||
stats['attachmentTime'] = stats['attachmentTime'].apply(lambda x: pd.to_datetime(x))
|
||||
stats['isAttach'] = (stats['attachmentTime'] - stats['created']) < pd.Timedelta(1, unit='h')
|
||||
stats['hints'] = stats.apply(lambda x: True if len(x['summaryHints']) > 0 or len(x['descHints']) > 0 else False,
|
||||
axis=1)
|
||||
|
||||
stats['commentsCount'].to_csv(subject+'Comments', index=False)
|
||||
|
||||
a = stats.groupby('project').agg({'sameEmail': "sum", 'isAttach': "sum"})
|
||||
a['avgComments'] =stats['commentsCount'].mean()
|
||||
a['hasStackTraces'] = len([i for i in stats['stackTraces'].str.len().tolist() if i != 0])
|
||||
a['hasHints']= len([i for i in stats['hints'].tolist() if i == True])
|
||||
a['hasCE']= len([i for i in stats['codeElements'].str.len().tolist() if i != 0])
|
||||
|
||||
a['no'] = len(stats)
|
||||
|
||||
a.to_latex(join(DATA_PATH,subject+'datasetQuality.tex'))
|
||||
|
||||
def caseBRParser(subject):
|
||||
step1(subject)
|
||||
# step2()
|
||||
# step3()
|
||||
Reference in New Issue
Block a user