210 lines
9.5 KiB
Python
210 lines
9.5 KiB
Python
from common.commons import *
|
|
|
|
# CODE_PATH = os.environ["CODE_PATH"]
|
|
DATA_PATH = os.environ["DATA_PATH"]
|
|
BUG_REPORT_PATH = os.environ["BUG_REPORT"]
|
|
COMMIT_DFS = os.environ["COMMIT_DFS"]
|
|
|
|
import bs4 as bs
|
|
|
|
import re
|
|
def parseCore(br):
|
|
columns = ['bugReport', 'summary', 'description','created','updated','resolved','reporterDN','reporterEmail','hasAttachment','attachmentTime','hasPR','commentsCount']
|
|
bugReport = pd.DataFrame(columns=columns)
|
|
ind = 0
|
|
if isfile(join(BUG_REPORT_PATH, br)):
|
|
|
|
with open(join(BUG_REPORT_PATH, br), 'rb') as f:
|
|
the_page = p.load(f)
|
|
soup = bs.BeautifulSoup(the_page, "html.parser")
|
|
|
|
if not (br.startswith('show')):
|
|
|
|
type = soup.find('span', {'id': 'type-val'})
|
|
status = soup.find('span', {'id': 'status-val'})
|
|
|
|
if type is None:
|
|
# print(br)
|
|
return;
|
|
if (type.text.strip() == 'Bug' or type.text.strip() =='Defect'):
|
|
if (status.text.strip() == 'Resolved' or status.text.strip() == 'Closed'):
|
|
|
|
summary = soup.find('h1', {'id': 'summary-val'}).text
|
|
desc = soup.find('div', {'id': 'description-val'})
|
|
|
|
created= soup.find('span', {'data-name': 'Created'}).time['datetime']
|
|
updated = soup.find('span', {'data-name': 'Updated'}).time['datetime']
|
|
resolved = soup.find('span', {'data-name': 'Resolved'}).time['datetime']
|
|
|
|
# jboss reporter is different
|
|
reporterField = soup.find('span',{'id':'reporter-val'})
|
|
try:
|
|
reporterInfo= reporterField.findAll('span', {'class': 'user-hover'})[0]['data-user']
|
|
reporterDict = eval(reporterInfo)
|
|
reporterDN = reporterDict['displayName']
|
|
reporterEmail = reporterDict['emailAddress']
|
|
except KeyError as e:
|
|
|
|
e
|
|
reporterDN = reporterField.text.strip()
|
|
reporterEmail = None
|
|
except Exception as e:
|
|
e
|
|
reporterDN = reporterField.text.strip()
|
|
reporterEmail = None
|
|
|
|
attachment = soup.find('dd', {'class': 'attachment-date'})
|
|
isAttachment = False
|
|
isPR = False
|
|
attachmentTime = None
|
|
if attachment is not None and len(attachment) > 0:
|
|
attachmentTime = attachment.parent.time['datetime']
|
|
isAttachment = True
|
|
|
|
hasPullReq = soup.find('strong', {'title': 'Git Pull Request'})
|
|
if hasPullReq is not None:
|
|
haveAttachment = hasPullReq.parent.findAll('a')
|
|
if(haveAttachment is not None and len(haveAttachment)>0):
|
|
isPR = True
|
|
else:
|
|
linkPullRequest = soup.find('ul', {'id': 'issuedetails'}).find('a', {'title': 'PullRequest'})
|
|
if(linkPullRequest is not None and len(linkPullRequest)> 0):
|
|
isPR = True
|
|
|
|
commentsCount = len(re.findall(r'\"comment-[0-9]+', soup.getText()))
|
|
if desc is not None:
|
|
# bugReport.loc[ind] = [br, summary, desc.text,created,updated,resolved,reporterDN,reporterEmail,isAttachment,attachmentTime,isPR,commentsCount]
|
|
return [br, summary, desc.text, created, updated, resolved, reporterDN,
|
|
reporterEmail, isAttachment, attachmentTime, isPR, commentsCount]
|
|
# ind += 1
|
|
else:
|
|
# bugReport.loc[ind] = [br, summary, None, created, updated, resolved, reporterDN, reporterEmail,
|
|
# isAttachment,attachmentTime,isPR,commentsCount]
|
|
return [br, summary, None, created, updated, resolved, reporterDN, reporterEmail,
|
|
isAttachment,attachmentTime,isPR,commentsCount]
|
|
# ind += 1
|
|
# else:
|
|
# print(type.text.strip())
|
|
else:
|
|
importance = (
|
|
soup.find('a', {'href': 'page.cgi?id=fields.html#importance'}).parent.parent.parent.td.text.strip())
|
|
status = soup.find('span', {'id': 'static_bug_status'}).text.strip()
|
|
|
|
m1 = re.search('enhancement', importance, re.IGNORECASE)
|
|
# if not m:
|
|
# print(br)
|
|
if not m1:
|
|
m = re.search('FIXED|DUPLICATE', status)
|
|
if m:
|
|
summary = soup.find('span', {'id': 'short_desc_nonedit_display'}).text
|
|
comment0 = soup.find('div', {'id': 'c0'})
|
|
if comment0.find('a').text == 'Description':
|
|
desc = comment0.find('pre', {'class': 'bz_comment_text'}).text
|
|
bugReport.loc[ind] = [br, summary, desc]
|
|
ind += 1
|
|
# desc
|
|
# return bugReport
|
|
|
|
def parallelRun(coreFun,elements,*args):
|
|
with concurrent.futures.ProcessPoolExecutor() as executor:
|
|
try:
|
|
dataL = []
|
|
futures = {executor.submit(coreFun, l,*args): l for l in elements}
|
|
for future in concurrent.futures.as_completed(futures):
|
|
url = futures[future]
|
|
try:
|
|
data = future.result()
|
|
dataL.append(data)
|
|
|
|
except Exception as exc:
|
|
logging.error('%r generated an exception: %s' % (url, exc))
|
|
raise
|
|
|
|
kwargs = {
|
|
'total': len(futures),
|
|
'unit': 'files',
|
|
'unit_scale': True,
|
|
'leave': False
|
|
}
|
|
# Print out the progress as tasks complete
|
|
for f in tqdm(concurrent.futures.as_completed(futures), **kwargs):
|
|
pass
|
|
newData = pd.concat(dataL)
|
|
return newData
|
|
except Exception as e:
|
|
executor.shutdown()
|
|
raise
|
|
|
|
def getCommitter(x):
|
|
subject = x.split('-')[0]
|
|
subjects = pd.read_csv(join(DATA_PATH, 'subjects.csv'))
|
|
repo = subjects.query("Subject == '{0}'".format(subject)).Repo.tolist()[0]
|
|
commits = load_zipped_pickle(join(COMMIT_DFS, repo+'.pickle'))
|
|
commits
|
|
committer = commits.query("fix =='{0}'".format(x)).committer.tolist()
|
|
# print(len(committer))
|
|
return committer
|
|
|
|
|
|
|
|
def step1(subject):
|
|
|
|
if subject == 'ALL':
|
|
bids = os.listdir(BUG_REPORT_PATH)
|
|
else:
|
|
bids = [f for f in os.listdir(BUG_REPORT_PATH) if f.startswith(subject)]
|
|
bids = [i for i in bids if not i.startswith('.')]
|
|
|
|
# bids = bids[:100]
|
|
|
|
|
|
|
|
dataL = parallelRunMerge(parseCore,bids)
|
|
logging.info('Finish parsing')
|
|
# list(filter(None.__ne__, dataL))
|
|
br = pd.DataFrame(
|
|
columns=['bugReport', 'summary', 'description', 'created', 'updated', 'resolved', 'reporterDN', 'reporterEmail',
|
|
'hasAttachment', 'attachmentTime', 'hasPR', 'commentsCount'], data=list(filter(None.__ne__, dataL)))
|
|
# br = pd.concat(dataL)
|
|
logging.info('Finish parsing')
|
|
br['project'] = br.bugReport.apply(lambda x: x.split('-')[0])
|
|
br['bid'] = br.bugReport.apply(lambda x: x.split('.')[0])
|
|
br['committerEmail'] = br.bid.apply(lambda x: getCommitter(x))
|
|
|
|
br['sameEmail'] = br.apply(lambda x:x['reporterEmail'] in x['committerEmail'],axis=1)
|
|
subjects = pd.read_csv(join(DATA_PATH, 'subjects.csv'))
|
|
subjects['Group']=subjects.Group.apply(lambda x:x.replace('Commons','Apache').replace('Wildfly','Jboss').upper())
|
|
save_zipped_pickle(br,join(DATA_PATH,subject+"bugReportsComplete.pickle"))
|
|
|
|
|
|
def step3(subject):
|
|
subjects = pd.read_csv(join(DATA_PATH, 'subjects.csv'))
|
|
subjects['Group']=subjects.Group.apply(lambda x:x.replace('Commons','Apache').replace('Wildfly','Jboss').upper())
|
|
|
|
stats = load_zipped_pickle(join(DATA_PATH, subject + "bugReportsComplete.pickle"))
|
|
|
|
stats = stats[['bugReport', 'sameEmail', 'hasAttachment','attachmentTime','hasPR', 'created','codeElements','stackTraces','summaryHints','descHints','commentsCount']]
|
|
stats['project'] =stats.bugReport.apply(lambda x:x.split('-')[0])
|
|
stats['project'] = stats.project.apply(lambda x: subjects.query("Subject == '{0}'".format(x)).Group.tolist()[0]+'-'+x)
|
|
stats['created'] = stats['created'].apply(lambda x: pd.to_datetime(x))
|
|
stats['attachmentTime'] = stats['attachmentTime'].apply(lambda x: pd.to_datetime(x))
|
|
stats['isAttach'] = (stats['attachmentTime'] - stats['created']) < pd.Timedelta(1, unit='h')
|
|
stats['hints'] = stats.apply(lambda x: True if len(x['summaryHints']) > 0 or len(x['descHints']) > 0 else False,
|
|
axis=1)
|
|
|
|
stats['commentsCount'].to_csv(subject+'Comments', index=False)
|
|
|
|
a = stats.groupby('project').agg({'sameEmail': "sum", 'isAttach': "sum"})
|
|
a['avgComments'] =stats['commentsCount'].mean()
|
|
a['hasStackTraces'] = len([i for i in stats['stackTraces'].str.len().tolist() if i != 0])
|
|
a['hasHints']= len([i for i in stats['hints'].tolist() if i == True])
|
|
a['hasCE']= len([i for i in stats['codeElements'].str.len().tolist() if i != 0])
|
|
|
|
a['no'] = len(stats)
|
|
|
|
a.to_latex(join(DATA_PATH,subject+'datasetQuality.tex'))
|
|
|
|
def caseBRParser(subject):
|
|
step1(subject)
|
|
# step2()
|
|
# step3() |