from common.commons import *

import bs4 as bs
import re

# CODE_PATH = os.environ["CODE_PATH"]
DATA_PATH = os.environ["DATA_PATH"]
BUG_REPORT_PATH = os.environ["BUG_REPORT"]
COMMIT_DFS = os.environ["COMMIT_DFS"]

# Column layout shared by the rows parseCore() returns and the DataFrame that
# step1() builds from them.
BUG_REPORT_COLUMNS = [
    'bugReport', 'summary', 'description', 'created', 'updated', 'resolved',
    'reporterDN', 'reporterEmail', 'hasAttachment', 'attachmentTime', 'hasPR',
    'commentsCount',
]


def _parseReporter(reporterField):
    """Return ``(displayName, emailAddress)`` for the reporter element.

    Falls back to the element's stripped text and ``None`` for the e-mail when
    the ``data-user`` attribute is missing or cannot be interpreted.
    """
    try:
        reporterInfo = reporterField.findAll('span', {'class': 'user-hover'})[0]['data-user']
        # NOTE(security): eval() executes whatever expression the stored page
        # contains. Kept as-is to preserve results; review whether json.loads
        # or ast.literal_eval can replace it for the pages being parsed.
        reporterDict = eval(reporterInfo)
        return reporterDict['displayName'], reporterDict['emailAddress']
    except Exception:
        # jboss reporter is different: no usable data-user payload.
        return reporterField.text.strip(), None


def _hasPullRequest(soup):
    """Tell whether the page links a pull request."""
    prLabel = soup.find('strong', {'title': 'Git Pull Request'})
    # NOTE(review): the fallback below is taken only when the label is absent;
    # confirm this matches the intended nesting.
    if prLabel is not None:
        return len(prLabel.parent.findAll('a')) > 0

    details = soup.find('ul', {'id': 'issuedetails'})
    if details is None:
        return False
    prLink = details.find('a', {'title': 'PullRequest'})
    # len() of a tag counts its children, so an empty link does not count.
    return prLink is not None and len(prLink) > 0


def _parseIssueDetailPage(br, soup):
    """Parse a page that uses ``type-val`` / ``status-val`` element ids.

    Only reports typed ``Bug`` or ``Defect`` whose status is ``Resolved`` or
    ``Closed`` produce a row; everything else yields ``None``.
    """
    issueType = soup.find('span', {'id': 'type-val'})
    status = soup.find('span', {'id': 'status-val'})
    if issueType is None or status is None:
        return None
    if issueType.text.strip() not in ('Bug', 'Defect'):
        return None
    if status.text.strip() not in ('Resolved', 'Closed'):
        return None

    summary = soup.find('h1', {'id': 'summary-val'}).text
    desc = soup.find('div', {'id': 'description-val'})
    created = soup.find('span', {'data-name': 'Created'}).time['datetime']
    updated = soup.find('span', {'data-name': 'Updated'}).time['datetime']
    resolved = soup.find('span', {'data-name': 'Resolved'}).time['datetime']

    reporterDN, reporterEmail = _parseReporter(soup.find('span', {'id': 'reporter-val'}))

    attachment = soup.find('dd', {'class': 'attachment-date'})
    isAttachment = False
    attachmentTime = None
    if attachment is not None and len(attachment) > 0:
        attachmentTime = attachment.parent.time['datetime']
        isAttachment = True

    isPR = _hasPullRequest(soup)
    commentsCount = len(re.findall(r'\"comment-[0-9]+', soup.getText()))
    description = desc.text if desc is not None else None

    return [br, summary, description, created, updated, resolved, reporterDN,
            reporterEmail, isAttachment, attachmentTime, isPR, commentsCount]


def _parseShowBugPage(br, soup):
    """Parse a ``show*`` page (presumably Bugzilla, judging by the ``bz_`` ids).

    Enhancements and reports whose status contains neither ``FIXED`` nor
    ``DUPLICATE`` yield ``None``. Only the summary and description are
    extracted; the remaining columns are filled with ``None``.
    """
    importance = (soup.find('a', {'href': 'page.cgi?id=fields.html#importance'})
                  .parent.parent.parent.td.text.strip())
    status = soup.find('span', {'id': 'static_bug_status'}).text.strip()
    if re.search('enhancement', importance, re.IGNORECASE):
        return None
    if not re.search('FIXED|DUPLICATE', status):
        return None

    summary = soup.find('span', {'id': 'short_desc_nonedit_display'}).text
    comment0 = soup.find('div', {'id': 'c0'})
    if comment0.find('a').text != 'Description':
        return None
    desc = comment0.find('pre', {'class': 'bz_comment_text'}).text

    # Pad to the shared column layout. The previous code wrote a 3-item list
    # into a 12-column DataFrame row, which raises, and never returned it.
    row = [br, summary, desc]
    return row + [None] * (len(BUG_REPORT_COLUMNS) - len(row))


def parseCore(br):
    """Parse one stored bug-report page into a flat row.

    Args:
        br: File name of the report inside ``BUG_REPORT_PATH``. Names starting
            with ``show`` take the ``show*`` page parser, all others the
            issue-detail parser.

    Returns:
        A list with one value per entry of ``BUG_REPORT_COLUMNS``, or ``None``
        when the file does not exist or the report is filtered out.
    """
    reportFile = join(BUG_REPORT_PATH, br)
    if not isfile(reportFile):
        return None

    with open(reportFile, 'rb') as f:
        # NOTE(security): `p` comes from common.commons (presumably pickle);
        # only load report files from a trusted location.
        the_page = p.load(f)
    soup = bs.BeautifulSoup(the_page, "html.parser")

    if br.startswith('show'):
        return _parseShowBugPage(br, soup)
    return _parseIssueDetailPage(br, soup)


def parallelRun(coreFun, elements, *args):
    """Run ``coreFun(element, *args)`` for every element in a process pool.

    Args:
        coreFun: Picklable callable returning an object ``pd.concat`` accepts.
        elements: Iterable of first arguments, one task per element.
        *args: Extra positional arguments forwarded to every call.

    Returns:
        The concatenation of all results, or an empty DataFrame when there
        were no elements.

    Raises:
        Exception: The first task failure is logged and re-raised.
    """
    dataL = []
    with concurrent.futures.ProcessPoolExecutor() as executor:
        futures = {executor.submit(coreFun, element, *args): element
                   for element in elements}
        kwargs = {
            'total': len(futures),
            'unit': 'files',
            'unit_scale': True,
            'leave': False,
        }
        # Report progress while the results are collected, not in a second
        # pass after every task has already finished.
        for future in tqdm(concurrent.futures.as_completed(futures), **kwargs):
            element = futures[future]
            try:
                dataL.append(future.result())
            except Exception as exc:
                logging.error('%r generated an exception: %s', element, exc)
                raise

    if not dataL:
        # pd.concat refuses an empty list.
        return pd.DataFrame()
    return pd.concat(dataL)


def getCommitter(x):
    """Return the ``committer`` values of the commits whose ``fix`` equals ``x``.

    Args:
        x: Bug id of the form ``<subject>-<rest>``; the part before the first
            ``-`` selects the row of ``subjects.csv`` that names the repo.

    Returns:
        List of committer values taken from the repo's commit pickle.

    Raises:
        IndexError: If ``subjects.csv`` has no row for the subject.
    """
    subject = x.split('-')[0]
    subjects = pd.read_csv(join(DATA_PATH, 'subjects.csv'))
    repo = subjects.query("Subject == '{0}'".format(subject)).Repo.tolist()[0]
    commits = load_zipped_pickle(join(COMMIT_DFS, repo + '.pickle'))
    return commits.query("fix =='{0}'".format(x)).committer.tolist()


def step1(subject):
    """Parse the stored bug reports and pickle them as one DataFrame.

    Args:
        subject: Prefix selecting report files in ``BUG_REPORT_PATH``, or
            ``'ALL'`` for every file. Hidden files (leading ``.``) are skipped.

    The result is saved to ``<DATA_PATH>/<subject>bugReportsComplete.pickle``
    with ``project``, ``bid``, ``committerEmail`` and ``sameEmail`` columns
    added to the parsed ones.
    """
    if subject == 'ALL':
        bids = os.listdir(BUG_REPORT_PATH)
    else:
        bids = [f for f in os.listdir(BUG_REPORT_PATH) if f.startswith(subject)]
    bids = [i for i in bids if not i.startswith('.')]

    dataL = parallelRunMerge(parseCore, bids)
    logging.info('Finish parsing')

    # Drop the reports parseCore filtered out. An explicit identity test
    # avoids relying on the truth value of NotImplemented, which is what
    # filter(None.__ne__, ...) did.
    rows = [row for row in dataL if row is not None]
    br = pd.DataFrame(columns=BUG_REPORT_COLUMNS, data=rows)

    br['project'] = br.bugReport.apply(lambda x: x.split('-')[0])
    br['bid'] = br.bugReport.apply(lambda x: x.split('.')[0])
    br['committerEmail'] = br.bid.apply(getCommitter)
    br['sameEmail'] = br.apply(
        lambda x: x['reporterEmail'] in x['committerEmail'], axis=1)

    save_zipped_pickle(br, join(DATA_PATH, subject + "bugReportsComplete.pickle"))


def step3(subject):
    """Write dataset-quality statistics for the pickled bug reports.

    Args:
        subject: Same value that was used to create the pickle being read.

    Reads ``<DATA_PATH>/<subject>bugReportsComplete.pickle``, which must
    already contain the ``codeElements``, ``stackTraces``, ``summaryHints``
    and ``descHints`` columns (step1 does not create them). Writes the
    comment counts to ``<subject>Comments`` in the working directory and a
    LaTeX table to ``<DATA_PATH>/<subject>datasetQuality.tex``.
    """
    subjects = pd.read_csv(join(DATA_PATH, 'subjects.csv'))
    subjects['Group'] = subjects.Group.apply(
        lambda x: x.replace('Commons', 'Apache').replace('Wildfly', 'Jboss').upper())
    # Build the subject -> group lookup once; the first row wins on duplicates.
    firstRows = subjects.drop_duplicates('Subject')
    groupOf = dict(zip(firstRows['Subject'], firstRows['Group']))

    stats = load_zipped_pickle(join(DATA_PATH, subject + "bugReportsComplete.pickle"))
    # Copy so the assignments below do not act on a view of the loaded frame.
    stats = stats[['bugReport', 'sameEmail', 'hasAttachment', 'attachmentTime',
                   'hasPR', 'created', 'codeElements', 'stackTraces',
                   'summaryHints', 'descHints', 'commentsCount']].copy()

    stats['project'] = stats.bugReport.apply(lambda x: x.split('-')[0])
    stats['project'] = stats.project.apply(lambda x: groupOf[x] + '-' + x)
    stats['created'] = stats['created'].apply(pd.to_datetime)
    stats['attachmentTime'] = stats['attachmentTime'].apply(pd.to_datetime)
    # An attachment counts only when it was added within an hour of creation.
    stats['isAttach'] = (stats['attachmentTime'] - stats['created']) < pd.Timedelta(1, unit='h')
    stats['hints'] = stats.apply(
        lambda x: len(x['summaryHints']) > 0 or len(x['descHints']) > 0, axis=1)

    stats['commentsCount'].to_csv(subject + 'Comments', index=False)

    a = stats.groupby('project').agg({'sameEmail': "sum", 'isAttach': "sum"})
    # NOTE(review): the values below are computed over the whole dataset and
    # repeated on every project row - confirm per-project values are not wanted.
    a['avgComments'] = stats['commentsCount'].mean()
    a['hasStackTraces'] = int((stats['stackTraces'].str.len() != 0).sum())
    a['hasHints'] = int((stats['hints'] == True).sum())
    a['hasCE'] = int((stats['codeElements'].str.len() != 0).sum())
    a['no'] = len(stats)
    a.to_latex(join(DATA_PATH, subject + 'datasetQuality.tex'))


def caseBRParser(subject):
    """Entry point for the bug-report parsing job; currently runs step1 only."""
    step1(subject)
    # step2()
    # step3()