diff --git a/python/commitCollector.py b/python/commitCollector.py index 9e4338c..92c7d8f 100644 --- a/python/commitCollector.py +++ b/python/commitCollector.py @@ -31,57 +31,10 @@ def getCommitFromRepo(f: PathLike, gitrepo: str, branch: str): shellCallTemplate(f"git -C {f} log --no-merges --pretty=format:'{form}' > {file}", enc='latin1') # Collect commits - commits = json.loads(f'[{Path(file).read_text()}]') + content = Path(file).read_text().replace("\n", ",") + commits = json.loads(f'[{content}]') # Convert to DataFrame ds = pd.DataFrame.from_dict(commits) ds['commitDate'] = pd.to_datetime(ds['commitDate']) return ds - - -def caseCollect(subject): - if not os.path.exists(COMMIT_FOLDER): - os.mkdir(COMMIT_FOLDER) - if not os.path.exists(COMMIT_DFS): - os.mkdir(COMMIT_DFS) - - subjects = pd.read_csv(join(DATA_PATH, 'subjects.csv')) - if subject == 'ALL': - tuples = subjects[['Repo', 'Branch']].values.tolist() - else: - # repos = subjects.query("Subject == '{0}'".format(subject)).Repo.tolist() - tuples = subjects.query("Subject == '{0}'".format(subject))[ - ['Repo', 'Branch']].values.tolist() - - for t in tuples: - repo, branch = t - logging.info(repo) - getCommitFromRepo(join(REPO_PATH, repo), join(COMMIT_FOLDER, repo), branch) - - if subject == 'ALL': - commits = listdir(COMMIT_FOLDER) - else: - commits = [i for i in listdir(COMMIT_FOLDER) if i.startswith(repo)] - - for commit in commits: - logging.info(commit) - rDF = makeDF(join(COMMIT_FOLDER, commit)) - repoName = commit.split('.')[0] - save_zipped_pickle(rDF, join(COMMIT_DFS, repoName + ".pickle")) - # p.dump(rDF, open(join(COMMIT_DFS, repoName + ".pickle"), "wb")) - - -def caseClone(subject): - if not os.path.exists(REPO_PATH): - os.mkdir(REPO_PATH) - - subjects = pd.read_csv(join(DATA_PATH, 'subjects.csv')) - if subject == 'ALL': - gitrepos = subjects.GitRepo.tolist() - else: - gitrepos = subjects.query("Subject == '{0}'".format(subject)).GitRepo.tolist() - os.getcwd() - os.chdir(REPO_PATH) - for gitrepo in gitrepos: - cmd = 'git clone ' + gitrepo - out = shellCallTemplate(cmd) diff --git a/python/javaDS.py b/python/javaDS.py index 4d034c1..23a0d54 100644 --- a/python/javaDS.py +++ b/python/javaDS.py @@ -89,71 +89,3 @@ def createDS(project_list: str = PROJECT_LIST): print(len(commits)) # for s in a.commit.values.tolist(): parallelRun(prepareFiles, commits[['commit', 'files']].values.tolist(), repo) - - # # if job == 'clone': - # for repo,src in subjects[['Repo','GitRepo']].values.tolist(): - # if(pjList != ['ALL']): - # if repo in pjList: - # print(repo) - # cmd = 'git -C ' + DATASET_PATH + ' clone ' + src - # shellCallTemplate(cmd) - # logging.info(repo) - - # caseClone(subject) - - # caseCollect(subject) - # # elif job == 'fix': - # from filterBugFixingCommits import caseFix - # - # caseFix(subject) - # # - # # # elif job =='brDownload': - # from bugReportDownloader import caseBRDownload - # - # caseBRDownload(subject) - # # # elif job =='brParser': - # from bugReportParser import step1 - # - # step1(subject) - # - # # elif job =='dataset': - # - # if not isfile(join(DATA_PATH, 'singleBR.pickle')): - # - # brs = load_zipped_pickle(join(DATA_PATH, subject + "bugReportsComplete.pickle")) - # - # subjects = pd.read_csv(join(DATA_PATH, 'subjects.csv')) - # - # - # def getCommit(x): - # bid, project = x - # - # subjects = pd.read_csv(join(DATA_PATH, 'subjects.csv')) - # repo = subjects.query("Subject == '{0}'".format(project)).Repo.tolist()[0] - # commits = load_zipped_pickle(join(DATA_PATH, COMMIT_DFS, repo + '.pickle')) - # correspondingCommit = commits.query("fix =='{0}'".format(bid)).commit.tolist() - # if len(correspondingCommit) == 1: - # return [bid, correspondingCommit[0], project] - # else: - # return None - # print('error') - # - # - # wl = brs[['bid', 'project']].values.tolist() - # dataL = parallelRunMerge(getCommit, wl) - # - # commits = pd.DataFrame( - # columns=['bid', 'commit', 'project'], - # data=list(filter(None.__ne__, dataL))) - # - # save_zipped_pickle(commits, join(DATA_PATH, 'singleBR.pickle')) - # else: - # commits = load_zipped_pickle(join(DATA_PATH, 'singleBR.pickle')) - # subjects = pd.read_csv(join(DATA_PATH, 'subjects.csv')) - # logging.info('done matching commits') - # commits['repo'] = commits.project.apply(lambda x: subjects.query("Subject == '{0}'".format(x)).Repo.tolist()[0]) - # - # workList = commits[['commit', 'repo']].values.tolist() - # from dataset import prepareFiles - # - # parallelRun(prepareFiles, workList)