diff --git a/python/common/commons.py b/python/common/commons.py index 843008f..6b20bd7 100644 --- a/python/common/commons.py +++ b/python/common/commons.py @@ -1,6 +1,7 @@ import logging import sys import gzip +import traceback from typing import Union import numpy as np @@ -86,7 +87,7 @@ def setEnv(args): # cfg = yaml.load(ymlfile) with open(args.prop, 'r') as ymlfile: - cfg = yaml.load(ymlfile) + cfg = yaml.safe_load(ymlfile) # for section in cfg: # print(section) # print(cfg['mysql']) @@ -202,11 +203,11 @@ def shellCallTemplate(cmd, enc='utf-8'): m = re.search('unknown revision or path not in the working tree', errors) if not m: raise CalledProcessError(errors, '-1') - output except CalledProcessError as e: - logging.error(errors) + print(f'Error while executing {cmd}\n> {errors}') + traceback.print_exc() except Exception as e: - logging.error(e) + traceback.print_exc() return output @@ -423,7 +424,7 @@ def parallelRunNo(coreFun, elements, *args): raise except Exception as e: - logging.error(e) + traceback.print_exc() executor.shutdown() raise @@ -448,7 +449,7 @@ def parallelRun(coreFun, elements, *args, max_workers=os.cpu_count()): logging.error('%r generated an exception: %s' % (url, exc)) raise except Exception as e: - logging.error(e) + traceback.print_exc() executor.shutdown() raise diff --git a/python/javaDS.py b/python/javaDS.py index 3a6b52f..4d034c1 100644 --- a/python/javaDS.py +++ b/python/javaDS.py @@ -1,13 +1,15 @@ +from datetime import date + from pandas import DataFrame from common.commons import * from commitCollector import * from settings import * -from otherDatasets import markBugFixingPatches +from otherDatasets import markBugFixingPatches, prepareFiles -DATASET_PATH = REPO_PATH +DATASET_PATH = Path(REPO_PATH) DATASET = os.environ["dataset"] PROJECT_LIST = os.environ["PROJECT_LIST"] @@ -28,9 +30,10 @@ def load_commits(repo: str, git_url: str, branch: str) -> DataFrame: return pd.read_pickle(commits_pickle) # Clone new commits - shellCallTemplate('git config --global http.postBuffer 157286400') - shellCallTemplate(f'git -C {DATASET_PATH} clone {git_url}') - logging.info(f'Git repo cloned: {repo}') + if not (DATASET_PATH / repo).exists(): + shellCallTemplate('git config --global http.postBuffer 157286400') + shellCallTemplate(f'git -C {DATASET_PATH} clone {git_url}') + logging.info(f'Git repo cloned: {repo}') commits = getCommitFromRepo(join(REPO_PATH, repo), join(COMMIT_DFS, repo), branch) commits = markBugFixingPatches(commits, repo) @@ -39,6 +42,10 @@ def load_commits(repo: str, git_url: str, branch: str) -> DataFrame: return commits +def filter_commits(commits: DataFrame, end_date: date) -> DataFrame: + return commits[commits.commitDate < end_date] + + def createDS(project_list: str = PROJECT_LIST): """ @@ -48,8 +55,7 @@ def createDS(project_list: str = PROJECT_LIST): pjList: list[str] = project_list.split(',') # Ensure directories exist - if not os.path.exists(DATASET_PATH): - os.mkdir(DATASET_PATH) + DATASET_PATH.mkdir(exist_ok=True) if not os.path.exists(COMMIT_DFS): os.mkdir(COMMIT_DFS) @@ -65,9 +71,10 @@ def createDS(project_list: str = PROJECT_LIST): logging.info(f'Processing {repo}') commits = load_commits(repo, src, branch) - commits = commits[commits.files.apply(lambda x: np.any([i == 'M' for i in x.values()]))] - # keep only commits that are changing c files (.c) - commits = commits[commits.files.apply(lambda x: np.all([i.endswith('.java') for i in x.keys()]))] + # keep only commits that has moves + commits = commits[[any(c == 'M' for c in dic.values()) for dic in commits.files]] + # keep only commits that are changing java files (.java) + commits = commits[[all(k.endswith('.java') for k in dic) for dic in commits.files]] # not a revert commit # commits = commits[~commits.log.apply(lambda x: x.startswith('Revert'))] # commits = commits[commits.files.apply(lambda x: len(x) == 1)] @@ -81,7 +88,6 @@ def createDS(project_list: str = PROJECT_LIST): commits = commits[commits.commit.isin(fixes)] print(len(commits)) # for s in a.commit.values.tolist(): - from otherDatasets import prepareFiles parallelRun(prepareFiles, commits[['commit', 'files']].values.tolist(), repo) # # if job == 'clone':