diff --git a/python/common/commons.py b/python/common/commons.py index 9cf3c23..4e8a72d 100644 --- a/python/common/commons.py +++ b/python/common/commons.py @@ -148,8 +148,9 @@ def shellCallTemplate(cmd, enc='utf-8'): logging.info(cmd) with Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True, encoding=enc) as p: out, err = p.communicate() + ret = p.returncode # print(output) - if err: + if ret != 0 and err: m = re.search('unknown revision or path not in the working tree', err) if not m: raise CalledProcessError(err, '-1') diff --git a/python/dataset4j.py b/python/dataset4j.py index 5e0168f..334cc01 100644 --- a/python/dataset4j.py +++ b/python/dataset4j.py @@ -44,21 +44,18 @@ def load_commits(repo: str, git_url: str, branch: str) -> DataFrame: return commits -def filter_commits(commits: DataFrame, end_date: date) -> DataFrame: - return commits[commits.commitDate < end_date] - - -def create_dataset(project_list: str = PROJECT_LIST): +def create_dataset(cfg: dict, project_list: str = PROJECT_LIST): """ Create dataset + :param cfg: config.yml dictionary :param project_list: Comma-separated list of git project names (projects must exist in dataset.csv) :return: """ pj_list: list[str] = project_list.split(',') # Ensure directories exist - DATASET_PATH.mkdir(exist_ok=True) + DATASET_PATH.mkdir(exist_ok=True, parents=True) if not os.path.exists(COMMIT_DFS): os.mkdir(COMMIT_DFS) @@ -85,10 +82,23 @@ def create_dataset(project_list: str = PROJECT_LIST): # commits['cocci'] = commits.log.apply(lambda x: True if re.search('cocci|coccinelle', x) else False) # coccis = commits[commits.cocci].commit.values.tolist() fixes = commits[commits.fixes.str.len() != 0].commit.values.tolist() - # links = commits[commits.links.str.len()!=0].commit.values.tolist() - # bugs = set(fixes).union(links).union(coccis) - # bugs = set(fixes)#.union(coccis) + # Filter end dates if configured + if 'limitCommitsBeforeDays' in cfg['fixminer']: + value = eval(str(cfg['fixminer']['limitCommitsBeforeDays'])) + latest_commit = commits.commitDate.iloc[0] + + if isinstance(value, datetime.timedelta): + end_date = latest_commit - value + elif isinstance(value, float) or isinstance(value, int): + end_date = latest_commit - datetime.timedelta(days=value) + else: + raise NotImplementedError(f'Unknown limitCommitsBeforeDays type: {type(value)}. ' + f'Only timedelta and int/float (days) are supported.') + + print(f'> Has {len(commits)} comments before filtering for date < {end_date}') + commits = commits[commits.commitDate < end_date] + commits = commits[commits.commit.isin(fixes)] print(f'> Has {len(commits)} comments after filtering') diff --git a/python/main.py b/python/main.py index 699ae55..2b8a7d0 100644 --- a/python/main.py +++ b/python/main.py @@ -35,7 +35,7 @@ def job_dataset4c(): def job_richedit(): dbDir = join(DATA_PATH, 'redis') stopDB(dbDir, REDIS_PORT) - cmd = f"JAVA_HOME='{jdk8}' java -jar {JAR_PATH} {args.prop} RICHEDITSCRIPT " + cmd = f"JAVA_HOME='{jdk8}' java -jar '{JAR_PATH}' {args.prop} RICHEDITSCRIPT " output = shellCallTemplate(cmd) logging.info(output) @@ -53,7 +53,7 @@ def job_compare(): # -Dexec.mainClass='edu.lu.uni.serval.richedit.akka.compare.CompareTrees' # -Dexec.args='"+ " shape " + join(DATA_PATH,"redis") +" ALLdumps-gumInput.rdb " + # "clusterl0-gumInputALL.rdb /data/richedit-core/python/data/richEditScript'" - cmd = f"JAVA_HOME='{jdk8}' java -jar {JAR_PATH} {args.prop} COMPARE " + cmd = f"JAVA_HOME='{jdk8}' java -jar '{JAR_PATH}' {args.prop} COMPARE " output = shellCallTemplate4jar(cmd) logging.info(output) diff --git a/python/settings.py b/python/settings.py index 36523c2..da454f8 100644 --- a/python/settings.py +++ b/python/settings.py @@ -1,4 +1,5 @@ import os +from dataclasses import dataclass from pathlib import Path ROOT_DIR = os.environ["ROOT_DIR"]