[O] Naming conventions
This commit is contained in:
@@ -1,3 +1,5 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
from datetime import date
|
from datetime import date
|
||||||
|
|
||||||
from pandas import DataFrame
|
from pandas import DataFrame
|
||||||
@@ -46,13 +48,14 @@ def filter_commits(commits: DataFrame, end_date: date) -> DataFrame:
|
|||||||
return commits[commits.commitDate < end_date]
|
return commits[commits.commitDate < end_date]
|
||||||
|
|
||||||
|
|
||||||
def createDS(project_list: str = PROJECT_LIST):
|
def create_dataset(project_list: str = PROJECT_LIST):
|
||||||
"""
|
"""
|
||||||
|
Create dataset
|
||||||
|
|
||||||
:param project_list: Comma-separated list of git project names (projects must exist in dataset.csv)
|
:param project_list: Comma-separated list of git project names (projects must exist in dataset.csv)
|
||||||
:return:
|
:return:
|
||||||
"""
|
"""
|
||||||
pjList: list[str] = project_list.split(',')
|
pj_list: list[str] = project_list.split(',')
|
||||||
|
|
||||||
# Ensure directories exist
|
# Ensure directories exist
|
||||||
DATASET_PATH.mkdir(exist_ok=True)
|
DATASET_PATH.mkdir(exist_ok=True)
|
||||||
@@ -60,16 +63,17 @@ def createDS(project_list: str = PROJECT_LIST):
|
|||||||
os.mkdir(COMMIT_DFS)
|
os.mkdir(COMMIT_DFS)
|
||||||
|
|
||||||
# Find project repo urls in dataset.csv
|
# Find project repo urls in dataset.csv
|
||||||
subjects: DataFrame = pd.read_csv(join(ROOT_DIR, 'data', 'dataset.csv'))
|
dataset: DataFrame = pd.read_csv(join(ROOT_DIR, 'data', 'dataset.csv'))
|
||||||
if pjList == ['ALL']:
|
if pj_list == ['ALL']:
|
||||||
tuples = subjects[['Repo', 'GitRepo', 'Branch']].values.tolist()
|
repos = dataset[['Repo', 'GitRepo', 'Branch']].values.tolist()
|
||||||
else:
|
else:
|
||||||
tuples = subjects[subjects.Repo.isin(pjList)][['Repo', 'GitRepo', 'Branch']].values.tolist()
|
repos = dataset[dataset.Repo.isin(pj_list)][['Repo', 'GitRepo', 'Branch']].values.tolist()
|
||||||
|
|
||||||
# Loop through repos
|
# Loop through repos
|
||||||
for repo, src, branch in tuples:
|
for repo, src, branch in repos:
|
||||||
logging.info(f'Processing {repo}')
|
print(f'Processing {repo}')
|
||||||
commits = load_commits(repo, src, branch)
|
commits = load_commits(repo, src, branch)
|
||||||
|
print(f'> Obtained {len(commits)} commits.')
|
||||||
|
|
||||||
# keep only commits that have moves
|
# keep only commits that have moves
|
||||||
commits = commits[[any(c == 'M' for c in dic.values()) for dic in commits.files]]
|
commits = commits[[any(c == 'M' for c in dic.values()) for dic in commits.files]]
|
||||||
@@ -86,6 +90,6 @@ def createDS(project_list: str = PROJECT_LIST):
|
|||||||
# bugs = set(fixes).union(links).union(coccis)
|
# bugs = set(fixes).union(links).union(coccis)
|
||||||
# bugs = set(fixes)#.union(coccis)
|
# bugs = set(fixes)#.union(coccis)
|
||||||
commits = commits[commits.commit.isin(fixes)]
|
commits = commits[commits.commit.isin(fixes)]
|
||||||
print(len(commits))
|
print(f'> Has {len(commits)} comments after filtering')
|
||||||
# for s in a.commit.values.tolist():
|
|
||||||
parallelRun(prepareFiles, commits[['commit', 'files']].values.tolist(), repo)
|
parallelRun(prepareFiles, commits[['commit', 'files']].values.tolist(), repo)
|
||||||
+2
-2
@@ -18,9 +18,9 @@ if __name__ == '__main__':
|
|||||||
print(job)
|
print(job)
|
||||||
|
|
||||||
if job == 'dataset4j':
|
if job == 'dataset4j':
|
||||||
from javaDS import createDS
|
from dataset4j import create_dataset
|
||||||
|
|
||||||
createDS()
|
create_dataset()
|
||||||
|
|
||||||
elif job == 'dataset4c':
|
elif job == 'dataset4c':
|
||||||
from otherDatasets import core
|
from otherDatasets import core
|
||||||
|
|||||||
Reference in New Issue
Block a user