Files
fixminer_source/python/commitCollector.py
T
2022-05-08 23:54:51 -04:00

88 lines
2.6 KiB
Python

import json
import pandas as pd
from common.commons import *
REPO_PATH = os.environ["REPO_PATH"]
DATA_PATH = os.environ["DATA_PATH"]
COMMIT_DFS = os.environ["COMMIT_DFS"]
COMMIT_FOLDER = os.environ["COMMIT_FOLDER"]
def getCommitFromRepo(f: PathLike, gitrepo: str, branch: str):
"""
:param f: Git repo directory
:param gitrepo: Repo name
:param branch: Branch name
:return: None
"""
file = f'{gitrepo}.commits'
output, err = shellGitCheckout(f'git -C {f} checkout -f {branch}')
m = re.search(branch, err)
while not m:
time.sleep(10)
logging.info('Waiting for checkout')
# Create commits file
form = json.dumps({"commit": "%H", "commitDate": "%ci", "title": "%f", "committer": "%ce"})
shellCallTemplate(f"git -C {f} log --no-merges --pretty=format:'{form}' > {file}", enc='latin1')
# Collect commits
commits = json.loads(f'[{Path(file).read_text()}]')
# Convert to DataFrame
ds = pd.DataFrame.from_dict(commits)
ds['commitDate'] = pd.to_datetime(ds['commitDate'])
return ds
def caseCollect(subject):
if not os.path.exists(COMMIT_FOLDER):
os.mkdir(COMMIT_FOLDER)
if not os.path.exists(COMMIT_DFS):
os.mkdir(COMMIT_DFS)
subjects = pd.read_csv(join(DATA_PATH, 'subjects.csv'))
if subject == 'ALL':
tuples = subjects[['Repo', 'Branch']].values.tolist()
else:
# repos = subjects.query("Subject == '{0}'".format(subject)).Repo.tolist()
tuples = subjects.query("Subject == '{0}'".format(subject))[
['Repo', 'Branch']].values.tolist()
for t in tuples:
repo, branch = t
logging.info(repo)
getCommitFromRepo(join(REPO_PATH, repo), join(COMMIT_FOLDER, repo), branch)
if subject == 'ALL':
commits = listdir(COMMIT_FOLDER)
else:
commits = [i for i in listdir(COMMIT_FOLDER) if i.startswith(repo)]
for commit in commits:
logging.info(commit)
rDF = makeDF(join(COMMIT_FOLDER, commit))
repoName = commit.split('.')[0]
save_zipped_pickle(rDF, join(COMMIT_DFS, repoName + ".pickle"))
# p.dump(rDF, open(join(COMMIT_DFS, repoName + ".pickle"), "wb"))
def caseClone(subject):
if not os.path.exists(REPO_PATH):
os.mkdir(REPO_PATH)
subjects = pd.read_csv(join(DATA_PATH, 'subjects.csv'))
if subject == 'ALL':
gitrepos = subjects.GitRepo.tolist()
else:
gitrepos = subjects.query("Subject == '{0}'".format(subject)).GitRepo.tolist()
os.getcwd()
os.chdir(REPO_PATH)
for gitrepo in gitrepos:
cmd = 'git clone ' + gitrepo
out = shellCallTemplate(cmd)