[O] Reformat, optimize, add comments
This commit is contained in:
+27
-17
@@ -1,4 +1,6 @@
|
||||
import json
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from common.commons import *
|
||||
|
||||
@@ -7,46 +9,54 @@ DATA_PATH = os.environ["DATA_PATH"]
|
||||
COMMIT_DFS = os.environ["COMMIT_DFS"]
|
||||
COMMIT_FOLDER = os.environ["COMMIT_FOLDER"]
|
||||
|
||||
def getCommitFromRepo(f: "str | os.PathLike", gitrepo: str, branch: str) -> None:
    """
    Check out ``branch`` in a local clone and dump its commit log to disk.

    The log of non-merge commits is redirected to ``<gitrepo>.commits``, one
    JSON object per line with the keys ``commit``, ``commitDate``, ``title``
    and ``committer``.

    :param f: Git repo directory
    :param gitrepo: Output path prefix; ``.commits`` is appended to it
    :param branch: Branch name
    :return: None
    :raises RuntimeError: if the checkout could not be confirmed after all retries
    """
    import shlex  # local import: only this function assembles shell command lines

    max_attempts = 30  # upper bound so a broken repo cannot hang the run forever
    retry_delay = 10  # seconds between checkout attempts

    # Quote everything spliced into a command line so paths/branches containing
    # spaces or shell metacharacters stay a single argument.
    # NOTE(review): assumes both shell helpers hand the string to a shell (the
    # log command below already relies on quoting and `>` redirection) - confirm.
    repo_dir = shlex.quote(os.fspath(f))
    checkout_cmd = f'git -C {repo_dir} checkout -f {shlex.quote(branch)}'

    # The branch name is matched literally; unescaped, names such as
    # "release-1.x" or "feature+foo" would be interpreted as regex syntax.
    branch_pattern = re.escape(branch)

    for _attempt in range(max_attempts):
        # Re-run the checkout on every pass: the previous loop slept without
        # ever refreshing the result, so it could never terminate once entered.
        _, err = shellGitCheckout(checkout_cmd)
        # presumably `err` is git's stderr text, which names the branch once
        # the checkout has been processed - verify against shellGitCheckout
        if re.search(branch_pattern, err):
            break
        time.sleep(retry_delay)
        logging.info('Waiting for checkout')
    else:
        raise RuntimeError(
            f'Checkout of branch {branch!r} in {f!r} not confirmed '
            f'after {max_attempts} attempts'
        )

    # Create commits file
    out_file = f'{gitrepo}.commits'
    form = json.dumps({"commit": "%H", "commitDate": "%ci", "title": "%f", "committer": "%ce"})
    pretty = shlex.quote(f'--pretty=format:{form}')
    shellCallTemplate(
        f'git -C {repo_dir} log --no-merges {pretty} > {shlex.quote(out_file)}',
        enc='latin1',
    )
|
||||
|
||||
def makeDF(filename):
    """
    Load a ``.commits`` log file into a DataFrame.

    The file is expected to hold one JSON object per line. Blank lines are
    ignored. When a ``commitDate`` column is present, its values are converted
    to pandas timestamps.

    :param filename: Path of the commits file to read
    :return: DataFrame with one row per commit (empty if the file has no commits)
    :raises ValueError: if a non-blank line is not valid JSON
    """
    # Collect commits
    # Each line is parsed as data with json.loads; it is never evaluated as
    # Python code, so file content cannot execute anything.
    # latin1 maps every byte to a character, so decoding cannot fail.
    with open(filename, encoding='latin1') as handle:
        commits = [json.loads(line) for line in handle if line.strip()]

    # Convert to DataFrame
    ds = pd.DataFrame.from_dict(commits)

    # An empty log yields a frame without columns; skip the conversion then.
    if 'commitDate' in ds.columns:
        # Convert value by value: commit dates may carry different UTC offsets,
        # which a single vectorised to_datetime call does not handle uniformly
        # across pandas versions.
        ds['commitDate'] = ds['commitDate'].apply(pd.to_datetime)
    return ds
|
||||
|
||||
|
||||
def caseCollect(subject):
|
||||
|
||||
if not os.path.exists(COMMIT_FOLDER):
|
||||
os.mkdir(COMMIT_FOLDER)
|
||||
if not os.path.exists(COMMIT_DFS):
|
||||
os.mkdir(COMMIT_DFS)
|
||||
|
||||
|
||||
subjects = pd.read_csv(join(DATA_PATH, 'subjects.csv'))
|
||||
if subject == 'ALL':
|
||||
tuples = subjects[['Repo', 'Branch']].values.tolist()
|
||||
else:
|
||||
# repos = subjects.query("Subject == '{0}'".format(subject)).Repo.tolist()
|
||||
tuples = subjects.query("Subject == '{0}'".format(subject))[['Repo', 'Branch']].values.tolist()
|
||||
tuples = subjects.query("Subject == '{0}'".format(subject))[
|
||||
['Repo', 'Branch']].values.tolist()
|
||||
|
||||
for t in tuples:
|
||||
repo,branch = t
|
||||
repo, branch = t
|
||||
logging.info(repo)
|
||||
getCommitFromRepo(join(REPO_PATH, repo), join(COMMIT_FOLDER, repo),branch)
|
||||
getCommitFromRepo(join(REPO_PATH, repo), join(COMMIT_FOLDER, repo), branch)
|
||||
|
||||
if subject == 'ALL':
|
||||
commits = listdir(COMMIT_FOLDER)
|
||||
@@ -60,6 +70,7 @@ def caseCollect(subject):
|
||||
save_zipped_pickle(rDF, join(COMMIT_DFS, repoName + ".pickle"))
|
||||
# p.dump(rDF, open(join(COMMIT_DFS, repoName + ".pickle"), "wb"))
|
||||
|
||||
|
||||
def caseClone(subject):
|
||||
if not os.path.exists(REPO_PATH):
|
||||
os.mkdir(REPO_PATH)
|
||||
@@ -74,4 +85,3 @@ def caseClone(subject):
|
||||
for gitrepo in gitrepos:
|
||||
cmd = 'git clone ' + gitrepo
|
||||
out = shellCallTemplate(cmd)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user