78 lines
2.5 KiB
Python
78 lines
2.5 KiB
Python
|
|
|
|
from common.commons import *
|
|
|
|
# Working locations are taken from the process environment. All four
# variables are mandatory: a missing one raises KeyError while this module
# is being imported.
REPO_PATH, DATA_PATH, COMMIT_DFS, COMMIT_FOLDER = (
    os.environ[_variable]
    for _variable in ("REPO_PATH", "DATA_PATH", "COMMIT_DFS", "COMMIT_FOLDER")
)
|
|
|
|
def getCommitFromRepo(f,gitrepo,branch):
    """Check out ``branch`` in the working copy ``f`` and dump its commit log.

    After the checkout is confirmed, the log of all non-merge commits is
    written to the file ``gitrepo + '.commits'``, one dict literal per line
    with the keys ``commit``, ``commitDate``, ``title`` and ``committer``.

    Args:
        f: Path of the local git working copy (handed to ``git -C``).
        gitrepo: Output path prefix; ``.commits`` is appended to it.
        branch: Name of the branch to force-checkout before reading the log.
    """
    checkout_cmd = 'git -C ' + f + ' checkout -f ' + branch
    # The branch name is data, not a pattern: escape it so characters such
    # as '.' or '+' in a branch name are matched literally.
    branch_pattern = re.escape(branch)

    # The checkout is treated as confirmed once the second value returned by
    # shellGitCheckout (presumably git's stderr text) mentions the branch.
    _, err = shellGitCheckout(checkout_cmd)
    while not re.search(branch_pattern, err):
        time.sleep(10)
        logging.info('Waiting for checkout')
        # Retry and re-evaluate. Without refreshing ``err`` the loop
        # condition could never change and the function would hang forever.
        _, err = shellGitCheckout(checkout_cmd)

    log_cmd = 'git -C ' + f + " log --no-merges --pretty=format:'{\"commit\":\"%H\",\"commitDate\":\"%ci\",\"title\":\"%f\",\"committer\":\"%ce\"}' > " + gitrepo + '.commits'
    shellCallTemplate(log_cmd, enc='latin1')
|
|
|
|
|
|
def makeDF(filename):
    """Load a ``.commits`` dump into a DataFrame.

    Each non-blank line of ``filename`` must be a Python dict literal; the
    resulting frame has one row per line. The ``commitDate`` column is
    converted element by element with ``pd.to_datetime``.

    Args:
        filename: Path of the file to read (decoded as latin1).

    Returns:
        pandas.DataFrame with one row per parsed line.

    Raises:
        ValueError, SyntaxError: if a line is not a plain Python literal.
        KeyError: if the parsed records have no ``commitDate`` field
            (including the case of a file without any record).
    """
    # Function-scope import: nothing in the visible file guarantees that
    # ``ast`` is already in scope.
    import ast

    with open(filename, encoding='latin1') as handle:
        # SECURITY: the lines come from the history of cloned repositories,
        # i.e. external data. ast.literal_eval only accepts literals, so a
        # crafted line cannot execute code the way eval() would.
        records = [
            ast.literal_eval(line.strip())
            for line in handle
            if line.strip()  # tolerate blank / trailing empty lines
        ]

    ds = pd.DataFrame.from_dict(records)
    # Converted per element so that rows with different UTC offsets each keep
    # their own offset.
    ds['commitDate'] = ds['commitDate'].map(pd.to_datetime)
    return ds
|
|
|
|
|
|
def caseCollect(subject):
    """Dump the commit logs of the selected subjects and pickle them.

    Reads ``subjects.csv`` from ``DATA_PATH``, runs ``getCommitFromRepo`` for
    every selected (Repo, Branch) pair so that a ``<Repo>.commits`` file is
    written into ``COMMIT_FOLDER``, then turns the matching files into
    DataFrames with ``makeDF`` and stores each one in ``COMMIT_DFS`` as
    ``<name>.pickle``.

    Args:
        subject: Value of the ``Subject`` column to process, or ``'ALL'`` to
            process every row (and every file found in ``COMMIT_FOLDER``).
    """
    # makedirs(exist_ok=True) creates missing parents and cannot fail when
    # the directory appears between an existence check and the creation.
    os.makedirs(COMMIT_FOLDER, exist_ok=True)
    os.makedirs(COMMIT_DFS, exist_ok=True)

    subjects = pd.read_csv(join(DATA_PATH, 'subjects.csv'))
    if subject != 'ALL':
        # Boolean mask instead of a formatted query string, so a quote inside
        # ``subject`` cannot break or alter the expression.
        subjects = subjects[subjects['Subject'] == subject]
    tuples = subjects[['Repo', 'Branch']].values.tolist()

    for repo, branch in tuples:
        logging.info(repo)
        getCommitFromRepo(join(REPO_PATH, repo), join(COMMIT_FOLDER, repo), branch)

    if subject == 'ALL':
        commits = listdir(COMMIT_FOLDER)
    else:
        # Match the files of every selected repo, not only the last one the
        # loop above visited. An empty selection simply matches nothing
        # instead of failing on an undefined loop variable.
        prefixes = tuple(repo for repo, _ in tuples)
        commits = [name for name in listdir(COMMIT_FOLDER) if name.startswith(prefixes)]

    for commit in commits:
        logging.info(commit)
        rDF = makeDF(join(COMMIT_FOLDER, commit))
        # Strip only the final extension so repo names that contain dots are
        # kept whole (and cannot collide with each other).
        repoName = os.path.splitext(commit)[0]
        save_zipped_pickle(rDF, join(COMMIT_DFS, repoName + ".pickle"))
|
|
|
|
def caseClone(subject):
    """Clone the git repositories of the selected subjects into ``REPO_PATH``.

    Reads ``subjects.csv`` from ``DATA_PATH`` and runs ``git clone <GitRepo>``
    for every selected row, with ``REPO_PATH`` as the working directory. The
    caller's working directory is restored before returning.

    Args:
        subject: Value of the ``Subject`` column to clone, or ``'ALL'`` to
            clone every repository listed in the file.
    """
    # makedirs(exist_ok=True) creates missing parents and cannot fail when
    # the directory appears between an existence check and the creation.
    os.makedirs(REPO_PATH, exist_ok=True)

    subjects = pd.read_csv(join(DATA_PATH, 'subjects.csv'))
    if subject != 'ALL':
        # Boolean mask instead of a formatted query string, so a quote inside
        # ``subject`` cannot break or alter the expression.
        subjects = subjects[subjects['Subject'] == subject]
    gitrepos = subjects['GitRepo'].tolist()

    # Remember where we were: leaving the process inside REPO_PATH would break
    # every relative path resolved afterwards (including a relative REPO_PATH
    # on a second call).
    previous_cwd = os.getcwd()
    os.chdir(REPO_PATH)
    try:
        for gitrepo in gitrepos:
            shellCallTemplate('git clone ' + gitrepo)
    finally:
        os.chdir(previous_cwd)
|
|
|