203 lines
7.5 KiB
Python
203 lines
7.5 KiB
Python
import gzip
|
|
import os
|
|
import pickle as p
|
|
import re
|
|
import subprocess
|
|
from datetime import datetime, timezone
|
|
from operator import itemgetter
|
|
from pathlib import Path
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
from dateutil.relativedelta import relativedelta
|
|
from pandas import DataFrame
|
|
|
|
|
|
def get_root_nodes(date: str, projects):
    """Collect the distinct first lines of every pattern file of a snapshot.

    Reads each entry of ``../../{projects}/{date}/patterns`` (relative to the
    current working directory) and keeps only its first line.

    Args:
        date: Name of the snapshot directory (the caller passes a
            ``YYYY-MM-DD`` string).
        projects: Name of the dataset directory that contains the snapshots.

    Returns:
        Sorted list of the unique first lines. Lines are returned exactly as
        read, i.e. including their trailing newline when the file has one.

    Raises:
        OSError: If the patterns directory or one of its entries cannot be
            read.
    """
    patterns_dir = f"../../{projects}/{date}/patterns"
    roots = set()
    for patch in os.listdir(patterns_dir):
        with open(f"{patterns_dir}/{patch}", 'r') as f:
            # Only the first line of each pattern file is of interest.
            roots.add(f.readline())
    return sorted(roots)
|
|
|
|
|
|
def get_keys_list(dict):
    """Return the keys of a mapping as a list, in the mapping's own order.

    The parameter name shadows the builtin ``dict``; it is left unchanged so
    that existing callers keep working.
    """
    return [pair[0] for pair in dict.items()]
|
|
|
|
|
|
def get_all_commits_sha(repo, start: str, end: str) -> DataFrame:
    """Return the commits of ``repo`` dated strictly between ``start`` and ``end``.

    The commit table is loaded from ``{base_path}/{repo}-fix.pickle.gz``;
    ``base_path`` is a module-level global that must be set before calling.

    Args:
        repo: Repository name, used to build the pickle file name.
        start: Lower bound as ``YYYY-MM-DD``; interpreted as UTC midnight and
            excluded from the result.
        end: Upper bound as ``YYYY-MM-DD``; interpreted as UTC midnight and
            excluded from the result.

    Returns:
        The rows of the loaded table whose ``commitDate`` lies in the open
        interval ``(start, end)``.
    """
    start_date = datetime.strptime(start, '%Y-%m-%d').replace(tzinfo=timezone.utc)
    end_date = datetime.strptime(end, '%Y-%m-%d').replace(tzinfo=timezone.utc)

    commits = load_zipped_pickle(f'{base_path}/{repo}-fix.pickle.gz')
    commit_dates = commits['commitDate']

    # Explicit strict comparisons instead of between(..., inclusive=False):
    # the boolean form of ``inclusive`` is deprecated since pandas 1.3 and
    # rejected by pandas 2.x, while this mask behaves the same on all versions.
    in_window = (commit_dates > start_date) & (commit_dates < end_date)
    return commits[in_window]
|
|
|
|
|
|
def get_all_fix_commits_sha(df) -> DataFrame:
    """Keep only the rows whose ``fixes`` entry is non-empty."""

    def _has_fixes(entry):
        return len(entry) > 0

    selector = df['fixes'].apply(_has_fixes)
    return df[selector]
|
|
|
|
|
|
def load_zipped_pickle(filename):
    """Read a gzip-compressed pickle file and wrap its content in a DataFrame.

    NOTE: unpickling can execute arbitrary code, so ``filename`` must point to
    a trusted file.
    """
    with gzip.open(filename, 'rb') as stream:
        payload = p.load(stream)
    return pd.DataFrame(payload)
|
|
|
|
|
|
def get_files(commits_df):
    """Flatten the keys of every ``files`` mapping into one list.

    Each cell of the ``files`` column is expected to offer ``.items()``; its
    keys are appended in row order, duplicates included.
    """
    return [
        pair[0]
        for file_map in commits_df['files'].to_numpy()
        for pair in file_map.items()
    ]
|
|
|
|
|
|
def get_fix_files(commits_df):
    """Return the flattened ``files`` keys plus the key count of every row.

    Returns:
        A tuple ``(names, counts)``: ``names`` holds the keys of all ``files``
        mappings in row order (duplicates included) and ``counts[i]`` is the
        number of keys in row ``i``.
    """
    keys_per_row = [
        [pair[0] for pair in file_map.items()]
        for file_map in commits_df['files'].to_numpy()
    ]
    counts = [len(keys) for keys in keys_per_row]
    names = [key for keys in keys_per_row for key in keys]
    return names, counts
|
|
|
|
|
|
def get_change_loc(project, sha_1, sha_2,
                   repos_root="/workspace/EECS-Research/data-source/repos"):
    """Summarise ``git diff --stat`` between two commits of a repository.

    Args:
        project: Directory name of the repository below ``repos_root``.
        sha_1: First commit passed to ``git diff``.
        sha_2: Second commit passed to ``git diff``.
        repos_root: Directory that contains the repositories.

    Returns:
        ``[files, insertions, deletions, churn]`` where ``churn`` is
        insertions minus deletions. All four values are 0 when git prints
        nothing; a count that is absent from the summary line stays 0.

    Raises:
        subprocess.CalledProcessError: If ``git diff`` exits with an error.
    """
    # files, added, removed, churn
    result = [0, 0, 0, 0]

    # An argument list (no shell) keeps the SHAs from being interpreted by a
    # shell.
    output = subprocess.check_output(
        ['git', 'diff', '--stat', str(sha_1), str(sha_2)],
        cwd=f"{repos_root}/{project}",
    ).decode(errors='replace')

    lines = output.splitlines()
    if not lines:
        return result

    # The last line is the summary, e.g.
    # " 3 files changed, 10 insertions(+), 4 deletions(-)".
    summary = lines[-1]
    result[0] = _count_in_summary(r'(\d+) file', summary)
    # git uses the singular "1 insertion(+)" / "1 deletion(-)", so the
    # patterns must not require the plural "s".
    result[1] = _count_in_summary(r'(\d+) insertion', summary)
    result[2] = _count_in_summary(r'(\d+) deletion', summary)
    result[3] = result[1] - result[2]
    return result


def _count_in_summary(pattern, summary):
    """Return the integer captured by ``pattern`` in ``summary``, or 0 if absent."""
    found = re.search(pattern, summary)
    return int(found.group(1)) if found else 0
|
|
|
|
|
|
def _summary_stat(values, reducer):
    """Apply the ndarray method named ``reducer`` ('mean', 'max', 'min') to ``values``.

    Returns 0 for an empty sequence, because ``max``/``min`` raise ValueError
    on empty arrays and ``mean`` yields NaN.
    """
    arr = np.asarray(values)
    if arr.size == 0:
        return 0
    return getattr(arr, reducer)()


# Script entry point: walks the dataset in six-month windows starting at
# ``date``, aggregates commit metrics over all repositories for each window
# and writes one feature row per window to ``{name}-features.csv``.
if __name__ == '__main__':
    datasets = 'data-all.absolute'
    name = datasets
    date = '2001-06-01'

    # Must stay a module-level global: get_all_commits_sha() reads it to
    # locate the per-repository pickles.
    # NOTE(review): this always points at the first snapshot directory, even
    # for later windows - confirm that is intended.
    base_path = Path(f'/workspace/EECS-Research/{datasets}/{date}/commitsDF/')

    start_date = datetime.strptime(date, '%Y-%m-%d')
    interval = relativedelta(months=6)
    repos = [str(f).replace('-fix.pickle.gz', '')
             for f in os.listdir(base_path) if f.endswith('-fix.pickle.gz')]

    features = []
    while True:
        end_date = start_date + interval
        start_string = start_date.strftime('%Y-%m-%d')
        end_string = end_date.strftime('%Y-%m-%d')

        # Stop at the first window whose end snapshot does not exist.
        if not os.path.isdir(Path(f'/workspace/EECS-Research/{datasets}/{end_string}/commitsDF/')):
            break

        total_commits = 0
        total_commits_fix = 0
        total_files = []
        total_fix_files = []
        total_files_each_fix = []
        total_author = []
        total_add_loc = []
        total_del_loc = []
        total_churn = []

        for repo in repos:
            commits_timeframe = get_all_commits_sha(repo, start_string, end_string)
            if len(commits_timeframe) == 0:
                # Nothing to aggregate (and no fix commits) for this repo.
                continue

            commits_fix = get_all_fix_commits_sha(commits_timeframe)
            total_commits_fix += len(commits_fix)

            # Line statistics between each pair of consecutive commits, in the
            # order they appear in the table.
            commits_sha = commits_timeframe['commit'].to_numpy()
            for previous_sha, current_sha in zip(commits_sha, commits_sha[1:]):
                locs = get_change_loc(repo, previous_sha, current_sha)
                total_add_loc.append(locs[1])
                total_del_loc.append(locs[2])
                total_churn.append(locs[3])

            total_commits += len(commits_sha)
            total_author.extend(commits_timeframe['committer'].to_numpy())
            total_files += get_files(commits_timeframe)

            if len(commits_fix) != 0:
                fix_files, files_per_fix = get_fix_files(commits_fix)
                total_fix_files += fix_files
                total_files_each_fix += files_per_fix

        # Average number of times a changed file was touched in this window.
        TNREF = len(total_files) / len(set(total_files)) if total_files else 0

        # The label is True when the end snapshot has root nodes that the
        # start snapshot does not have.
        new = get_root_nodes(end_string, datasets)
        old = get_root_nodes(start_string, datasets)
        added = sorted(set(new) - set(old))

        NFIX = len(set(total_fix_files))
        NFEFC = (len(total_fix_files) / total_commits_fix) if total_commits_fix != 0 else 0

        feature = [
            end_string,
            total_commits,
            TNREF,
            NFIX,
            total_commits_fix,
            NFEFC,
            _summary_stat(total_files_each_fix, 'max'),
            len(set(total_author)),
            _summary_stat(total_add_loc, 'mean'),
            _summary_stat(total_add_loc, 'max'),
            _summary_stat(total_del_loc, 'mean'),
            _summary_stat(total_del_loc, 'max'),
            _summary_stat(total_churn, 'mean'),
            _summary_stat(total_churn, 'max'),
            _summary_stat(total_churn, 'min'),
            len(added) != 0,
        ]
        print(feature)
        features.append(feature)
        start_date += interval

    df = pd.DataFrame(features, columns=[
        'Time', 'NR', 'NREF', 'NFIX', 'NCFIX', 'NFEFC_AVG', 'NFEFC_MAX', 'NAUTH',
        'LOC_ADD_AVG', 'LOC_ADD_MAX', 'LOC_DEL_AVG', 'LOC_DEL_MAX', 'CHURN_AVG',
        'CHURN_MAX', 'CHURN_MIN', 'LABEL',
    ])
    df.to_csv(f'{name}-features.csv')
|
|
|
|
# NR: number of revisions, i.e. commits within the time window
# NREF: number of refactorings per file: total changed files / unique changed files
# NFIX: number of files in bug fixes / number of files called 'fix'
# NAUTH: number of authors who committed; authors are counted per commit
# LOC_ADD, LOC_DEL: lines added and removed (max, min, avg)
# CHURN: added LOC - deleted LOC
# CHGSET: change set, i.e. number of files changed per commit (max and average)
|