Files
fixminer_source/experiments/commit_dates.py
T
Azalea (on HyDEV-Daisy) 428d5afc58 [U] Update
2022-10-01 13:34:56 -04:00

53 lines
1.7 KiB
Python

from __future__ import annotations
import os
from datetime import datetime, timezone, date
from pathlib import Path
import numpy as np
import pandas
from matplotlib import pyplot as plt
from pandas import DataFrame
if __name__ == '__main__':
path = Path('../../data.relative/0/commitsDF')
project_list=[p.replace('-fix.pickle.gz','') for p in os.listdir(path) if
p.endswith('pickle.gz')]
dates_by_repo: list[list[datetime]] = \
[list(pandas.read_pickle(path / str(p)).commitDate) for p in os.listdir(path) if
p.endswith('pickle.gz')]
dates = np.hstack(dates_by_repo)
plt.hist(dates, bins=50)
plt.xlabel('Date')
plt.ylabel('Total Commits')
plt.xlim([date(2001, 1, 1), date(2022, 6, 1)])
plt.savefig("fm-dates.png")
plt.show()
first_dates = [cs[-1] for cs in dates_by_repo]
plt.hist(first_dates, bins=20)
plt.xlabel('First Commit Date')
plt.xlim([date(2001, 1, 1), date(2022, 6, 1)])
#plt.show()
last_dates = [cs[0] for cs in dates_by_repo]
plt.hist(last_dates, bins=20)
plt.xlabel('Last Commit Date')
plt.xlim([date(2001, 1, 1), date(2022, 6, 1)])
#plt.show()
mean_dates = [datetime.fromtimestamp(
float(np.mean([c.replace(tzinfo=timezone.utc).timestamp() for c in cs]))) for cs in
dates_by_repo]
plt.hist(mean_dates, bins=20)
plt.xlabel('Mean Commit Date')
plt.xlim([date(2001, 1, 1), date(2022, 6, 1)])
#plt.show()
csv = []
for i in range(len(first_dates)):
csv.append((project_list[i],first_dates[i], last_dates[i]))
df = DataFrame(csv, columns=('project','first_dates','last commit dates'))
df.to_csv('commits-dates.csv')