[+] Implement @include-cut

This commit is contained in:
Hykilpikonna
2021-11-25 11:56:20 -05:00
parent 82afe91d11
commit 9ff41d92b0
4 changed files with 25 additions and 10 deletions
+3 -3
View File
@@ -234,7 +234,7 @@ def report_histograms(sample: Sample) -> None:
x = [f.data for f in sample.frequencies]
title = f'COVID-related posting frequency for {sample.name}'
report_histogram(x, f'freq/{sample.name}-hist-outliers.png', title, False, 100)
x = [p for p in x if p > 0.0005]
x = [p for p in x if p > 0.001]
report_histogram(x, f'freq/{sample.name}-hist.png', title, True)
x = [f.data for f in sample.popularity_ratios]
@@ -259,7 +259,7 @@ def report_stats(samples: list[Sample]) -> None:
Reporter('pop/stats.md').table(table, [s.name for s in samples], True)
xs = [[d.data for d in s.frequencies if d.data > 0.0005] for s in samples]
table = tabulate_stats([get_statistics(remove_outliers(x)) for x in xs])
table = tabulate_stats([get_statistics(x) for x in xs], percent=True)
Reporter('freq/stats.md').table(table, [s.name for s in samples], True)
@@ -289,7 +289,7 @@ def report_all() -> None:
debug('Creating reports...')
report_ignored(samples)
report_pop_stats(samples)
report_stats(samples)
for s in samples:
report_top_20_tables(s)
report_histograms(s)
+10 -3
View File
@@ -24,9 +24,16 @@ def generate_report() -> str:
for i in range(len(md)):
line = md[i]
if line.startswith('@include'):
line = line[line.index('`') + 1:]
line = line[:line.index('`')]
md[i] = read(REPORT_DIR + line)
path = line[line.index('`') + 1:]
path = path[:path.index('`')]
md[i] = read(REPORT_DIR + path)
if line.startswith('@include-cut'):
args = [int(i) for i in line.split()[2:]]
if len(args) == 1:
md[i] = '\n'.join(md[i].split('\n')[args[0]:])
if len(args) == 2:
md[i] = '\n'.join(md[i].split('\n')[args[0]:args[1]])
return '\n'.join(md)
+4
View File
@@ -24,6 +24,10 @@ We might graph the frequencies on a histogram to gain more insight: (You can cli
<div><img src="/freq/eng-news-hist-outliers.png" alt="hist"></div>
</div>
However, as you can see, the graphs are not very helpful because the majority of users in the sample post below 0.1%, while a few outliers post very frequently, as much as 40%. For example, if we sort the users in each sample by their frequency, we find a few outliers who post more than 20% even in the random sample:
@include-cut `/freq/500-rand-top-20.md` 0 10
## COVID-19 Popularity Ratios
To prevent division by zero, we ignored people who didn't post about COVID or didn't post at all.
+8 -4
View File
@@ -187,16 +187,20 @@ def get_statistics(points: list[float]) -> Stats:
return Stats(statistics.mean(points), statistics.median(points), statistics.stdev(points))
def tabulate_stats(stats: list[Stats], percent: bool = False) -> list[list[str]]:
    """
    Create a table structure from statistics for tabulate

    The table has three rows. Each row starts with its label ('Mean', 'Median',
    'StdDev') followed by one formatted cell per entry of stats, in order.

    :param stats: Statistics
    :param percent: Whether the numbers are percentages; if so, every value is
        multiplied by 100 and rendered with one decimal place and a '%' suffix
        instead of as a plain number with two decimal places
    :return: Table for tabulate
    """
    def num(n: float) -> str:
        # One formatter shared by all three rows so they cannot drift apart.
        return f'{n * 100:.1f}%' if percent else f'{n:.2f}'

    return [['Mean'] + [num(s.mean) for s in stats],
            ['Median'] + [num(s.median) for s in stats],
            ['StdDev'] + [num(s.stddev) for s in stats]]
def parse_date(iso: str) -> datetime: