From 9ff41d92b0772c0964db4293d5b430ba6f8bbb38 Mon Sep 17 00:00:00 2001 From: Hykilpikonna Date: Thu, 25 Nov 2021 11:56:20 -0500 Subject: [PATCH] [+] Implement @include-cut --- src/process/twitter_visualization.py | 6 +++--- src/report/report.py | 13 ++++++++++--- src/report/report_document.md | 4 ++++ src/utils.py | 12 ++++++++---- 4 files changed, 25 insertions(+), 10 deletions(-) diff --git a/src/process/twitter_visualization.py b/src/process/twitter_visualization.py index 17a29ba..b375a87 100644 --- a/src/process/twitter_visualization.py +++ b/src/process/twitter_visualization.py @@ -234,7 +234,7 @@ def report_histograms(sample: Sample) -> None: x = [f.data for f in sample.frequencies] title = f'COVID-related posting frequency for {sample.name}' report_histogram(x, f'freq/{sample.name}-hist-outliers.png', title, False, 100) - x = [p for p in x if p > 0.0005] + x = [p for p in x if p > 0.001] report_histogram(x, f'freq/{sample.name}-hist.png', title, True) x = [f.data for f in sample.popularity_ratios] @@ -259,7 +259,7 @@ def report_stats(samples: list[Sample]) -> None: Reporter('pop/stats.md').table(table, [s.name for s in samples], True) xs = [[d.data for d in s.frequencies if d.data > 0.0005] for s in samples] - table = tabulate_stats([get_statistics(remove_outliers(x)) for x in xs]) + table = tabulate_stats([get_statistics(x) for x in xs], percent=True) Reporter('freq/stats.md').table(table, [s.name for s in samples], True) @@ -289,7 +289,7 @@ def report_all() -> None: debug('Creating reports...') report_ignored(samples) - report_pop_stats(samples) + report_stats(samples) for s in samples: report_top_20_tables(s) report_histograms(s) diff --git a/src/report/report.py b/src/report/report.py index 895e9da..7a5cbfc 100644 --- a/src/report/report.py +++ b/src/report/report.py @@ -24,9 +24,16 @@ def generate_report() -> str: for i in range(len(md)): line = md[i] if line.startswith('@include'): - line = line[line.index('`') + 1:] - line = line[:line.index('`')] - 
md[i] = read(REPORT_DIR + line) + path = line[line.index('`') + 1:] + path = path[:path.index('`')] + md[i] = read(REPORT_DIR + path) + + if line.startswith('@include-cut'): + args = [int(i) for i in line.split()[2:]] + if len(args) == 1: + md[i] = '\n'.join(md[i].split('\n')[args[0]:]) + if len(args) == 2: + md[i] = '\n'.join(md[i].split('\n')[args[0]:args[1]]) return '\n'.join(md) diff --git a/src/report/report_document.md b/src/report/report_document.md index 4466eab..878aa5a 100644 --- a/src/report/report_document.md +++ b/src/report/report_document.md @@ -24,6 +24,10 @@ We might graph the frequencies on a histogram to gain more insight: (You can cli
hist
+However, as you can see, the graphs are not very helpful because the majority of users in a sample post at a frequency below 0.1%, while there are many outliers who post very frequently, some at around 40%. For example, if we sort the users in a sample by their frequency, we find a few outliers who post at a frequency above 20% even in the random sample: + +@include-cut `/freq/500-rand-top-20.md` 0 10 + ## COVID-19 Popularity Ratios To prevent division by zero, we ignored people who didn't post about COVID or didn't post at all. diff --git a/src/utils.py b/src/utils.py index a323afc..e23f936 100644 --- a/src/utils.py +++ b/src/utils.py @@ -187,16 +187,20 @@ def get_statistics(points: list[float]) -> Stats: return Stats(statistics.mean(points), statistics.median(points), statistics.stdev(points)) -def tabulate_stats(stats: list[Stats]) -> list[list[str]]: +def tabulate_stats(stats: list[Stats], percent: bool = False) -> list[list[str]]: """ Create a table structure from statistics for tabulate :param stats: Statistics + :param percent: Whether the numbers are percentages :return: Table for tabulate """ - return [['Mean'] + [f'{s.mean:.2f}' for s in stats], - ['Median'] + [f'{s.median:.2f}' for s in stats], - ['StdDev'] + [f'{s.stddev:.2f}' for s in stats]] + def num(n: float) -> str: + return f'{n:.2f}' if not percent else f'{n * 100:.1f}%' + + return [['Mean'] + [num(s.mean) for s in stats], + ['Median'] + [num(s.median) for s in stats], + ['StdDev'] + [num(s.stddev) for s in stats]] def parse_date(iso: str) -> datetime: