From 9ff41d92b0772c0964db4293d5b430ba6f8bbb38 Mon Sep 17 00:00:00 2001
From: Hykilpikonna <me@hydev.org>
Date: Thu, 25 Nov 2021 11:56:20 -0500
Subject: [PATCH] [+] Implement @include-cut

---
 src/process/twitter_visualization.py |  6 +++---
 src/report/report.py                 | 13 ++++++++++---
 src/report/report_document.md        |  4 ++++
 src/utils.py                         | 12 ++++++++----
 4 files changed, 25 insertions(+), 10 deletions(-)
diff --git a/src/process/twitter_visualization.py b/src/process/twitter_visualization.py
index 17a29ba..b375a87 100644
--- a/src/process/twitter_visualization.py
+++ b/src/process/twitter_visualization.py
@@ -234,7 +234,7 @@ def report_histograms(sample: Sample) -> None:
     x = [f.data for f in sample.frequencies]
     title = f'COVID-related posting frequency for {sample.name}'
     report_histogram(x, f'freq/{sample.name}-hist-outliers.png', title, False, 100)
-    x = [p for p in x if p > 0.0005]
+    x = [p for p in x if p > 0.001]
     report_histogram(x, f'freq/{sample.name}-hist.png', title, True)
 
     x = [f.data for f in sample.popularity_ratios]
@@ -259,7 +259,7 @@ def report_stats(samples: list[Sample]) -> None:
     Reporter('pop/stats.md').table(table, [s.name for s in samples], True)
 
     xs = [[d.data for d in s.frequencies if d.data > 0.0005] for s in samples]
-    table = tabulate_stats([get_statistics(remove_outliers(x)) for x in xs])
+    table = tabulate_stats([get_statistics(x) for x in xs], percent=True)
     Reporter('freq/stats.md').table(table, [s.name for s in samples], True)
 
 
@@ -289,7 +289,7 @@ def report_all() -> None:
     debug('Creating reports...')
 
     report_ignored(samples)
-    report_pop_stats(samples)
+    report_stats(samples)
     for s in samples:
         report_top_20_tables(s)
         report_histograms(s)
diff --git a/src/report/report.py b/src/report/report.py
index 895e9da..7a5cbfc 100644
--- a/src/report/report.py
+++ b/src/report/report.py
@@ -24,9 +24,16 @@ def generate_report() -> str:
     for i in range(len(md)):
         line = md[i]
         if line.startswith('@include'):
-            line = line[line.index('`') + 1:]
-            line = line[:line.index('`')]
-            md[i] = read(REPORT_DIR + line)
+            path = line[line.index('`') + 1:]
+            path = path[:path.index('`')]
+            md[i] = read(REPORT_DIR + path)
+
+            if line.startswith('@include-cut'):
+                args = [int(i) for i in line.split()[2:]]
+                if len(args) == 1:
+                    md[i] = '\n'.join(md[i].split('\n')[args[0]:])
+                if len(args) == 2:
+                    md[i] = '\n'.join(md[i].split('\n')[args[0]:args[1]])
 
     return '\n'.join(md)
 
diff --git a/src/report/report_document.md b/src/report/report_document.md
index 4466eab..878aa5a 100644
--- a/src/report/report_document.md
+++ b/src/report/report_document.md
@@ -24,6 +24,10 @@ We might graph the frequencies on a histogram to gain more insight: (You can cli
     <div><img src="/freq/eng-news-hist-outliers.png" alt="hist"></div>
 </div>
 
+However, as you can see, these graphs are not very helpful: the majority of each sample has a posting frequency below 0.1%, while there are many outliers who post very frequently, with frequencies as high as 40%. For example, if we sort the users by their frequency, we find a few outliers with frequencies above 20% even in the random sample:
+
+@include-cut `/freq/500-rand-top-20.md` 0 10
+
 ## COVID-19 Popularity Ratios
 
 To prevent division by zero, we ignored people who didn't post about COVID or didn't post at all.
diff --git a/src/utils.py b/src/utils.py
index a323afc..e23f936 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -187,16 +187,20 @@ def get_statistics(points: list[float]) -> Stats:
     return Stats(statistics.mean(points), statistics.median(points), statistics.stdev(points))
 
 
-def tabulate_stats(stats: list[Stats]) -> list[list[str]]:
+def tabulate_stats(stats: list[Stats], percent: bool = False) -> list[list[str]]:
     """
     Create a table structure from statistics for tabulate
 
     :param stats: Statistics
+    :param percent: Whether the numbers are percentages
     :return: Table for tabulate
     """
-    return [['Mean'] + [f'{s.mean:.2f}' for s in stats],
-            ['Median'] + [f'{s.median:.2f}' for s in stats],
-            ['StdDev'] + [f'{s.stddev:.2f}' for s in stats]]
+    def num(n: float) -> str:
+        return f'{n:.2f}' if not percent else f'{n * 100:.1f}%'
+
+    return [['Mean'] + [num(s.mean) for s in stats],
+            ['Median'] + [num(s.median) for s in stats],
+            ['StdDev'] + [num(s.stddev) for s in stats]]
 
 
 def parse_date(iso: str) -> datetime: