[+] Implement @include-cut

2021-11-25 11:56:20 -05:00
parent 82afe91d11
commit 9ff41d92b0
4 changed files with 25 additions and 10 deletions
@@ -234,7 +234,7 @@ def report_histograms(sample: Sample) -> None:
    x = [f.data for f in sample.frequencies]
    title = f'COVID-related posting frequency for {sample.name}'
    report_histogram(x, f'freq/{sample.name}-hist-outliers.png', title, False, 100)
-    x = [p for p in x if p > 0.0005]
+    x = [p for p in x if p > 0.001]
    report_histogram(x, f'freq/{sample.name}-hist.png', title, True)

    x = [f.data for f in sample.popularity_ratios]
@@ -259,7 +259,7 @@ def report_stats(samples: list[Sample]) -> None:
    Reporter('pop/stats.md').table(table, [s.name for s in samples], True)

    xs = [[d.data for d in s.frequencies if d.data > 0.0005] for s in samples]
-    table = tabulate_stats([get_statistics(remove_outliers(x)) for x in xs])
+    table = tabulate_stats([get_statistics(x) for x in xs], percent=True)
    Reporter('freq/stats.md').table(table, [s.name for s in samples], True)


@@ -289,7 +289,7 @@ def report_all() -> None:
    debug('Creating reports...')

    report_ignored(samples)
-    report_pop_stats(samples)
+    report_stats(samples)
    for s in samples:
        report_top_20_tables(s)
        report_histograms(s)
@@ -24,9 +24,16 @@ def generate_report() -> str:
    for i in range(len(md)):
        line = md[i]
        if line.startswith('@include'):
-            line = line[line.index('`') + 1:]
-            line = line[:line.index('`')]
-            md[i] = read(REPORT_DIR + line)
+            path = line[line.index('`') + 1:]
+            path = path[:path.index('`')]
+            md[i] = read(REPORT_DIR + path)
+
+            if line.startswith('@include-cut'):
+                args = [int(i) for i in line.split()[2:]]
+                if len(args) == 1:
+                    md[i] = '\n'.join(md[i].split('\n')[args[0]:])
+                if len(args) == 2:
+                    md[i] = '\n'.join(md[i].split('\n')[args[0]:args[1]])

    return '\n'.join(md)

@@ -24,6 +24,10 @@ We might graph the frequencies on a histogram to gain more insight: (You can cli
    <div><img src="/freq/eng-news-hist-outliers.png" alt="hist"></div>
 </div>

+However, as you can see, these graphs are not very helpful: the majority of users in each sample post at a frequency below 0.1%, while a number of outliers post far more frequently, some as high as 40%. For example, if we sort the users in a sample by their frequency, we find a few outliers who post more than 20% even in the random sample:
+
+@include-cut `/freq/500-rand-top-20.md` 0 10
+
 ## COVID-19 Popularity Ratios

 To prevent division by zero, we ignored people who didn't post about COVID or didn't post at all.
@@ -187,16 +187,20 @@ def get_statistics(points: list[float]) -> Stats:
    return Stats(statistics.mean(points), statistics.median(points), statistics.stdev(points))


-def tabulate_stats(stats: list[Stats]) -> list[list[str]]:
+def tabulate_stats(stats: list[Stats], percent: bool = False) -> list[list[str]]:
    """
    Create a table structure from statistics for tabulate

    :param stats: Statistics
+    :param percent: Whether the numbers are percentages
    :return: Table for tabulate
    """
-    return [['Mean'] + [f'{s.mean:.2f}' for s in stats],
-            ['Median'] + [f'{s.median:.2f}' for s in stats],
-            ['StdDev'] + [f'{s.stddev:.2f}' for s in stats]]
+    def num(n: float) -> str:
+        return f'{n:.2f}' if not percent else f'{n * 100:.1f}%'
+
+    return [['Mean'] + [num(s.mean) for s in stats],
+            ['Median'] + [num(s.median) for s in stats],
+            ['StdDev'] + [num(s.stddev) for s in stats]]


 def parse_date(iso: str) -> datetime: