From 4ac3b94c04eb3ba8c0eb25614bd3c59a7dbbc366 Mon Sep 17 00:00:00 2001 From: Hykilpikonna Date: Thu, 25 Nov 2021 12:41:01 -0500 Subject: [PATCH] [+] Add Q25 Q75, IQR calculations --- src/report/report_document.md | 26 +++++++++++++++++++++++++- src/utils.py | 16 +++++++++++++--- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/src/report/report_document.md b/src/report/report_document.md index 878aa5a..1f22f4a 100644 --- a/src/report/report_document.md +++ b/src/report/report_document.md @@ -24,10 +24,24 @@ We might graph the frequencies on a histogram to gain more insight: (You can cli
hist
-However, as you can see, the graphs are not very helpful because the majority of the sample post below 0.1%, and there are many outliers who post very frequently, like 40%. For example, if we sort the samples by their frequency, we have a few outliers who post more than 20% even in the random sample: +However, as you can see, the graphs are not very helpful because the majority of the sample post below 0.1%, and there are many outliers who post very frequently, like 40%. For example, if we sort the samples by their frequency, we have a few outliers who post more than 20% even in `500-rand`: @include-cut `/freq/500-rand-top-20.md` 0 10 +So, we removed the outliers using the method proposed by Boris Iglewicz and David Hoaglin (1993) [[1]](#ref1) and ignoring everyone who posted below 0.1% and graphed the same histogram again: + +
+
hist
+
hist
+
hist
+
+ +As expected, the distributions look right-skewed, with most people posting not very much. One interesting distinction is that, even though the distributions follow similar shapes, the x-axis ticks of `eng-news` are actually ten times larger than the other two, which means that `eng-news` posts a lot more about COVID-19 on average than the other two samples. We can calculate some statistics of the samples to further verify this: + +@include `/freq/stats.md` + +Since there are many outliers, medians will more accurately represent the + ## COVID-19 Popularity Ratios To prevent division by zero, we ignored people who didn't post about COVID or didn't post at all. @@ -39,3 +53,13 @@ Test Include: @include `/pop/ignored.md` @include `/pop/stats-with-outliers.md` + + +## References + + + +[1] Iglewicz, Boris, & David Hoaglin (1993), "Volume 16: How to Detect and +Handle Outliers", _The ASQC Basic References in Quality Control: +Statistical Techniques_, Edward F. Mykytka, Ph.D., Editor. + diff --git a/src/utils.py b/src/utils.py index e23f936..853ac2b 100644 --- a/src/utils.py +++ b/src/utils.py @@ -173,8 +173,11 @@ def remove_outliers(points: list[float], z_threshold: float = 3.5) -> list[float class Stats(NamedTuple): mean: float - median: float stddev: float + median: float + iqr: float + q25: float + q75: float def get_statistics(points: list[float]) -> Stats: @@ -184,7 +187,10 @@ def get_statistics(points: list[float]) -> Stats: :param points: Input points :return: Statistics """ - return Stats(statistics.mean(points), statistics.median(points), statistics.stdev(points)) + q75, q25 = np.percentile(points, [75, 25]) + iqr = q75 - q25 + return Stats(statistics.mean(points), statistics.stdev(points), statistics.median(points), + iqr, q25, q75) def tabulate_stats(stats: list[Stats], percent: bool = False) -> list[list[str]]: @@ -199,8 +205,12 @@ def tabulate_stats(stats: list[Stats], percent: bool = False) -> list[list[str]] return f'{n:.2f}' if not percent else f'{n * 
100:.1f}%' return [['Mean'] + [num(s.mean) for s in stats], + ['StdDev'] + [num(s.stddev) for s in stats], ['Median'] + [num(s.median) for s in stats], - ['StdDev'] + [num(s.stddev) for s in stats]] + ['IQR'] + [num(s.iqr) for s in stats], + ['Q25%'] + [num(s.q25) for s in stats], + ['Q75%'] + [num(s.q75) for s in stats], + ] def parse_date(iso: str) -> datetime: