From 0f98b70509c51cca007311fe8225444f8b8adc8e Mon Sep 17 00:00:00 2001
From: Hykilpikonna <me@hydev.org>
Date: Fri, 26 Nov 2021 22:20:20 -0500
Subject: [PATCH] [U] Update code

---
 CITATION.cff                         | 12 +++++++++
 src/process/twitter_visualization.py | 32 +++++++++++++++++-------
 src/raw_collect/others.py            | 29 ++++++++++++++++++++++
 src/report/report_document.md        | 21 +++++++++++++++-
 src/utils.py                         | 37 ++++++++++++++++++++++++++++
 5 files changed, 121 insertions(+), 10 deletions(-)
 create mode 100644 CITATION.cff
 create mode 100644 src/raw_collect/others.py

diff --git a/CITATION.cff b/CITATION.cff
new file mode 100644
index 0000000..9cc6bcb
--- /dev/null
+++ b/CITATION.cff
@@ -0,0 +1,12 @@
+cff-version: 1.2.0
+message: "If you use this software, please cite it as below."
+authors:
+  - family-names: Gui
+    given-names: Azalea
+    orcid: https://orcid.org/0000-0002-6141-5926
+  - family-names: Lin
+    given-names: Peter
+title: "COVID-19 Twitter Posting Frequency and Popularity Insights"
+version: 1.0.0
+doi: TODO
+date-released: TODO
diff --git a/src/process/twitter_visualization.py b/src/process/twitter_visualization.py
index 05d7e29..d80c6d1 100644
--- a/src/process/twitter_visualization.py
+++ b/src/process/twitter_visualization.py
@@ -7,12 +7,14 @@ from typing import Optional
 
 import matplotlib.ticker
 import numpy as np
+import requests
 import scipy.signal
 from matplotlib import pyplot as plt, font_manager
 import matplotlib.dates as mdates
 from matplotlib import cm
 
 from process.twitter_process import *
+from raw_collect.others import get_covid_cases_us
 
 
 @dataclass()
@@ -177,7 +179,6 @@ class Sample:
         :return: None
         """
         self.dates = []
-        self.date_freqs = []
         self.date_pops = []
 
         # Average popularity ratio results over 7 days
@@ -187,12 +188,6 @@ class Sample:
         for (ds, dt) in daterange('2020-01-01', '2021-11-25'):
             self.dates.append(dt)
 
-            # Convert date covid freq format
-            if ds in self.date_covid_freq:
-                self.date_freqs.append(self.date_covid_freq[ds])
-            else:
-                self.date_freqs.append(0)
-
             # Calculate date covid popularity ratio
             users_posted_today = [u for u in self.users if u in self.user_date_covid_pop_avg and
                                   ds in self.user_date_covid_pop_avg[u]]
@@ -212,11 +207,16 @@ class Sample:
                 pops_i = user_pop_ratio_sum / seven_days_count
 
             # More than seven days, remove one
-            if len(seven_days_user_prs) == 7:
+            if len(seven_days_user_prs) == 20:
                 seven_days_user_prs.pop(0)
 
             self.date_pops.append(pops_i)
 
+        # Date frequencies
+        self.date_freqs = map_to_dates(self.date_covid_freq,
+                                       [x.isoformat()[:10] for x in self.dates])
+        self.date_freqs = filter_days_avg(self.date_freqs, 3)
+
 
 def load_samples() -> list[Sample]:
     """
@@ -390,7 +390,21 @@ def graph_line_plot(x: list[datetime], y: Union[list[float], list[list[float]]],
             if len(labels) > i:
                 line.set_label(labels[i])
                 ax.legend()
-        if not freq:
+
+        # Plotting frequency, add in the COVID cases data
+        if freq:
+            cases = get_covid_cases_us()
+            c = map_to_dates(cases.cases, [d.isoformat()[:10] for d in x])
+            # c = scipy.signal.savgol_filter(c, 45, 2)
+            c = filter_days_avg(c, 7)
+            c = scipy.signal.lfilter([1.0 / n] * n, 1, c)
+
+            twin: plt.Axes = ax.twinx()
+            twin.plot(x, c, color='#d4b595', label='US COVID-19 Cases')
+            twin.set_ylim(bottom=0)
+
+        # Plotting popularity
+        else:
             ax.axhline(1, color=border_color)
             ax.set_ylim(0, 2)
 
diff --git a/src/raw_collect/others.py b/src/raw_collect/others.py
new file mode 100644
index 0000000..8a5fba0
--- /dev/null
+++ b/src/raw_collect/others.py
@@ -0,0 +1,29 @@
+from dataclasses import dataclass
+
+import requests
+
+
+@dataclass
+class CasesData:
+    # Keyed by date ("YYYY-MM-DD"). NOTE(review): confirm cases is the 7-day average, not raw counts
+    cases: dict[str, float]
+    deaths: dict[str, float]
+
+
+def get_covid_cases_us() -> CasesData:
+    """
+    Download the US COVID-19 rolling-averages CSV published by The New York Times
+    (https://github.com/nytimes/covid-19-data) and index two of its columns by date.
+
+    :return: Cases data keyed by the first CSV column (the date string)
+    :raises requests.HTTPError: If the server answers with an error status
+    """
+    url = 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/rolling-averages/us.csv'
+    response = requests.get(url, timeout=30)
+    response.raise_for_status()
+    data = CasesData(dict(), dict())
+    # Skip the header row; blank lines (e.g. after a trailing newline) would break the indexing
+    for row in (ln.split(',') for ln in response.text.splitlines()[1:] if ln.strip()):
+        data.cases[row[0]] = float(row[2])
+        data.deaths[row[0]] = float(row[6])
+    return data
diff --git a/src/report/report_document.md b/src/report/report_document.md
index fe3ba89..7278fc0 100644
--- a/src/report/report_document.md
+++ b/src/report/report_document.md
@@ -117,7 +117,7 @@ We graphed the posting frequencies of our three samples in line graphs with the
     <div><img src="/change/freq/eng-news.png" alt="graph"></div>
 </div>
 
-For all three samples, the posting rates were almost zero during the first month when COVID-19 first started, which is expected because no one knew how devastating it will be at that time. Then, all three samples had a peak in posting frequencies from March 2020 to June 2020. After June 2020, 
+Looking at the three graphs individually, the posting rates for all three samples were almost zero during the first two months after COVID-19 first appeared, which is expected because no one knew how devastating it would be at that time. Then, all three samples had a peak in posting frequencies from March to June 2020. After June 2020, the posting rate for `500-rand` started steadily declining, while the rate for `eng-news` stayed roughly level with a few peaks, and `500-pop` had many peaks as high as its posting rate from March to June 2020. In an effort to interpret the different peaks, we overlaid the three charts with the COVID-19 cases data from The New York Times [[2]](#ref2), which gave us the following graph:
 
 <div class="image-row">
     <div><img src="/change/comb/freq.png" alt="graph" class="large"></div>
@@ -125,6 +125,18 @@ For all three samples, the posting rates were almost zero during the first month
 
 For `500-rand` and `eng-nes`, 
 
+## Scratch pad (TODO)
+
+For the posting frequency, it is surprising that the posting frequencies of all three of our samples dropped significantly after June 2020, as if everyone silently agreed to talk less about COVID-19.
+
+Possible reasons: 
+
+* There might not be as much "breaking news" or new information anymore, as three months is probably enough for everyone to become aware of the virus.
+* People realized that COVID-19 is not a disaster that will fade away quickly and got used to it, so they paid less attention because attention is a limited resource. (**TODO**: Possible psychological explanation? -- Look into how long people pay attention to an unexpected disaster on average. Compare attention with historical disasters like 9/11?)
+* Chinese people might be relieved because new daily cases have been held to two digits since April.
+* Reopening in China around June.
+* ~~The first time cases seemed to decline in the U.S.~~ (Cases went back to increasing one month later but the posting frequency didn't go back up.)
+
 **_TODO_**
 
 ## References
@@ -135,3 +147,10 @@ For `500-rand` and `eng-nes`,
 Handle Outliers", _The ASQC Basic References in Quality Control:
 Statistical Techniques_, Edward F. Mykytka, Ph.D., Editor.
 
+<a id="ref2"></a>
+
+[2] The New York Times. (2021). Coronavirus (Covid-19) Data in the United States. Retrieved November 27, 2021, from https://github.com/nytimes/covid-19-data.
+
+<a id="ref3"></a>
+
+[3] WHO. (n.d.) _Listings of WHO's Response to COVID-19._ World Health Organization. Retrieved November 27, 2021, from https://www.who.int/news/item/29-06-2020-covidtimeline.
diff --git a/src/utils.py b/src/utils.py
index c6a4af0..6c2adac 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -274,6 +274,43 @@ def daterange(start_date: str, end_date: str) -> Generator[tuple[str, datetime],
         yield dt.strftime('%Y-%m-%d'), dt
 
 
+def map_to_dates(y: dict[str, Union[int, float]], dates: list[str],
+                 default: float = 0) -> list[float]:
+    """
+    Look up y for each date in dates, in order, substituting default for missing dates
+
+    Preconditions:
+      - Each entry of dates uses the same string format as the keys of y
+
+    :param y: Data keyed by date (y[date] = value)
+    :param dates: Dates to look up
+    :param default: Value used for a date that is not a key of y
+    :return: One value per entry of dates, aligned with dates index by index
+    """
+    return [y.get(day, default) for day in dates]
+
+
+def filter_days_avg(y: list[float], n: int) -> list[float]:
+    """
+    Smooth y with a trailing moving average: each output value is the mean of the
+    current value and up to n - 1 values before it (fewer at the start of the list).
+
+    Preconditions:
+      - n >= 0
+
+    :param y: Values, one per day
+    :param n: Window size in days; 0 or 1 returns the values without averaging
+    :return: Averaged data, same length as y
+    """
+    if n <= 1:
+        return [float(v) for v in y]
+    results = []
+    for i in range(len(y)):
+        window = y[max(0, i - n + 1):i + 1]
+        results.append(sum(window) / len(window))
+    return results
+
+
 def divide_zeros(numerator: list[float], denominator: list[float]) -> list[float]:
     """
     Divide two lists of floats, ignoring zeros (anything dividing by zero will produce zero)