[U] Update code

2021-11-26 22:20:20 -05:00
parent 177b4aefec
commit 0f98b70509
5 changed files with 121 additions and 10 deletions
@@ -0,0 +1,12 @@
+cff-version: 1.2.0
+message: "If you use this software, please cite it as below."
+authors:
+  - family-names: Gui
+    given-names: Azalea
+    orcid: https://orcid.org/0000-0002-6141-5926
+  - family-names: Lin
+    given-names: Peter
+title: "COVID-19 Twitter Posting Frequency and Popularity Insights"
+version: 1.0.0
+doi: TODO
+date-released: TODO
@@ -7,12 +7,14 @@ from typing import Optional

 import matplotlib.ticker
 import numpy as np
+import requests
 import scipy.signal
 from matplotlib import pyplot as plt, font_manager
 import matplotlib.dates as mdates
 from matplotlib import cm

 from process.twitter_process import *
+from raw_collect.others import get_covid_cases_us


@dataclass()
@@ -177,7 +179,6 @@ class Sample:
        :return: None
        """
        self.dates = []
-        self.date_freqs = []
        self.date_pops = []

        # Average popularity ratio results over 7 days
@@ -187,12 +188,6 @@ class Sample:
        for (ds, dt) in daterange('2020-01-01', '2021-11-25'):
            self.dates.append(dt)

-            # Convert date covid freq format
-            if ds in self.date_covid_freq:
-                self.date_freqs.append(self.date_covid_freq[ds])
-            else:
-                self.date_freqs.append(0)
-
            # Calculate date covid popularity ratio
            users_posted_today = [u for u in self.users if u in self.user_date_covid_pop_avg and
                                  ds in self.user_date_covid_pop_avg[u]]
@@ -212,11 +207,16 @@ class Sample:
                pops_i = user_pop_ratio_sum / seven_days_count

            # More than seven days, remove one
-            if len(seven_days_user_prs) == 7:
+            if len(seven_days_user_prs) == 20:
                seven_days_user_prs.pop(0)

            self.date_pops.append(pops_i)

+        # Date frequencies
+        self.date_freqs = map_to_dates(self.date_covid_freq,
+                                       [x.isoformat()[:10] for x in self.dates])
+        self.date_freqs = filter_days_avg(self.date_freqs, 3)
+

 def load_samples() -> list[Sample]:
    """
@@ -390,7 +390,21 @@ def graph_line_plot(x: list[datetime], y: Union[list[float], list[list[float]]],
            if len(labels) > i:
                line.set_label(labels[i])
                ax.legend()
-        if not freq:
+
+        # Plotting frequency, add in the COVID cases data
+        if freq:
+            cases = get_covid_cases_us()
+            c = map_to_dates(cases.cases, [d.isoformat()[:10] for d in x])
+            # c = scipy.signal.savgol_filter(c, 45, 2)
+            c = filter_days_avg(c, 7)
+            c = scipy.signal.lfilter([1.0 / n] * n, 1, c)
+
+            twin: plt.Axes = ax.twinx()
+            twin.plot(x, c, color='#d4b595', label='US COVID-19 Cases')
+            twin.set_ylim(bottom=0)
+
+        # Plotting popularity
+        else:
            ax.axhline(1, color=border_color)
            ax.set_ylim(0, 2)

@@ -0,0 +1,29 @@
+from dataclasses import dataclass
+
+import requests
+
+
+@dataclass
+class CasesData:
+    """
+    Case and death figures, each stored as a dict keyed by date string.
+    """
+    # cases[date in "YYYY-MM-DD"] = 7-day average of cases around that date
+    # NOTE(review): whether the stored values are already averaged depends on which CSV
+    # column the loader reads -- confirm against the data source
+    cases: dict[str, float]
+    # deaths[date] = deaths figure for that date, keyed the same way as cases
+    # (presumably also a 7-day average -- TODO confirm)
+    deaths: dict[str, float]
+
+
+def get_covid_cases_us() -> CasesData:
+    """
+    Get the US COVID-19 cases data from https://github.com/nytimes/covid-19-data by New York Times
+
+    Downloads the national rolling-averages CSV, skips the header line, and for every data row
+    reads the date from column 0, the cases value from column 2 and the deaths value from
+    column 6.
+
+    :return: Cases data, keyed by the date string found in the first CSV column
+    :raises requests.RequestException: If the download fails, times out or returns an HTTP error
+    :raises ValueError: If a cases or deaths field cannot be parsed as a number
+    :raises IndexError: If a non-empty row has fewer than seven columns
+    """
+    url = 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/rolling-averages/us.csv'
+    # Bound the wait and fail loudly on an HTTP error instead of parsing an error page as CSV
+    response = requests.get(url, timeout=30)
+    response.raise_for_status()
+
+    # splitlines() copes with both \n and \r\n line endings; the first line is the CSV header
+    rows = response.text.splitlines()[1:]
+    data = CasesData(dict(), dict())
+
+    # Parse CSV
+    for row in rows:
+        # A blank row (e.g. from a trailing newline) has no columns to read, so skip it
+        if not row.strip():
+            continue
+        fields = row.split(',')
+        # NOTE(review): cases comes from column 2 and deaths from column 6 -- confirm against
+        # the CSV header that both are the rolling-average columns CasesData describes
+        day, cases, deaths = fields[0], fields[2], fields[6]
+        data.cases[day] = float(cases)
+        data.deaths[day] = float(deaths)
+    return data
@@ -117,7 +117,7 @@ We graphed the posting frequencies of our three samples in line graphs with the
    <div><img src="/change/freq/eng-news.png" alt="graph"></div>
 </div>

-For all three samples, the posting rates were almost zero during the first month when COVID-19 first started, which is expected because no one knew how devastating it will be at that time. Then, all three samples had a peak in posting frequencies from March 2020 to June 2020. After June 2020, 
+Looking at the three graphs individually, the posting rates of all three samples were almost zero during the first two months after COVID-19 first started, which is expected because no one knew how devastating it would be at that time. Then, all three samples had a peak in posting frequencies from March to June 2020. After June 2020, the posting rate for `500-rand` started steadily declining, while the rate for `eng-news` stayed roughly level with a few peaks, and `500-pop` had many peaks as high as the posting rate from March to June 2020. In an effort to interpret the different peaks, we overlaid the three charts with the COVID-19 cases data from The New York Times [[2]](#ref2), which gave us the following graph: 

 <div class="image-row">
    <div><img src="/change/comb/freq.png" alt="graph" class="large"></div>
@@ -125,6 +125,18 @@ For all three samples, the posting rates were almost zero during the first month

 For `500-rand` and `eng-news`, 

+## Scratch pad (TODO)
+
+For the posting frequency, it is surprising that the posting frequencies of all three of our samples dropped significantly after June 2020, as if everyone had silently agreed to talk less about it.
+
+Possible reasons: 
+
+* There might not be as much "breaking news" or new information anymore, as three months is probably enough for everyone to become aware of the virus.
+* People realized that COVID-19 is not going to be a disaster that fades away quickly and got used to it, so people paid less attention because our attention is very limited. (**TODO**: Possible psychological explanation? -- Look into how long people have paid attention to an unexpected disaster on average. Compare attention with historical disasters like 9/11?)
+* Chinese people might be relieved because new daily cases have been kept to two digits since April.
+* Reopening in China around June.
+* ~~The first time cases seemed to decline in the U.S.~~ (Cases went back to increasing one month later but the posting frequency didn't go back up.)
+
 **_TODO_**

 ## References
@@ -135,3 +147,10 @@ For `500-rand` and `eng-nes`,
 Handle Outliers", _The ASQC Basic References in Quality Control:
 Statistical Techniques_, Edward F. Mykytka, Ph.D., Editor.

+<a id="ref2"></a>
+
+[2] The New York Times. (2021). Coronavirus (Covid-19) Data in the United States. Retrieved November 27, 2021, from https://github.com/nytimes/covid-19-data.
+
+<a id="ref3"></a>
+
+[3] WHO. (n.d.). _Listings of WHO's Response to COVID-19._ World Health Organization. Retrieved November 27, 2021, from https://www.who.int/news/item/29-06-2020-covidtimeline.
@@ -274,6 +274,43 @@ def daterange(start_date: str, end_date: str) -> Generator[tuple[str, datetime],
        yield dt.strftime('%Y-%m-%d'), dt


+def map_to_dates(y: dict[str, Union[int, float]], dates: list[str],
+                 default: float = 0) -> list[float]:
+    """
+    Look up the value of y on each of the given dates, in order.
+
+    Preconditions:
+      - Every entry of dates uses the same date format as the keys of y
+
+    :param y: Values keyed by date (in the format y[date] = value)
+    :param dates: Dates to look up, in the desired output order
+    :param default: Value used for any date that is not a key of y
+    :return: One value per entry of dates, in the same order as dates
+    """
+    # dict.get falls back to default for dates that have no entry in y
+    return [y.get(day, default) for day in dates]
+
+
+def filter_days_avg(y: list[float], n: int) -> list[float]:
+    """
+    Smooth y with a trailing moving average over a window of n days.
+
+    Entry i of the result is the mean of the n values ending at y[i]. Near the start of the
+    list, where fewer than n values exist, the mean is taken over the values seen so far.
+    If n <= 1 there is nothing to average, so a copy of y is returned without processing.
+
+    Precondition:
+      - n % 2 == 1
+        NOTE(review): the window is trailing, so this code does not rely on n being odd
+
+    :param y: Values, one per day
+    :param n: Number of days in the averaging window, must be odd
+    :return: Averaged data, the same length as y
+    """
+    if n <= 1:
+        return list(y)
+
+    results = []
+    for end in range(1, len(y) + 1):
+        # Take exactly the last n values up to and including the current day, so the
+        # window never holds more than n entries
+        window = y[max(0, end - n):end]
+        results.append(sum(window) / len(window))
+    return results
+
+
 def divide_zeros(numerator: list[float], denominator: list[float]) -> list[float]:
    """
    Divide two lists of floats, ignoring zeros (anything dividing by zero will produce zero)