[U] Update code

This commit is contained in:
Hykilpikonna
2021-11-26 22:20:20 -05:00
parent 177b4aefec
commit 0f98b70509
5 changed files with 121 additions and 10 deletions
+12
View File
@@ -0,0 +1,12 @@
cff-version: 1.2.0
message: "If you use this software, please cite it as below."
authors:
- family-names: Gui
given-names: Azalea
orcid: https://orcid.org/0000-0002-6141-5926
- family-names: Lin
given-names: Peter
title: "COVID-19 Twitter Posting Frequency and Popularity Insights"
version: 1.0.0
doi: TODO
date-released: TODO
+23 -9
View File
@@ -7,12 +7,14 @@ from typing import Optional
import matplotlib.ticker
import numpy as np
import requests
import scipy.signal
from matplotlib import pyplot as plt, font_manager
import matplotlib.dates as mdates
from matplotlib import cm
from process.twitter_process import *
from raw_collect.others import get_covid_cases_us
@dataclass()
@@ -177,7 +179,6 @@ class Sample:
:return: None
"""
self.dates = []
self.date_freqs = []
self.date_pops = []
# Average popularity ratio results over 7 days
@@ -187,12 +188,6 @@ class Sample:
for (ds, dt) in daterange('2020-01-01', '2021-11-25'):
self.dates.append(dt)
# Convert date covid freq format
if ds in self.date_covid_freq:
self.date_freqs.append(self.date_covid_freq[ds])
else:
self.date_freqs.append(0)
# Calculate date covid popularity ratio
users_posted_today = [u for u in self.users if u in self.user_date_covid_pop_avg and
ds in self.user_date_covid_pop_avg[u]]
@@ -212,11 +207,16 @@ class Sample:
pops_i = user_pop_ratio_sum / seven_days_count
# More than seven days, remove one
if len(seven_days_user_prs) == 7:
if len(seven_days_user_prs) == 20:
seven_days_user_prs.pop(0)
self.date_pops.append(pops_i)
# Date frequencies
self.date_freqs = map_to_dates(self.date_covid_freq,
[x.isoformat()[:10] for x in self.dates])
self.date_freqs = filter_days_avg(self.date_freqs, 3)
def load_samples() -> list[Sample]:
"""
@@ -390,7 +390,21 @@ def graph_line_plot(x: list[datetime], y: Union[list[float], list[list[float]]],
if len(labels) > i:
line.set_label(labels[i])
ax.legend()
if not freq:
# Plotting frequency, add in the COVID cases data
if freq:
cases = get_covid_cases_us()
c = map_to_dates(cases.cases, [d.isoformat()[:10] for d in x])
# c = scipy.signal.savgol_filter(c, 45, 2)
c = filter_days_avg(c, 7)
c = scipy.signal.lfilter([1.0 / n] * n, 1, c)
twin: plt.Axes = ax.twinx()
twin.plot(x, c, color='#d4b595', label='US COVID-19 Cases')
twin.set_ylim(bottom=0)
# Plotting popularity
else:
ax.axhline(1, color=border_color)
ax.set_ylim(0, 2)
+29
View File
@@ -0,0 +1,29 @@
from dataclasses import dataclass
import requests
@dataclass
class CasesData:
    """
    Container for US COVID-19 statistics, indexed by date.

    Both mappings are keyed by a date string in "YYYY-MM-DD" form.
    """
    # cases[date] = case figure for that date. The original author's note describes
    # this as a 7-day average of cases around that date -- TODO confirm against the
    # column that the loader actually reads.
    cases: dict[str, float]
    # deaths[date] = death figure for that date
    deaths: dict[str, float]
def get_covid_cases_us() -> CasesData:
    """
    Get the US COVID-19 cases data from https://github.com/nytimes/covid-19-data by New York Times

    The CSV is downloaded on every call. The header row is skipped and, for every
    remaining non-empty row, column 0 is used as the date key, column 2 as the cases
    value and column 6 as the deaths value.

    NOTE(review): CasesData describes `cases` as a 7-day average -- confirm that
    column 2 (rather than a neighbouring averaged column) is the intended one.

    :return: Cases data
    :raises requests.HTTPError: If the server answers with an error status
    :raises requests.Timeout: If the server does not answer in time
    """
    url = 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/rolling-averages/us.csv'

    # Without a timeout a stalled connection would block forever; without the status
    # check an HTML error page would be parsed as if it were CSV.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    data = CasesData(dict(), dict())

    # Parse CSV: splitlines() copes with both \r\n and \n, [1:] drops the header row
    for line in response.text.splitlines()[1:]:
        # A trailing newline (or stray blank line) must not be parsed as a record
        if not line.strip():
            continue

        split = line.split(',')
        day, cases, deaths = split[0], split[2], split[6]
        data.cases[day] = float(cases)
        data.deaths[day] = float(deaths)

    return data
+20 -1
View File
@@ -117,7 +117,7 @@ We graphed the posting frequencies of our three samples in line graphs with the
<div><img src="/change/freq/eng-news.png" alt="graph"></div>
</div>
For all three samples, the posting rates were almost zero during the first month when COVID-19 first started, which is expected because no one knew how devastating it will be at that time. Then, all three samples had a peak in posting frequencies from March 2020 to June 2020. After June 2020,
Looking at the three graphs individually, the posting rates were almost zero for all three samples during the first two months when COVID-19 first started, which is expected because no one knew how devastating it would be at that time. Then, all three samples had a peak in posting frequencies from March to June 2020. After June 2020, the posting rate for `500-rand` started steadily declining, while the rate for `eng-news` stayed roughly level with a few peaks, and `500-pop` had many peaks as high as its posting rate from March to June 2020. In an effort to interpret the different peaks, we overlaid the three charts with the COVID-19 cases data from the New York Times [[2]](#ref2), which gave us the following graph:
<div class="image-row">
<div><img src="/change/comb/freq.png" alt="graph" class="large"></div>
@@ -125,6 +125,18 @@ For all three samples, the posting rates were almost zero during the first month
For `500-rand` and `eng-news`,
## Scratch pad (TODO)
For the posting frequency, it is surprising that the posting frequencies of all three of our samples dropped significantly after June 2020, as if everyone silently agreed to talk less about it.
Possible reasons:
* There might not be as much "breaking news" or new information anymore, as three months is probably enough for everyone to be aware of the virus.
* People realized that COVID-19 is not going to be a disaster that fades away quickly and got used to it, so people paid less attention because our attention is very limited. (**TODO**: Possible psychological explanation? -- Look into how long people have paid attention to an unexpected disaster on average. Compare attention with historical disasters like 9/11?)
* Chinese people might have been relieved because new daily cases had been kept down to two digits since April.
* Reopening in China around June.
* ~~The first time cases seemed to decline in the U.S.~~ (Cases went back to increasing one month later but the posting frequency didn't go back up.)
**_TODO_**
## References
@@ -135,3 +147,10 @@ For `500-rand` and `eng-nes`,
Handle Outliers", _The ASQC Basic References in Quality Control:
Statistical Techniques_, Edward F. Mykytka, Ph.D., Editor.
<a id="ref2"></a>
[2] The New York Times. (2021). Coronavirus (Covid-19) Data in the United States. Retrieved November 27, 2021, from https://github.com/nytimes/covid-19-data.
<a id="ref3"></a>
[3] WHO. (n.d.) _Listings of WHO's Response to COVID-19._ World Health Organization. Retrieved November 27, 2021, from https://www.who.int/news/item/29-06-2020-covidtimeline.
+37
View File
@@ -274,6 +274,43 @@ def daterange(start_date: str, end_date: str) -> Generator[tuple[str, datetime],
yield dt.strftime('%Y-%m-%d'), dt
def map_to_dates(y: dict[str, Union[int, float]], dates: list[str],
                 default: float = 0) -> list[float]:
    """
    Line up date-keyed values with a given sequence of dates.

    Preconditions:
      - Every entry of dates uses the same string format as the keys of y

    :param y: Values keyed by date (y[date] = value)
    :param dates: Dates to look up, in the order the output should follow
    :param default: Value used for a date that is not a key of y
    :return: One value per entry of dates, in the same order
    """
    return [y.get(day, default) for day in dates]
def filter_days_avg(y: list[float], n: int) -> list[float]:
    """
    Filter y by taking an average over a n-days window. If n = 0, then return y without processing.

    The window is trailing: the result at index i is the mean of y[i - n + 1] ... y[i].
    For the first n - 1 entries the window is shorter, because fewer than n values are
    available; the mean is then taken over the values that do exist.

    Precondition:
      - n % 2 == 1 (kept from the original contract; the trailing window itself also
        works for even n)

    :param y: Values
    :param n: Number of days in the window
    :return: Averaged data, same length as y
    """
    # Nothing to average over: hand back an unmodified copy
    if n <= 0:
        return list(y)

    results = []
    for i in range(len(y)):
        # Exactly the last n values up to and including i (fewer at the start).
        # The previous implementation trimmed its buffer only after it had grown
        # past n, so it really averaged n + 1 values.
        window = y[max(0, i - n + 1):i + 1]
        results.append(sum(window) / len(window))

    return results
def divide_zeros(numerator: list[float], denominator: list[float]) -> list[float]:
"""
Divide two lists of floats, ignoring zeros (anything dividing by zero will produce zero)