[+] Update code

This commit is contained in:
Hykilpikonna
2021-11-26 18:43:19 -05:00
parent 7a5fb3b71e
commit 4d831eaba0
5 changed files with 87 additions and 38 deletions
+50 -34
View File
@@ -178,6 +178,9 @@ class Sample:
self.date_freqs = []
self.date_pops = []
# Average popularity ratio results over 7 days
seven_days_user_prs = []
# Loop through all dates from the start of COVID to when the data is obtained
for (ds, dt) in daterange('2020-01-01', '2021-11-25'):
self.dates.append(dt)
@@ -191,20 +194,25 @@ class Sample:
# Calculate date covid popularity ratio
users_posted_today = [u for u in self.users if u in self.user_date_covid_pop_avg and
ds in self.user_date_covid_pop_avg[u]]
if len(users_posted_today) != 0:
user_pop_ratio_sum = sum(self.user_date_covid_pop_avg[u][ds] /
self.user_all_pop_avg[u] for u in users_posted_today
if self.user_all_pop_avg[u] != 0)
pops_i = user_pop_ratio_sum / len(users_posted_today)
if pops_i > 20:
print('Date: ', ds)
for u in users_posted_today:
if self.user_all_pop_avg[u] != 0:
print('-', u, self.user_date_covid_pop_avg[u][ds] /
self.user_all_pop_avg[u])
if len(users_posted_today) == 0:
seven_days_user_prs.append([])
else:
user_prs = [self.user_date_covid_pop_avg[u][ds] / self.user_all_pop_avg[u]
for u in users_posted_today if self.user_all_pop_avg[u] != 0]
seven_days_user_prs.append(user_prs)
# Average over seven days
seven_days_count = sum(len(user_prs) for user_prs in seven_days_user_prs)
if seven_days_count == 0:
pops_i = 1
else:
user_pop_ratio_sum = sum(sum(user_prs) for user_prs in seven_days_user_prs)
pops_i = user_pop_ratio_sum / seven_days_count
# Window already holds seven days: drop the oldest day before the next date is added
if len(seven_days_user_prs) == 7:
seven_days_user_prs.pop(0)
self.date_pops.append(pops_i)
@@ -318,8 +326,8 @@ def graph_histogram(x: list[float], path: str, title: str, clear_outliers: bool
plt.close(fig)
def graph_line_plot(x: list[datetime], y: list[float], path: str, title: str, freq: bool,
n: int = 0) -> None:
def graph_line_plot(x: list[datetime], y: Union[list[float], list[list[float]]], path: str,
title: str, freq: bool, n: int = 0) -> None:
"""
Plot a line plot, and reduce noise using an IIR filter
@@ -347,34 +355,37 @@ def graph_line_plot(x: list[datetime], y: list[float], path: str, title: str, fr
# Date format
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=3))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%m-%d\n%Y'))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%m\n%Y'))
ax.xaxis.set_minor_locator(mdates.MonthLocator(interval=1))
ax.xaxis.set_minor_formatter(mdates.DateFormatter('%m'))
# Plot
ax.set_title(title, color=border_color)
ax.plot(x, y, color='#d4b595')
if freq:
# Color below curve
ax.fill_between(x, y, color='#d4b595')
# Plotting single data line
if isinstance(y[0], float):
ax.plot(x, y, color='#d4b595')
if freq:
# Color below curve
ax.fill_between(x, y, color='#d4b595')
else:
ax.axhline(1, color=border_color)
ax.set_ylim(0, 2)
# Plotting multiple data lines
else:
ax.axhline(1, color=border_color)
# # Color by y-value
# upper = 1.5
# lower = 0.5
#
# y = np.array(y)
# y_up = np.ma.masked_where(y < upper, y)
# y_low = np.ma.masked_where(y > lower, y)
# y_middle = np.ma.masked_where((y < lower) | (y > upper), y)
#
# ax.plot(x, y_up, color='green')
# ax.plot(x, y_middle, color='yellow')
# ax.plot(x, y_low, color='red')
fig.set_size_inches(16, 9)
for y in y:
ax.plot(x, y)
if not freq:
ax.axhline(1, color=border_color)
ax.set_ylim(0, 2)
# Colors
ax.tick_params(color=border_color, labelcolor=border_color)
ax.tick_params(which='minor', color='#9d5800')
for spine in ax.spines.values():
spine.set_edgecolor(border_color)
@@ -432,7 +443,7 @@ def report_change_different_n(sample: Sample) -> None:
:param sample: Sample
:return: None
"""
for n in range(1, 15, 3):
for n in range(5, 16, 5):
graph_line_plot(sample.dates, sample.date_pops, f'change/n/{n}.png',
f'COVID-posting popularity ratio over time for {sample.name} IIR(n={n})',
False, n)
@@ -471,6 +482,11 @@ def report_all() -> None:
report_change_graphs(s)
report_change_different_n(samples[0])
graph_line_plot(samples[0].dates, [s.date_pops for s in samples], 'change/comb/pop.png',
'COVID-posting popularity ratio over time for all samples - IIR(10)', False, 10)
graph_line_plot(samples[0].dates, [s.date_freqs for s in samples], 'change/comb/freq.png',
'COVID-posting frequency over time for all samples - IIR(10)', True, 10)
if __name__ == '__main__':
report_all()
+33 -4
View File
@@ -83,14 +83,43 @@ $$ \text{freqs}_i = \frac{|\text{COVID-posts on date}_{i}|}{|\text{All posts on
$$ \text{pops}_i = \frac{ \sum_{u \in \text{Users}} \left(\frac{\sum\text{Popularity of u's COVID-posts on date}_i}{(\text{Average popularity of all u's posts}) \cdot |\text{u's COVID-posts on date}_i|}\right)}{(\text{Number of users posted on date}_i)} $$
</blockquote>
After calculation, we decided to plot line charts of `freqs` or `pops` against `dates`. When we plotted the graph without a filter, we found that it was very noisy, as shown in the first graph below. So, we experimented with different filters from the `scipy` library and different parameter values, and chose to use an IIR filter with `n = 10`.
After calculation, we decided to plot line charts of `freqs` or `pops` against `dates`. Initially, we saw graphs with very high peaks, such as the graph below. After some investigation, we found that these peaks are caused by not having enough tweets on each day to average out the random error of a single popular tweet. For example, for the graph below, we adjusted the program to print each user's popularity ratio whenever the average popularity ratio for a date was greater than 20, which produced the output on the right. As it turns out, on 2020-07-11, the user @juniorbachchan posted that he and his father had tested positive, and that single post is 163.84 times more popular than the average of all his posts. (The post is linked [here](https://twitter.com/juniorbachchan/status/1282018653215395840); it has 235k likes, 25k comments, and 32k retweets.) Even though these data points are outliers, there isn't an effective way of removing them, since we don't have enough tweet data from each user to calculate their range (for example, someone's COVID-related post might be the only one they've posted). So, we decided to limit the viewing window to `y = [0, 2]`, as shown in the graph on the right.
<div class="image-row">
<div><img src="/change/n/1.png" alt="hist"></div>
<div><img src="/change/n/4.png" alt="hist"></div>
<div><img src="/change/n/10.png" alt="hist"></div>
<div><img src="resources/peak-1.png" alt="graph"></div>
<div style="display: flex; flex-direction: column; justify-content: center"><pre>
Date: 2020-07-11
- JoeBiden 1.36
<span class="highlight">- juniorbachchan 163.84</span>
- victoriabeckham 0.80
- anandmahindra 7.66
- gucci 0.13
- StephenKing 0.61
</pre></div>
<div><img src="resources/peak-2.png" alt="graph"></div>
</div>
Then, we encountered the issue of noise. When we plotted the graph without a filter, we found that it was very noisy, so we decided to average the results over 7 days. We also experimented with different filters from the `scipy` library and different parameter values, and chose to use an IIR filter with `n = 10`.
<div class="image-row">
<div><img src="/change/n/5.png" alt="graph"></div>
<div><img src="/change/n/10.png" alt="graph"></div>
<div><img src="/change/n/15.png" alt="graph"></div>
</div>
## Results - Posting Frequency
We graphed the posting frequencies of our three samples in line graphs with the x-axis being the date, which gave us the following graphs:
<div class="image-row">
<div><img src="/change/freq/500-pop.png" alt="graph"></div>
<div><img src="/change/freq/500-rand.png" alt="graph"></div>
<div><img src="/change/freq/eng-news.png" alt="graph"></div>
</div>
For all three samples, the posting rates were almost zero during the first month when COVID-19 first started, which is expected because no one knew how devastating it would be at that time. Then, all three samples had a peak in posting frequencies from
For `500-rand` and `eng-news`,
**_TODO_**
Binary file not shown.

After

Width:  |  Height:  |  Size: 28 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 60 KiB

+4
View File
@@ -30,6 +30,10 @@ a {
text-decoration: none;
}
/* Gives any <span class="highlight"> a beige background so the marked text stands out. */
span.highlight {
background-color: beige;
}
table {
border-collapse: collapse;
border: 2px solid #5b3300;