[U] Update formula
This commit is contained in:
@@ -32,11 +32,16 @@ class Sample:
|
||||
user_freqs: list[UserFloat]
|
||||
# Total popularity ratios of all posts for each user across all dates (sorted)
|
||||
user_pops: list[UserFloat]
|
||||
# Tweets by all users in a sample (always sorted by date)
|
||||
tweets: list[Posting]
|
||||
# Average popularity of all u's posts
|
||||
user_all_pop_avg: dict[str, float]
|
||||
# Average popularity of COVID tweets by a specific user on a specific date
|
||||
# user_covid_tweets_pop[user][date] = Average popularity of COVID-posts by {user} on {date}
|
||||
user_date_covid_pop_avg: dict[str, dict[str, float]]
|
||||
# Total COVID-tweets frequency on a specific date for all users.
|
||||
date_covid_freq: dict[str, float]
|
||||
# dates[i] = The i-th day since the first tweet
|
||||
dates: list[datetime]
|
||||
# date_freqs[i] = Total frequency of all posts from all users in this sample on date[i]
|
||||
# date_freqs[i] = COVID frequency of all posts from all users in this sample on date[i]
|
||||
date_freqs: list[float]
|
||||
# date_pops[i] = Average popularity ratio of all posts from all users in this sample on date[i]
|
||||
date_pops: list[float]
|
||||
@@ -67,11 +72,17 @@ class Sample:
|
||||
|
||||
To prevent divide-by-zero, we ignored everyone who didn't post about covid and who didn't
|
||||
post at all.
|
||||
|
||||
Precondition:
|
||||
- Downloaded tweets data are sorted by date
|
||||
"""
|
||||
debug(f'Calculating sample tweets data for {self.name}...')
|
||||
popularity = []
|
||||
frequency = []
|
||||
all_tweets: list[Posting] = []
|
||||
date_covid_count = dict()
|
||||
date_all_count = dict()
|
||||
self.user_all_pop_avg = dict()
|
||||
self.user_date_covid_pop_avg = dict()
|
||||
for i in range(len(self.users)):
|
||||
u = self.users[i]
|
||||
|
||||
@@ -81,45 +92,72 @@ class Sample:
|
||||
|
||||
# Load processed tweet
|
||||
tweets = load_tweets(u)
|
||||
# Ignore retweets
|
||||
tweets = [t for t in tweets if not t.repost]
|
||||
all_tweets += tweets
|
||||
# Ignore retweets, and ignore tweets that are earlier than the start of COVID
|
||||
tweets = [t for t in tweets if not t.repost and t.date > '2020-01-01T01:01:01']
|
||||
# Filter covid tweets
|
||||
covid = [t for t in tweets if t.covid_related]
|
||||
|
||||
# To prevent divide by zero, ignore people who didn't post at all
|
||||
if len(tweets) == 0:
|
||||
frequency.append(UserFloat(u, 0))
|
||||
continue
|
||||
# Calculate the frequency of COVID-related tweets
|
||||
freq = len(covid) / len(tweets)
|
||||
frequency.append(UserFloat(u, freq))
|
||||
|
||||
# Calculate date fields
|
||||
# Assume tweets are sorted
|
||||
# tweets.sort(key=lambda x: x.date)
|
||||
# Calculate popularity by date
|
||||
date_cp_sum = dict()
|
||||
date_cp_count = dict()
|
||||
for t in tweets:
|
||||
d = t.date[:10]
|
||||
|
||||
# For covid popularity on date
|
||||
if t.covid_related:
|
||||
if d not in date_cp_sum:
|
||||
date_cp_sum[d] = 0
|
||||
date_cp_count[d] = 0
|
||||
date_cp_sum[d] += t.popularity
|
||||
date_cp_count[d] += 1
|
||||
|
||||
# For frequency on date
|
||||
if d not in date_covid_count:
|
||||
date_covid_count[d] = 0
|
||||
date_all_count[d] = 0
|
||||
if t.covid_related:
|
||||
date_covid_count[d] += 1
|
||||
date_all_count[d] += 1
|
||||
|
||||
self.user_date_covid_pop_avg[u] = \
|
||||
{d: date_cp_sum[d] / date_cp_count[d] for d in date_cp_sum}
|
||||
|
||||
# Calculate total popularity ratio for a user
|
||||
# To prevent divide by zero, ignore everyone who didn't post about covid
|
||||
if len(covid) == 0:
|
||||
continue
|
||||
# Get the average popularity for COVID-related tweets
|
||||
covid_avg = sum(t.popularity for t in covid) / len(covid)
|
||||
global_avg = sum(t.popularity for t in tweets) / len(tweets)
|
||||
covid_pop_avg = sum(t.popularity for t in covid) / len(covid)
|
||||
all_pop_avg = sum(t.popularity for t in tweets) / len(tweets)
|
||||
# Save global_avg
|
||||
self.user_all_pop_avg[u] = all_pop_avg
|
||||
# To prevent divide by zero, ignore everyone who literally have no likes on any post
|
||||
if global_avg == 0:
|
||||
if all_pop_avg == 0:
|
||||
continue
|
||||
# Get the relative popularity
|
||||
popularity.append(UserFloat(u, covid_avg / global_avg))
|
||||
popularity.append(UserFloat(u, covid_pop_avg / all_pop_avg))
|
||||
|
||||
# Calculate frequency on date
|
||||
self.date_covid_freq = {d: date_covid_count[d] / date_all_count[d] for d in date_covid_count}
|
||||
|
||||
# Sort by relative popularity or frequency
|
||||
popularity.sort(key=lambda x: x.data, reverse=True)
|
||||
frequency.sort(key=lambda x: x.data, reverse=True)
|
||||
|
||||
# Sort by date, latest first
|
||||
all_tweets.sort(key=lambda x: x.date)
|
||||
|
||||
# Ignore tweets that are earlier than the start of COVID
|
||||
all_tweets = [t for t in all_tweets if t.date > '2020-01-01T01:01:01']
|
||||
|
||||
# Assign to sample
|
||||
self.user_freqs = frequency
|
||||
self.user_pops = popularity
|
||||
self.tweets = all_tweets
|
||||
debug('- Done.')
|
||||
|
||||
def calculate_change_data(self) -> None:
|
||||
@@ -136,41 +174,38 @@ class Sample:
|
||||
|
||||
:return: None
|
||||
"""
|
||||
# List indicies are days since the first tweet
|
||||
covid_count = [0]
|
||||
covid_popularity = [0]
|
||||
all_count = [0]
|
||||
all_popularity = [0]
|
||||
current_date = self.tweets[0][:10]
|
||||
i = 0
|
||||
self.dates = []
|
||||
self.date_freqs = []
|
||||
self.date_pops = []
|
||||
|
||||
# Loop through all tweets
|
||||
for tweet in self.tweets:
|
||||
# Move on to the next date
|
||||
tweet_date = tweet.date[:10]
|
||||
if tweet_date != current_date:
|
||||
current_date = tweet_date
|
||||
covid_count.append(0)
|
||||
covid_popularity.append(0)
|
||||
all_count.append(0)
|
||||
all_popularity.append(0)
|
||||
i += 1
|
||||
# Loop through all dates from the start of COVID to when the data is obtained
|
||||
for (ds, dt) in daterange('2020-01-01', '2021-11-25'):
|
||||
self.dates.append(dt)
|
||||
|
||||
# Add current tweet data
|
||||
all_count[i] += 1
|
||||
all_popularity[i] += tweet.popularity
|
||||
if tweet.covid_related:
|
||||
covid_count[i] += 1
|
||||
covid_popularity[i] += tweet.popularity
|
||||
# Convert date covid freq format
|
||||
if ds in self.date_covid_freq:
|
||||
self.date_freqs.append(self.date_covid_freq[ds])
|
||||
else:
|
||||
self.date_freqs.append(0)
|
||||
|
||||
# Calculate frequency and popularity ratio for each date, which will be our y-axis
|
||||
self.date_freqs = divide_zeros(covid_count, all_count)
|
||||
self.date_pops = divide_zeros(divide_zeros(covid_popularity, covid_count),
|
||||
divide_zeros(all_popularity, all_count))
|
||||
# Calculate date covid popularity ratio
|
||||
users_posted_today = [u for u in self.users if u in self.user_date_covid_pop_avg and
|
||||
ds in self.user_date_covid_pop_avg[u]]
|
||||
if len(users_posted_today) != 0:
|
||||
user_pop_ratio_sum = sum(self.user_date_covid_pop_avg[u][ds] /
|
||||
self.user_all_pop_avg[u] for u in users_posted_today
|
||||
if self.user_all_pop_avg[u] != 0)
|
||||
pops_i = user_pop_ratio_sum / len(users_posted_today)
|
||||
|
||||
# Convert indicies to dates, which will be our x-axis
|
||||
first_date = parse_date(self.tweets[0].date).replace(hour=0, minute=0, second=0)
|
||||
self.dates = [first_date + timedelta(days=j) for j in range(len(all_count))]
|
||||
if pops_i > 20:
|
||||
print('Date: ', ds)
|
||||
for u in users_posted_today:
|
||||
if self.user_all_pop_avg[u] != 0:
|
||||
print('-', u, self.user_date_covid_pop_avg[u][ds] /
|
||||
self.user_all_pop_avg[u])
|
||||
else:
|
||||
pops_i = 1
|
||||
self.date_pops.append(pops_i)
|
||||
|
||||
|
||||
def load_samples() -> list[Sample]:
|
||||
@@ -215,7 +250,7 @@ def report_ignored(samples: list[Sample]) -> None:
|
||||
:return: None
|
||||
"""
|
||||
# For frequencies, report who didn't post
|
||||
table = [["Total users"] + [str(len(s.user_freqs)) for s in samples],
|
||||
table = [["Total users"] + [str(len(s.users)) for s in samples],
|
||||
["Users who didn't post at all"] +
|
||||
[str(len([1 for a in s.user_freqs if a.data == 0])) for s in samples],
|
||||
["Users who posted less than 1%"] +
|
||||
@@ -390,16 +425,6 @@ def report_stats(samples: list[Sample]) -> None:
|
||||
Reporter('freq/stats.md').table(table, [s.name for s in samples], True)
|
||||
|
||||
|
||||
def view_covid_tweets_date(tweets: list[Posting]) -> None:
    """
    Show a histogram of the posting dates of COVID-related tweets.

    Only postings whose covid_related flag is truthy are counted; their date strings are
    converted with parse_date and grouped into 40 bins.

    :param tweets: Postings to inspect
    :return: None
    """
    covid_dates = [parse_date(t.date) for t in tweets if t.covid_related]

    # Graph histogram
    plt.title('COVID posting dates')
    plt.xticks(rotation=45)
    plt.yticks(rotation=45)
    plt.hist(covid_dates, bins=40, color='#ffcccc')
    # Fit the layout only after the histogram exists, so the spacing is computed from the
    # real (rotated) tick labels instead of the placeholder ticks of an empty axes.
    plt.tight_layout()
    plt.show()
|
||||
|
||||
|
||||
def report_change_different_n(sample: Sample) -> None:
|
||||
"""
|
||||
Experiment wth different n values for IIR filter
|
||||
@@ -420,6 +445,7 @@ def report_change_graphs(sample: Sample) -> None:
|
||||
graph_line_plot(sample.dates, sample.date_freqs, f'change/freq/{sample.name}.png',
|
||||
f'COVID-posting frequency over time for {sample.name} IIR(10)',
|
||||
True, 10)
|
||||
print(sum(sample.date_pops) / len(sample.dates))
|
||||
|
||||
|
||||
def report_all() -> None:
|
||||
|
||||
@@ -12,7 +12,7 @@ Our data come from three samples:
|
||||
|
||||
## COVID-19 Posting Frequency
|
||||
|
||||
First, we analyzed how frequently the users in these three datasets are posting about COVID-19. Initially, we expected that most people would post about COVID-19 because this pandemic is very relevant to every one of us. However, we found that many people in our samples didn't post about COVID-19 at all. The following table shows how many people in each sample didn't post or posted less than 1% about COVID-19:
|
||||
First, we analyzed how frequently the users in these three datasets are posting about COVID-19 (ignoring retweets). Initially, we expected that most people would post about COVID-19 because this pandemic is very relevant to every one of us. However, we found that many people in our samples didn't post about COVID-19 at all. The following table shows how many people in each sample didn't post or posted less than 1% about COVID-19:
|
||||
|
||||
@include `/freq/didnt-post.md`
|
||||
|
||||
@@ -80,7 +80,7 @@ $$ \text{freqs}_i = \frac{|\text{COVID-posts on date}_{i}|}{|\text{All posts on
|
||||
</blockquote>
|
||||
|
||||
<blockquote>
|
||||
$$ \text{pops}_i = \left(\frac{\sum\text{Popularity of COVID-posts on date}_i}{|\text{COVID-posts on date}_i|}\right) / \left(\frac{\sum \text{Popularity of all posts on date}_i}{|\text{All posts on date}_i|}\right) $$
|
||||
$$ \text{pops}_i = \frac{ \sum_{u \in \text{Users}} \left(\frac{\sum\text{Popularity of u's COVID-posts on date}_i}{(\text{Average popularity of all u's posts}) \cdot |\text{u's COVID-posts on date}_i|}\right)}{(\text{Number of users posted on date}_i)} $$
|
||||
</blockquote>
|
||||
|
||||
After calculation, we decided to plot line charts of `freqs` and `pops` against `dates`. When we plotted the graphs without a filter, we found that they were very noisy, as shown in the first graph below. So, we experimented with different filters from the `scipy` library and different parameter values, and chose to use an IIR filter with `n = 10`.
|
||||
|
||||
+31
-3
@@ -4,9 +4,9 @@ import json
|
||||
import os
|
||||
import statistics
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, date
|
||||
from datetime import datetime, date, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Union, NamedTuple, Any
|
||||
from typing import Union, NamedTuple, Any, Generator
|
||||
|
||||
import json5
|
||||
import numpy as np
|
||||
@@ -230,7 +230,7 @@ def tabulate_stats(stats: list[Stats], percent: bool = False) -> list[list[str]]
|
||||
]
|
||||
|
||||
|
||||
def parse_date(iso: str) -> datetime:
|
||||
def parse_date_time(iso: str) -> datetime:
|
||||
"""
|
||||
Parse date faster. Running 1,000,000 trials, this parse_date function is 4.03 times faster than
|
||||
python's built-in dateutil.parser.isoparse() function.
|
||||
@@ -246,6 +246,34 @@ def parse_date(iso: str) -> datetime:
|
||||
int(iso[11:13]), int(iso[14:16]), int(iso[17:19]))
|
||||
|
||||
|
||||
def parse_date_only(iso: str) -> datetime:
    """
    Build a datetime (at midnight) from the date part of an ISO-style string by slicing fixed
    character positions instead of running a general-purpose parser.

    Preconditions:
      - iso starts with a date in the format of "YYYY-MM-DD" (e.g. "2021-10-20")
      - iso is a valid date (this function does not check for the validity of the input)

    :param iso: Input date
    :return: Datetime object with the time fields left at zero
    """
    # Fixed offsets: year = chars 0-3, month = chars 5-6, day = chars 8-9.
    # The separator characters at positions 4 and 7 are never inspected.
    year, month, day = (int(iso[lo:hi]) for lo, hi in ((0, 4), (5, 7), (8, 10)))
    return datetime(year, month, day)
|
||||
|
||||
|
||||
def daterange(start_date: str, end_date: str) -> Generator[tuple[str, datetime], None, None]:
    """
    Iterate over consecutive days, yielding each day both as a string and as a datetime.

    The range is half-open: start_date is included and end_date is excluded. Nothing is
    yielded when end_date is not later than start_date.

    :param start_date: First date to yield, in "YYYY-MM-DD" format
    :param end_date: Date to stop before, in "YYYY-MM-DD" format
    :return: Generator of ("YYYY-MM-DD" string, datetime at midnight) pairs, one day at a time.
    """
    current = parse_date_only(start_date)
    stop = parse_date_only(end_date)
    one_day = timedelta(days=1)

    while current < stop:
        yield current.strftime('%Y-%m-%d'), current
        # datetime is immutable, so this rebinds to a new object and leaves the yielded one intact
        current += one_day
|
||||
|
||||
|
||||
def divide_zeros(numerator: list[float], denominator: list[float]) -> list[float]:
|
||||
"""
|
||||
Divide two lists of floats, ignoring zeros (anything dividing by zero will produce zero)
|
||||
|
||||
Reference in New Issue
Block a user