diff --git a/src/process/twitter_visualization.py b/src/process/twitter_visualization.py index a52a3f9..231c6dc 100644 --- a/src/process/twitter_visualization.py +++ b/src/process/twitter_visualization.py @@ -32,11 +32,16 @@ class Sample: user_freqs: list[UserFloat] # Total popularity ratios of all posts for each user across all dates (sorted) user_pops: list[UserFloat] - # Tweets by all users in a sample (always sorted by date) - tweets: list[Posting] + # user_all_pop_avg[user] = Average popularity of all posts by {user} + user_all_pop_avg: dict[str, float] + # Average popularity of COVID tweets by a specific user on a specific date + # user_date_covid_pop_avg[user][date] = Average popularity of COVID-posts by {user} on {date} + user_date_covid_pop_avg: dict[str, dict[str, float]] + # date_covid_freq[date] = Fraction of all users' posts on {date} that are COVID-related + date_covid_freq: dict[str, float] # dates[i] = The i-th day since the first tweet dates: list[datetime] - # date_freqs[i] = Total frequency of all posts from all users in this sample on date[i] + # date_freqs[i] = COVID frequency of all posts from all users in this sample on date[i] date_freqs: list[float] # date_pops[i] = Average popularity ratio of all posts from all users in this sample on date[i] date_pops: list[float] @@ -67,11 +72,17 @@ class Sample: To prevent divide-by-zero, we ignored everyone who didn't post about covid and who didn't post at all. 
+ + Precondition: + - Downloaded tweets data are sorted by date """ debug(f'Calculating sample tweets data for {self.name}...') popularity = [] frequency = [] - all_tweets: list[Posting] = [] + date_covid_count = dict() + date_all_count = dict() + self.user_all_pop_avg = dict() + self.user_date_covid_pop_avg = dict() for i in range(len(self.users)): u = self.users[i] @@ -81,45 +92,72 @@ class Sample: # Load processed tweet tweets = load_tweets(u) - # Ignore retweets - tweets = [t for t in tweets if not t.repost] - all_tweets += tweets + # Ignore retweets, and ignore tweets that are earlier than the start of COVID + tweets = [t for t in tweets if not t.repost and t.date > '2020-01-01T01:01:01'] # Filter covid tweets covid = [t for t in tweets if t.covid_related] # To prevent divide by zero, ignore people who didn't post at all if len(tweets) == 0: + frequency.append(UserFloat(u, 0)) continue # Calculate the frequency of COVID-related tweets freq = len(covid) / len(tweets) frequency.append(UserFloat(u, freq)) + # Calculate date fields + # Assume tweets are sorted + # tweets.sort(key=lambda x: x.date) + # Calculate popularity by date + date_cp_sum = dict() + date_cp_count = dict() + for t in tweets: + d = t.date[:10] + + # For covid popularity on date + if t.covid_related: + if d not in date_cp_sum: + date_cp_sum[d] = 0 + date_cp_count[d] = 0 + date_cp_sum[d] += t.popularity + date_cp_count[d] += 1 + + # For frequency on date + if d not in date_covid_count: + date_covid_count[d] = 0 + date_all_count[d] = 0 + if t.covid_related: + date_covid_count[d] += 1 + date_all_count[d] += 1 + + self.user_date_covid_pop_avg[u] = \ + {d: date_cp_sum[d] / date_cp_count[d] for d in date_cp_sum} + + # Calculate total popularity ratio for a user # To prevent divide by zero, ignore everyone who didn't post about covid if len(covid) == 0: continue # Get the average popularity for COVID-related tweets - covid_avg = sum(t.popularity for t in covid) / len(covid) - global_avg = 
sum(t.popularity for t in tweets) / len(tweets) + covid_pop_avg = sum(t.popularity for t in covid) / len(covid) + all_pop_avg = sum(t.popularity for t in tweets) / len(tweets) + # Save global_avg + self.user_all_pop_avg[u] = all_pop_avg # To prevent divide by zero, ignore everyone who literally have no likes on any post - if global_avg == 0: + if all_pop_avg == 0: continue # Get the relative popularity - popularity.append(UserFloat(u, covid_avg / global_avg)) + popularity.append(UserFloat(u, covid_pop_avg / all_pop_avg)) + + # Calculate frequency on date + self.date_covid_freq = {d: date_covid_count[d] / date_all_count[d] for d in date_covid_count} # Sort by relative popularity or frequency popularity.sort(key=lambda x: x.data, reverse=True) frequency.sort(key=lambda x: x.data, reverse=True) - # Sort by date, latest first - all_tweets.sort(key=lambda x: x.date) - - # Ignore tweets that are earlier than the start of COVID - all_tweets = [t for t in all_tweets if t.date > '2020-01-01T01:01:01'] - # Assign to sample self.user_freqs = frequency self.user_pops = popularity - self.tweets = all_tweets debug('- Done.') def calculate_change_data(self) -> None: @@ -136,41 +174,38 @@ class Sample: :return: None """ - # List indicies are days since the first tweet - covid_count = [0] - covid_popularity = [0] - all_count = [0] - all_popularity = [0] - current_date = self.tweets[0][:10] - i = 0 + self.dates = [] + self.date_freqs = [] + self.date_pops = [] - # Loop through all tweets - for tweet in self.tweets: - # Move on to the next date - tweet_date = tweet.date[:10] - if tweet_date != current_date: - current_date = tweet_date - covid_count.append(0) - covid_popularity.append(0) - all_count.append(0) - all_popularity.append(0) - i += 1 + # Loop through all dates from the start of COVID to when the data is obtained + for (ds, dt) in daterange('2020-01-01', '2021-11-25'): + self.dates.append(dt) - # Add current tweet data - all_count[i] += 1 - all_popularity[i] += 
tweet.popularity - if tweet.covid_related: - covid_count[i] += 1 - covid_popularity[i] += tweet.popularity + # Convert date covid freq format + if ds in self.date_covid_freq: + self.date_freqs.append(self.date_covid_freq[ds]) + else: + self.date_freqs.append(0) - # Calculate frequency and popularity ratio for each date, which will be our y-axis - self.date_freqs = divide_zeros(covid_count, all_count) - self.date_pops = divide_zeros(divide_zeros(covid_popularity, covid_count), - divide_zeros(all_popularity, all_count)) + # Calculate date covid popularity ratio + users_posted_today = [u for u in self.users if u in self.user_date_covid_pop_avg and + ds in self.user_date_covid_pop_avg[u]] + if len(users_posted_today) != 0: + user_pop_ratio_sum = sum(self.user_date_covid_pop_avg[u][ds] / + self.user_all_pop_avg[u] for u in users_posted_today + if self.user_all_pop_avg[u] != 0) + pops_i = user_pop_ratio_sum / len(users_posted_today) - # Convert indicies to dates, which will be our x-axis - first_date = parse_date(self.tweets[0].date).replace(hour=0, minute=0, second=0) - self.dates = [first_date + timedelta(days=j) for j in range(len(all_count))] + if pops_i > 20: + print('Date: ', ds) + for u in users_posted_today: + if self.user_all_pop_avg[u] != 0: + print('-', u, self.user_date_covid_pop_avg[u][ds] / + self.user_all_pop_avg[u]) + else: + pops_i = 1 + self.date_pops.append(pops_i) def load_samples() -> list[Sample]: @@ -215,7 +250,7 @@ def report_ignored(samples: list[Sample]) -> None: :return: None """ # For frequencies, report who didn't post - table = [["Total users"] + [str(len(s.user_freqs)) for s in samples], + table = [["Total users"] + [str(len(s.users)) for s in samples], ["Users who didn't post at all"] + [str(len([1 for a in s.user_freqs if a.data == 0])) for s in samples], ["Users who posted less than 1%"] + @@ -390,16 +425,6 @@ def report_stats(samples: list[Sample]) -> None: Reporter('freq/stats.md').table(table, [s.name for s in samples], True) -def 
view_covid_tweets_date(tweets: list[Posting]): - # Graph histogram - plt.title(f'COVID posting dates') - plt.xticks(rotation=45) - plt.yticks(rotation=45) - plt.tight_layout() - plt.hist([parse_date(t.date) for t in tweets if t.covid_related], bins=40, color='#ffcccc') - plt.show() - - def report_change_different_n(sample: Sample) -> None: """ Experiment wth different n values for IIR filter @@ -420,6 +445,7 @@ def report_change_graphs(sample: Sample) -> None: graph_line_plot(sample.dates, sample.date_freqs, f'change/freq/{sample.name}.png', f'COVID-posting frequency over time for {sample.name} IIR(10)', True, 10) + print(sum(sample.date_pops) / len(sample.dates)) def report_all() -> None: diff --git a/src/report/report_document.md b/src/report/report_document.md index e69f948..558dba0 100644 --- a/src/report/report_document.md +++ b/src/report/report_document.md @@ -12,7 +12,7 @@ Our data come from three samples: ## COVID-19 Posting Frequency -First, we analyzed how frequently the users in these three datasets are posing about COVID-19. Initially, we were expecting that most people will post about COVID-19 because this pandemic is very relevant to every one of us. However, we found that there are many people in our samples didn't post about COVID-19 at all. The following table shows how many people in each sample didn't post or posted less than 1% about COVID-19: +First, we analyzed how frequently the users in these three datasets are posting about COVID-19 (ignoring retweets). Initially, we were expecting that most people will post about COVID-19 because this pandemic is very relevant to every one of us. However, we found that many people in our samples didn't post about COVID-19 at all. The following table shows how many people in each sample didn't post or posted less than 1% about COVID-19: @include `/freq/didnt-post.md` @@ -80,7 +80,7 @@ $$ \text{freqs}_i = \frac{|\text{COVID-posts on date}_{i}|}{|\text{All posts on date}_{i}|} $$
-$$ \text{pops}_i = \left(\frac{\sum\text{Popularity of COVID-posts on date}_i}{|\text{COVID-posts on date}_i|}\right) / \left(\frac{\sum \text{Popularity of all posts on date}_i}{|\text{All posts on date}_i|}\right) $$ +$$ \text{pops}_i = \frac{ \sum_{u \in \text{Users}} \left(\frac{\sum\text{Popularity of u's COVID-posts on date}_i}{(\text{Average popularity of all u's posts}) \cdot |\text{u's COVID-posts on date}_i|}\right)}{(\text{Number of users who posted about COVID on date}_i)} $$
After calculation, we decided to plot line charts of `freqs` or `pops` against `dates`. When we plot the graph without a filter, we found that the graph is actually very noisy as shown in the first graph below. So, we experimented with different filters from the `scipy` library and different parameter values, and chose to use an IIR filter with `n = 10`. diff --git a/src/utils.py b/src/utils.py index 371277e..c6a4af0 100644 --- a/src/utils.py +++ b/src/utils.py @@ -4,9 +4,9 @@ import json import os import statistics from dataclasses import dataclass -from datetime import datetime, date +from datetime import datetime, date, timedelta from pathlib import Path -from typing import Union, NamedTuple, Any +from typing import Union, NamedTuple, Any, Generator import json5 import numpy as np @@ -230,7 +230,7 @@ def tabulate_stats(stats: list[Stats], percent: bool = False) -> list[list[str]] ] -def parse_date(iso: str) -> datetime: +def parse_date_time(iso: str) -> datetime: """ Parse date faster. Running 1,000,000 trials, this parse_date function is 4.03 times faster than python's built-in dateutil.parser.isoparse() function. @@ -246,6 +246,34 @@ def parse_date(iso: str) -> datetime: int(iso[11:13]), int(iso[14:16]), int(iso[17:19])) +def parse_date_only(iso: str) -> datetime: + """ + Parse date faster. + + Preconditions: + - iso is in the format of "YYYY-MM-DD" (e.g. "2021-10-20") + - iso is a valid date (this function does not check for the validity of the input) + + :param iso: Input date + :return: Datetime object + """ + return datetime(int(iso[:4]), int(iso[5:7]), int(iso[8:10])) + + +def daterange(start_date: str, end_date: str) -> Generator[tuple[str, datetime], None, None]: + """ + Date range for looping + + :param start_date: Start date in "YYYY-MM-DD" format + :param end_date: End date in "YYYY-MM-DD" format + :return: Generator for looping through the dates one day at a time. 
+ """ + start = parse_date_only(start_date) + for n in range(int((parse_date_only(end_date) - start).days)): + dt = start + timedelta(n) + yield dt.strftime('%Y-%m-%d'), dt + + def divide_zeros(numerator: list[float], denominator: list[float]) -> list[float]: """ Divide two lists of floats, ignoring zeros (anything dividing by zero will produce zero)