[O] Change analysis structure

This commit is contained in:
Hykilpikonna
2021-11-24 21:28:57 -05:00
parent 44f3dcb9d2
commit 3c2ad5462a
+71 -32
View File
@@ -3,6 +3,7 @@ TODO: Module Docstring
"""
import statistics
from typing import Any
from dataclasses import dataclass, field
from matplotlib import pyplot as plt
from tabulate import tabulate
@@ -39,6 +40,28 @@ class Reporter:
write(self.file, self.report)
@dataclass()
class UserFloat:
"""
Model for which a floating point data is assigned to each user
This is used for both COVID tweet frequency and popularity ratio data, because both of these
are floating point data.
"""
username: str
data: float
@dataclass()
class UserSample:
name: str
users: list[str]
frequencies: list[UserFloat] = field(default_factory=list)
popularity_ratios: list[UserFloat] = field(default_factory=list)
# Tweets by all users in a sample
tweets: list[Posting] = field(default_factory=list)
def view_covid_tweets_freq(users: list[str],
sample_name: str) -> None:
"""
@@ -50,33 +73,21 @@ def view_covid_tweets_freq(users: list[str],
:param sample_name: Name of the sample
:return: None
"""
# Load tweets, and get the frequency of covid tweets for each user
user_frequency = []
for u in users:
# Load processed tweet
tweets = load_tweets(u)
# Get the frequency of COVID-related tweets
freq = len([1 for t in tweets if t.covid_related]) / len(tweets)
user_frequency.append((u, freq))
# Sort by frequency
user_frequency.sort(key=lambda x: x[1], reverse=True)
# Init reporter
r = Reporter(f'{REPORT_DIR}/1-covid-tweet-frequency/{sample_name}.md')
r = Reporter(f'{REPORT_DIR}/report.report.1-covid-tweet-frequency/{sample_name}.md')
r.print(f"In {sample_name} -")
# How many people didn't post about COVID?
r.print("How many people didn't post about COVID:",
len([a for a in user_frequency if a[1] == 0]))
r.print("How many people have less than 1% of their posts about COVID:",
len([a for a in user_frequency if a[1] <= 0.01]))
len([a for a in user_frequency if a[1] <= 0.01]))
r.print()
# Top 20
r.print(f"20 Users of who post COVID-related tweets most frequently:")
r.print(tabulate([[u[0], f'{u[1] * 100:.1f}%'] for u in user_frequency[:20]],
['Username', 'Frequency'], tablefmt="github"))
['Username', 'Frequency'], tablefmt="github"))
# Save report
r.save()
@@ -86,7 +97,7 @@ def view_covid_tweets_freq(users: list[str],
plt.xticks(rotation=90)
plt.tight_layout()
plt.hist([f[1] for f in user_frequency], bins=100, color='#ffcccc')
plt.savefig(f'{REPORT_DIR}/1-covid-tweet-frequency/{sample_name}.png')
plt.savefig(f'{REPORT_DIR}/report.report.1-covid-tweet-frequency/{sample_name}.png')
def view_covid_tweets_pop(users: list[str],
@@ -113,13 +124,13 @@ def view_covid_tweets_pop(users: list[str],
# How many people are ignored
r.print("To prevent division by zero, we ignored people who didn't post about COVID or didn't "
f"post at all. We ignored {len(users) - len(user_popularity)} people in this list.")
f"post at all. We ignored {len(users) - len(user_popularity)} people in this list.")
r.print()
# Top 20
r.print(f"20 Users of whose COVID-related posts are the most popular:")
r.print(tabulate([[u[0], f'{u[1]:.2f}'] for u in user_popularity[:20]],
['Username', 'Popularity Ratio'], tablefmt="github"))
['Username', 'Popularity Ratio'], tablefmt="github"))
r.print()
# Calculate statistics
@@ -152,39 +163,58 @@ def view_covid_tweets_pop(users: list[str],
plt.savefig(f'{REPORT_DIR}/2-covid-tweet-popularity/{sample_name}.png')
def load_covid_tweets_pop(users: list[str]):
def calculate_sample_data(users: list[str]) -> tuple[list[UserFloat], list[UserFloat], list[Posting]]:
"""
Helper function for view_covid_tweets_pop. This function loads and calculates relative
popularity of COVID posts by a list of users
This function loads and calculates the frequency that a list of user posts about COVID, and
also calculates their relative popularity of COVID posts.
This function also creates a combined list of all users in a sample
:param users: Users in a sample
:return: List of users and their relative popularity for COVID posts
:return: Frequencies, Popularity ratios, Combined tweets list for the sample
"""
user_popularity = []
popularity = []
frequency = []
all_tweets: list[Posting] = []
for u in users:
# Load processed tweet
tweets = load_tweets(u)
# Ignore retweets
tweets = [t for t in tweets if not t.repost]
all_tweets += tweets
# Filter covid tweets
covid = [t for t in tweets if t.covid_related]
# To prevent divide by zero, ignore everyone who didn't post about covid or who didn't post
# at all.
# To prevent divide by zero, ignore people who didn't post at all
if len(tweets) == 0:
continue
# Calculate the frequency of COVID-related tweets
freq = len(covid) / len(tweets)
frequency.append(UserFloat(u, freq))
# To prevent divide by zero, ignore everyone who didn't post about covid
if len(covid) == 0 or len(tweets) == 0:
continue
# Get the average popularity for COVID-related tweets
covid_avg = statistics.mean(t.popularity for t in covid)
global_avg = statistics.mean(t.popularity for t in tweets)
# Get the relative popularity
user_popularity.append((u, covid_avg / global_avg))
popularity.append(UserFloat(u, covid_avg / global_avg))
# Sort by relative popularity
user_popularity.sort(key=lambda x: x[1], reverse=True)
return user_popularity
# Sort by relative popularity or frequency
popularity.sort(key=lambda x: x[1], reverse=True)
frequency.sort(key=lambda x: x[1], reverse=True)
# Sort by date, latest first
all_tweets.sort(key=lambda x: x.date, reverse=True)
# Ignore tweets that are earlier than the start of COVID
all_tweets = [t for t in all_tweets if t.date > '2020-01-01T01:01:01']
return frequency, popularity, all_tweets
def view_covid_tweets_date(tweets: list[Posting]):
# Graph histogram
plt.title(f'COVID posting dates')
plt.xticks(rotation=45)
@@ -195,8 +225,17 @@ def view_covid_tweets_date(tweets: list[Posting]):
if __name__ == '__main__':
sample = load_user_sample()
view_covid_tweets_freq([u.username for u in sample.most_popular], '500-pop')
# Load sample, convert format
samples = load_user_sample()
samples = [UserSample('500-pop', [u.username for u in samples.most_popular]),
UserSample('500-rand', [u.username for u in samples.random]),
UserSample('eng-news', list(samples.english_news))]
# Calculate frequencies and popularity ratios
for s in samples:
s.frequencies, s.popularity_ratios, s.tweets = calculate_sample_data(s.users)
view_covid_tweets_freq([u.username for u in samples.most_popular], '500-pop')
# view_covid_tweets_freq(sample.random, '500-rand')
# view_covid_tweets_pop(sample.most_popular, '500-pop')
# view_covid_tweets_pop(sample.random, '500-rand')