From 3c2ad5462a124c7595e77b8a8ecd206ac227e1e7 Mon Sep 17 00:00:00 2001
From: Hykilpikonna <me@hydev.org>
Date: Wed, 24 Nov 2021 21:28:57 -0500
Subject: [PATCH] [O] Change analysis structure

---
 src/process/twitter_visualization.py | 103 ++++++++++++++++++---------
 1 file changed, 71 insertions(+), 32 deletions(-)

diff --git a/src/process/twitter_visualization.py b/src/process/twitter_visualization.py
index 390be15..7faa56e 100644
--- a/src/process/twitter_visualization.py
+++ b/src/process/twitter_visualization.py
@@ -3,6 +3,7 @@ TODO: Module Docstring
 """
 import statistics
 from typing import Any
+from dataclasses import dataclass, field
 
 from matplotlib import pyplot as plt
 from tabulate import tabulate
@@ -39,6 +40,28 @@ class Reporter:
         write(self.file, self.report)
 
 
+@dataclass()
+class UserFloat:
+    """
+    Model for which a floating point data is assigned to each user
+
+    This is used for both COVID tweet frequency and popularity ratio data, because both of these
+    are floating point data.
+    """
+    username: str
+    data: float
+
+
+@dataclass()
+class UserSample:
+    name: str
+    users: list[str]
+    frequencies: list[UserFloat] = field(default_factory=list)
+    popularity_ratios: list[UserFloat] = field(default_factory=list)
+    # Tweets by all users in a sample
+    tweets: list[Posting] = field(default_factory=list)
+
+
 def view_covid_tweets_freq(users: list[str],
                            sample_name: str) -> None:
     """
@@ -50,33 +73,21 @@ def view_covid_tweets_freq(users: list[str],
     :param sample_name: Name of the sample
     :return: None
     """
-    # Load tweets, and get the frequency of covid tweets for each user
-    user_frequency = []
-    for u in users:
-        # Load processed tweet
-        tweets = load_tweets(u)
-        # Get the frequency of COVID-related tweets
-        freq = len([1 for t in tweets if t.covid_related]) / len(tweets)
-        user_frequency.append((u, freq))
-
-    # Sort by frequency
-    user_frequency.sort(key=lambda x: x[1], reverse=True)
-
     # Init reporter
-    r = Reporter(f'{REPORT_DIR}/1-covid-tweet-frequency/{sample_name}.md')
+    r = Reporter(f'{REPORT_DIR}/report.report.1-covid-tweet-frequency/{sample_name}.md')
     r.print(f"In {sample_name} -")
 
     # How many people didn't post about COVID?
     r.print("How many people didn't post about COVID:",
             len([a for a in user_frequency if a[1] == 0]))
     r.print("How many people have less than 1% of their posts about COVID:",
-          len([a for a in user_frequency if a[1] <= 0.01]))
+            len([a for a in user_frequency if a[1] <= 0.01]))
     r.print()
 
     # Top 20
     r.print(f"20 Users of who post COVID-related tweets most frequently:")
     r.print(tabulate([[u[0], f'{u[1] * 100:.1f}%'] for u in user_frequency[:20]],
-                   ['Username', 'Frequency'], tablefmt="github"))
+                     ['Username', 'Frequency'], tablefmt="github"))
 
     # Save report
     r.save()
@@ -86,7 +97,7 @@ def view_covid_tweets_freq(users: list[str],
     plt.xticks(rotation=90)
     plt.tight_layout()
     plt.hist([f[1] for f in user_frequency], bins=100, color='#ffcccc')
-    plt.savefig(f'{REPORT_DIR}/1-covid-tweet-frequency/{sample_name}.png')
+    plt.savefig(f'{REPORT_DIR}/report.report.1-covid-tweet-frequency/{sample_name}.png')
 
 
 def view_covid_tweets_pop(users: list[str],
@@ -113,13 +124,13 @@ def view_covid_tweets_pop(users: list[str],
 
     # How many people are ignored
     r.print("To prevent division by zero, we ignored people who didn't post about COVID or didn't "
-          f"post at all. We ignored {len(users) - len(user_popularity)} people in this list.")
+            f"post at all. We ignored {len(users) - len(user_popularity)} people in this list.")
     r.print()
 
     # Top 20
     r.print(f"20 Users of whose COVID-related posts are the most popular:")
     r.print(tabulate([[u[0], f'{u[1]:.2f}'] for u in user_popularity[:20]],
-                   ['Username', 'Popularity Ratio'], tablefmt="github"))
+                     ['Username', 'Popularity Ratio'], tablefmt="github"))
     r.print()
 
     # Calculate statistics
@@ -152,39 +163,58 @@ def view_covid_tweets_pop(users: list[str],
     plt.savefig(f'{REPORT_DIR}/2-covid-tweet-popularity/{sample_name}.png')
 
 
-def load_covid_tweets_pop(users: list[str]):
+def calculate_sample_data(users: list[str]) -> tuple[list[UserFloat], list[UserFloat], list[Posting]]:
     """
-    Helper function for view_covid_tweets_pop. This function loads and calculates relative
-    popularity of COVID posts by a list of users
+    This function loads and calculates the frequency that a list of user posts about COVID, and
+    also calculates their relative popularity of COVID posts.
+
+    This function also creates a combined list of all users in a sample
 
     :param users: Users in a sample
-    :return: List of users and their relative popularity for COVID posts
+    :return: Frequencies, Popularity ratios, Combined tweets list for the sample
     """
-    user_popularity = []
+    popularity = []
+    frequency = []
+    all_tweets: list[Posting] = []
     for u in users:
         # Load processed tweet
         tweets = load_tweets(u)
         # Ignore retweets
         tweets = [t for t in tweets if not t.repost]
+        all_tweets += tweets
         # Filter covid tweets
         covid = [t for t in tweets if t.covid_related]
-        # To prevent divide by zero, ignore everyone who didn't post about covid or who didn't post
-        # at all.
+
+        # To prevent divide by zero, ignore people who didn't post at all
+        if len(tweets) == 0:
+            continue
+        # Calculate the frequency of COVID-related tweets
+        freq = len(covid) / len(tweets)
+        frequency.append(UserFloat(u, freq))
+
+        # To prevent divide by zero, ignore everyone who didn't post about covid
         if len(covid) == 0 or len(tweets) == 0:
             continue
         # Get the average popularity for COVID-related tweets
         covid_avg = statistics.mean(t.popularity for t in covid)
         global_avg = statistics.mean(t.popularity for t in tweets)
         # Get the relative popularity
-        user_popularity.append((u, covid_avg / global_avg))
+        popularity.append(UserFloat(u, covid_avg / global_avg))
 
-    # Sort by relative popularity
-    user_popularity.sort(key=lambda x: x[1], reverse=True)
-    return user_popularity
+    # Sort by relative popularity or frequency
+    popularity.sort(key=lambda x: x[1], reverse=True)
+    frequency.sort(key=lambda x: x[1], reverse=True)
+
+    # Sort by date, latest first
+    all_tweets.sort(key=lambda x: x.date, reverse=True)
+
+    # Ignore tweets that are earlier than the start of COVID
+    all_tweets = [t for t in all_tweets if t.date > '2020-01-01T01:01:01']
+
+    return frequency, popularity, all_tweets
 
 
 def view_covid_tweets_date(tweets: list[Posting]):
-
     # Graph histogram
     plt.title(f'COVID posting dates')
     plt.xticks(rotation=45)
@@ -195,8 +225,17 @@ def view_covid_tweets_date(tweets: list[Posting]):
 
 
 if __name__ == '__main__':
-    sample = load_user_sample()
-    view_covid_tweets_freq([u.username for u in sample.most_popular], '500-pop')
+    # Load sample, convert format
+    samples = load_user_sample()
+    samples = [UserSample('500-pop', [u.username for u in samples.most_popular]),
+               UserSample('500-rand', [u.username for u in samples.random]),
+               UserSample('eng-news', list(samples.english_news))]
+
+    # Calculate frequencies and popularity ratios
+    for s in samples:
+        s.frequencies, s.popularity_ratios, s.tweets = calculate_sample_data(s.users)
+
+    view_covid_tweets_freq([u.username for u in samples.most_popular], '500-pop')
     # view_covid_tweets_freq(sample.random, '500-rand')
     # view_covid_tweets_pop(sample.most_popular, '500-pop')
     # view_covid_tweets_pop(sample.random, '500-rand')