From ea27d0fec2a0d656ac72fae638e3c23a091b3cac Mon Sep 17 00:00:00 2001
From: Hykilpikonna <me@hydev.org>
Date: Wed, 24 Nov 2021 15:44:08 -0500
Subject: [PATCH] [+] Visualize covid tweets popularity ratio

---
 src/process/twitter_visualization.py | 64 +++++++++++++++++++++++++++-
 1 file changed, 62 insertions(+), 2 deletions(-)

diff --git a/src/process/twitter_visualization.py b/src/process/twitter_visualization.py
index 37f233c..6cc5581 100644
--- a/src/process/twitter_visualization.py
+++ b/src/process/twitter_visualization.py
@@ -51,7 +51,67 @@ def view_covid_tweets_freq(users: list[ProcessedUser],
     plt.show()
 
 
+def view_covid_tweets_pop(users: list[ProcessedUser],
+                          sample_name: str) -> None:
+    """
+    Visualize the relative popularity of the sampled users' posts about COVID. For example, if one
+    person posted a COVID post and got 1000 likes, while their other posts (including this one) got
+    an average of 1 like, they will have a relative popularity of 1000. If, on the other hand, one
+    person posted a COVID post and got 1 like, while their other posts (including this one) got an
+    average of 1000 likes, they will have a relative popularity of 1/1000.
+
+    To prevent divide-by-zero, users with no original (non-retweet) COVID posts, and users whose
+    original posts have an average popularity of zero, are ignored.
+
+    :param users: Sample users
+    :param sample_name: Name of the sample
+    :return: None
+    """
+    # Load tweets, and get the relative popularity of covid tweets for each user
+    user_popularity = []
+    for u in users:
+        # Load processed tweets, ignoring retweets
+        tweets = [t for t in load_tweets(u.username) if not t.repost]
+        # Filter covid tweets
+        covid = [t for t in tweets if t.covid_related]
+        # covid is a subset of tweets, so an empty check on covid also covers "no posts at all"
+        if not covid:
+            continue
+        # Get the average popularity for COVID-related tweets and for all tweets
+        covid_avg = sum(t.popularity for t in covid) / len(covid)
+        global_avg = sum(t.popularity for t in tweets) / len(tweets)
+        # The ratio is undefined when the user's posts have zero average popularity
+        if global_avg == 0:
+            continue
+        user_popularity.append((u.username, covid_avg / global_avg))
+
+    # Sort by relative popularity
+    user_popularity.sort(key=lambda x: x[1], reverse=True)
+
+    # How many people are ignored
+    print(f"In {sample_name} -")
+    print("To prevent division by zero, we ignored people who didn't post about COVID, didn't "
+          "post at all, or whose posts have zero average popularity. "
+          f"We ignored {len(users) - len(user_popularity)} people in this list.")
+    print()
+
+    # Top 20
+    print("20 Users of whose COVID-related posts are the most popular:")
+    print(tabulate([[u[0], f'{u[1]:.2f}'] for u in user_popularity[:20]],
+                   ['Username', 'Popularity Ratio']))
+
+    # Graph histogram, with a reference line at ratio 1 (COVID posts as popular as the average)
+    plt.title(f'COVID-related popularity ratios for {sample_name}')
+    plt.xticks(rotation=90)
+    plt.hist([f[1] for f in user_popularity], bins=100, color='#ffcccc')
+    plt.axvline(x=1, color='lightgray')
+    plt.tight_layout()
+    plt.show()
+
+
 if __name__ == '__main__':
     sample = load_user_sample()
-    view_covid_tweets_freq(sample.most_popular, '500 most popular Twitter users')
-    view_covid_tweets_freq(sample.random, '500 random Twitter users')
+    # NOTE(review): the view_covid_tweets_freq calls for these same two samples are disabled
+    # here, presumably so that only the popularity plots run; restore them to get both views.
+    view_covid_tweets_pop(sample.most_popular, '500 most popular Twitter users')
+    view_covid_tweets_pop(sample.random, '500 random Twitter users')