[U] Update formula

2021-11-26 16:20:57 -05:00
parent 54fb07fb6b
commit 7a5fb3b71e
3 changed files with 119 additions and 65 deletions
@@ -32,11 +32,16 @@ class Sample:
    user_freqs: list[UserFloat]
    # Total popularity ratios of all posts for each user across all dates (sorted)
    user_pops: list[UserFloat]
-    # Tweets by all users in a sample (always sorted by date)
-    tweets: list[Posting]
+    # user_all_pop_avg[user] = Average popularity of all posts by {user}
+    user_all_pop_avg: dict[str, float]
+    # Average popularity of COVID tweets by a specific user on a specific date
+    # user_date_covid_pop_avg[user][date] = Average popularity of COVID-posts by {user} on {date}
+    user_date_covid_pop_avg: dict[str, dict[str, float]]
+    # Total COVID-tweets frequency on a specific date for all users.
+    date_covid_freq: dict[str, float]
    # dates[i] = The i-th day since the first tweet
    dates: list[datetime]
-    # date_freqs[i] = Total frequency of all posts from all users in this sample on date[i]
+    # date_freqs[i] = Fraction of COVID posts among all posts from all users in this sample on dates[i]
    date_freqs: list[float]
    # date_pops[i] = Average popularity ratio of all posts from all users in this sample on dates[i]
    date_pops: list[float]
@@ -67,11 +72,17 @@ class Sample:

        To prevent divide-by-zero, we ignored everyone who didn't post about covid and who didn't
        post at all.
+
+        Precondition:
+          - Downloaded tweets data are sorted by date
        """
        debug(f'Calculating sample tweets data for {self.name}...')
        popularity = []
        frequency = []
-        all_tweets: list[Posting] = []
+        date_covid_count = dict()
+        date_all_count = dict()
+        self.user_all_pop_avg = dict()
+        self.user_date_covid_pop_avg = dict()
        for i in range(len(self.users)):
            u = self.users[i]

@@ -81,45 +92,72 @@ class Sample:

            # Load processed tweet
            tweets = load_tweets(u)
-            # Ignore retweets
-            tweets = [t for t in tweets if not t.repost]
-            all_tweets += tweets
+            # Ignore retweets, and ignore tweets that are earlier than the start of COVID
+            tweets = [t for t in tweets if not t.repost and t.date > '2020-01-01T01:01:01']
            # Filter covid tweets
            covid = [t for t in tweets if t.covid_related]

            # To prevent divide by zero, ignore people who didn't post at all
            if len(tweets) == 0:
+                frequency.append(UserFloat(u, 0))
                continue
            # Calculate the frequency of COVID-related tweets
            freq = len(covid) / len(tweets)
            frequency.append(UserFloat(u, freq))

+            # Calculate date fields
+            # Assume tweets are sorted
+            # tweets.sort(key=lambda x: x.date)
+            # Calculate popularity by date
+            date_cp_sum = dict()
+            date_cp_count = dict()
+            for t in tweets:
+                d = t.date[:10]
+
+                # For covid popularity on date
+                if t.covid_related:
+                    if d not in date_cp_sum:
+                        date_cp_sum[d] = 0
+                        date_cp_count[d] = 0
+                    date_cp_sum[d] += t.popularity
+                    date_cp_count[d] += 1
+
+                # For frequency on date
+                if d not in date_covid_count:
+                    date_covid_count[d] = 0
+                    date_all_count[d] = 0
+                if t.covid_related:
+                    date_covid_count[d] += 1
+                date_all_count[d] += 1
+
+            self.user_date_covid_pop_avg[u] = \
+                {d: date_cp_sum[d] / date_cp_count[d] for d in date_cp_sum}
+
+            # Calculate total popularity ratio for a user
            # To prevent divide by zero, ignore everyone who didn't post about covid
            if len(covid) == 0:
                continue
            # Get the average popularity for COVID-related tweets
-            covid_avg = sum(t.popularity for t in covid) / len(covid)
-            global_avg = sum(t.popularity for t in tweets) / len(tweets)
+            covid_pop_avg = sum(t.popularity for t in covid) / len(covid)
+            all_pop_avg = sum(t.popularity for t in tweets) / len(tweets)
+            # Save the user's average popularity across all of their posts
+            self.user_all_pop_avg[u] = all_pop_avg
            # To prevent divide by zero, ignore everyone who literally has no likes on any post
-            if global_avg == 0:
+            if all_pop_avg == 0:
                continue
            # Get the relative popularity
-            popularity.append(UserFloat(u, covid_avg / global_avg))
+            popularity.append(UserFloat(u, covid_pop_avg / all_pop_avg))
+
+        # Calculate frequency on date
+        self.date_covid_freq = {d: date_covid_count[d] / date_all_count[d] for d in date_covid_count}

        # Sort by relative popularity or frequency
        popularity.sort(key=lambda x: x.data, reverse=True)
        frequency.sort(key=lambda x: x.data, reverse=True)

-        # Sort by date, latest first
-        all_tweets.sort(key=lambda x: x.date)
-
-        # Ignore tweets that are earlier than the start of COVID
-        all_tweets = [t for t in all_tweets if t.date > '2020-01-01T01:01:01']
-
        # Assign to sample
        self.user_freqs = frequency
        self.user_pops = popularity
-        self.tweets = all_tweets
        debug('- Done.')

    def calculate_change_data(self) -> None:
@@ -136,41 +174,38 @@ class Sample:

        :return: None
        """
-        # List indicies are days since the first tweet
-        covid_count = [0]
-        covid_popularity = [0]
-        all_count = [0]
-        all_popularity = [0]
-        current_date = self.tweets[0][:10]
-        i = 0
+        self.dates = []
+        self.date_freqs = []
+        self.date_pops = []

-        # Loop through all tweets
-        for tweet in self.tweets:
-            # Move on to the next date
-            tweet_date = tweet.date[:10]
-            if tweet_date != current_date:
-                current_date = tweet_date
-                covid_count.append(0)
-                covid_popularity.append(0)
-                all_count.append(0)
-                all_popularity.append(0)
-                i += 1
+        # Loop through all dates from the start of COVID to when the data is obtained
+        for (ds, dt) in daterange('2020-01-01', '2021-11-25'):
+            self.dates.append(dt)

-            # Add current tweet data
-            all_count[i] += 1
-            all_popularity[i] += tweet.popularity
-            if tweet.covid_related:
-                covid_count[i] += 1
-                covid_popularity[i] += tweet.popularity
+            # Convert date covid freq format
+            if ds in self.date_covid_freq:
+                self.date_freqs.append(self.date_covid_freq[ds])
+            else:
+                self.date_freqs.append(0)

-        # Calculate frequency and popularity ratio for each date, which will be our y-axis
-        self.date_freqs = divide_zeros(covid_count, all_count)
-        self.date_pops = divide_zeros(divide_zeros(covid_popularity, covid_count),
-                                      divide_zeros(all_popularity, all_count))
+            # Calculate date covid popularity ratio
+            users_posted_today = [u for u in self.users if u in self.user_date_covid_pop_avg and
+                                  ds in self.user_date_covid_pop_avg[u]]
+            if len(users_posted_today) != 0:
+                user_pop_ratio_sum = sum(self.user_date_covid_pop_avg[u][ds] /
+                                         self.user_all_pop_avg[u] for u in users_posted_today
+                                         if self.user_all_pop_avg[u] != 0)
+                pops_i = user_pop_ratio_sum / len(users_posted_today)

-        # Convert indicies to dates, which will be our x-axis
-        first_date = parse_date(self.tweets[0].date).replace(hour=0, minute=0, second=0)
-        self.dates = [first_date + timedelta(days=j) for j in range(len(all_count))]
+                if pops_i > 20:
+                    print('Date: ', ds)
+                    for u in users_posted_today:
+                        if self.user_all_pop_avg[u] != 0:
+                            print('-', u, self.user_date_covid_pop_avg[u][ds] /
+                                  self.user_all_pop_avg[u])
+            else:
+                pops_i = 1
+            self.date_pops.append(pops_i)


 def load_samples() -> list[Sample]:
@@ -215,7 +250,7 @@ def report_ignored(samples: list[Sample]) -> None:
    :return: None
    """
    # For frequencies, report who didn't post
-    table = [["Total users"] + [str(len(s.user_freqs)) for s in samples],
+    table = [["Total users"] + [str(len(s.users)) for s in samples],
             ["Users who didn't post at all"] +
             [str(len([1 for a in s.user_freqs if a.data == 0])) for s in samples],
             ["Users who posted less than 1%"] +
@@ -390,16 +425,6 @@ def report_stats(samples: list[Sample]) -> None:
    Reporter('freq/stats.md').table(table, [s.name for s in samples], True)


-def view_covid_tweets_date(tweets: list[Posting]):
-    # Graph histogram
-    plt.title(f'COVID posting dates')
-    plt.xticks(rotation=45)
-    plt.yticks(rotation=45)
-    plt.tight_layout()
-    plt.hist([parse_date(t.date) for t in tweets if t.covid_related], bins=40, color='#ffcccc')
-    plt.show()
-
-
 def report_change_different_n(sample: Sample) -> None:
    """
    Experiment with different n values for IIR filter
@@ -420,6 +445,7 @@ def report_change_graphs(sample: Sample) -> None:
    graph_line_plot(sample.dates, sample.date_freqs, f'change/freq/{sample.name}.png',
                    f'COVID-posting frequency over time for {sample.name} IIR(10)',
                    True, 10)
+    print(sum(sample.date_pops) / len(sample.dates))


 def report_all() -> None:
@@ -12,7 +12,7 @@ Our data come from three samples:

 ## COVID-19 Posting Frequency

-First, we analyzed how frequently the users in these three datasets are posing about COVID-19. Initially, we were expecting that most people will post about COVID-19 because this pandemic is very relevant to every one of us. However, we found that there are many people in our samples didn't post about COVID-19 at all. The following table shows how many people in each sample didn't post or posted less than 1% about COVID-19:
+First, we analyzed how frequently the users in these three datasets are posting about COVID-19 (ignoring retweets). Initially, we expected that most people would post about COVID-19 because this pandemic is very relevant to every one of us. However, we found that many people in our samples didn't post about COVID-19 at all. The following table shows how many people in each sample didn't post or posted less than 1% about COVID-19:

@include `/freq/didnt-post.md`

@@ -80,7 +80,7 @@ $$ \text{freqs}_i = \frac{|\text{COVID-posts on date}_{i}|}{|\text{All posts on
 </blockquote>

 <blockquote>
-$$ \text{pops}_i = \left(\frac{\sum\text{Popularity of COVID-posts on date}_i}{|\text{COVID-posts on date}_i|}\right) / \left(\frac{\sum \text{Popularity of all posts on date}_i}{|\text{All posts on date}_i|}\right) $$
+$$ \text{pops}_i = \frac{ \sum_{u \in \text{Users}} \left(\frac{\sum\text{Popularity of u's COVID-posts on date}_i}{(\text{Average popularity of all u's posts}) \cdot |\text{u's COVID-posts on date}_i|}\right)}{(\text{Number of users posted on date}_i)} $$
 </blockquote>

 After calculation, we decided to plot line charts of `freqs` and `pops` against `dates`. When we plotted the graph without a filter, we found that the graph was actually very noisy, as shown in the first graph below. So, we experimented with different filters from the `scipy` library and different parameter values, and chose to use an IIR filter with `n = 10`.
@@ -4,9 +4,9 @@ import json
 import os
 import statistics
 from dataclasses import dataclass
-from datetime import datetime, date
+from datetime import datetime, date, timedelta
 from pathlib import Path
-from typing import Union, NamedTuple, Any
+from typing import Union, NamedTuple, Any, Generator

 import json5
 import numpy as np
@@ -230,7 +230,7 @@ def tabulate_stats(stats: list[Stats], percent: bool = False) -> list[list[str]]
            ]


-def parse_date(iso: str) -> datetime:
+def parse_date_time(iso: str) -> datetime:
    """
    Parse date faster. Running 1,000,000 trials, this function is 4.03 times faster than the
    dateutil.parser.isoparse() function (from the third-party python-dateutil package).
@@ -246,6 +246,34 @@ def parse_date(iso: str) -> datetime:
                    int(iso[11:13]), int(iso[14:16]), int(iso[17:19]))


+def parse_date_only(iso: str) -> datetime:
+    """
+    Parse a "YYYY-MM-DD" date string into a datetime (time set to midnight) without validation.
+
+    Preconditions:
+      - iso is in the format of "YYYY-MM-DD" (e.g. "2021-10-20")
+      - iso is a valid date (this function does not check for the validity of the input)
+
+    :param iso: Input date
+    :return: Datetime object
+    """
+    return datetime(int(iso[:4]), int(iso[5:7]), int(iso[8:10]))
+
+
+def daterange(start_date: str, end_date: str) -> Generator[tuple[str, datetime], None, None]:
+    """
+    Date range for looping
+
+    :param start_date: Start date in "YYYY-MM-DD" format
+    :param end_date: End date in "YYYY-MM-DD" format (exclusive: this date itself is not yielded)
+    :return: Generator yielding (date string in "YYYY-MM-DD" format, datetime) one day at a time.
+    """
+    start = parse_date_only(start_date)
+    for n in range(int((parse_date_only(end_date) - start).days)):
+        dt = start + timedelta(n)
+        yield dt.strftime('%Y-%m-%d'), dt
+
+
 def divide_zeros(numerator: list[float], denominator: list[float]) -> list[float]:
    """
    Divide two lists of floats, ignoring zeros (anything dividing by zero will produce zero)