1 Commit

Author SHA1 Message Date
Hykilpikonna 39313d373e [+] Test py 2021-12-13 00:02:17 -05:00
53 changed files with 317 additions and 607 deletions
-2
@@ -127,5 +127,3 @@ dmypy.json
config.json5
data/
/report/
/src/report
.DS_Store
+10
@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="RunConfigurationProducerService">
<option name="ignoredProducers">
<set>
<option value="com.android.tools.idea.compose.preview.runconfiguration.ComposePreviewRunConfigurationProducer" />
</set>
</option>
</component>
</project>
-4
@@ -1,4 +0,0 @@
# CSC110-Project
Our final project for CSC110! This is a study on the frequency and popularity of COVID-19-related discussions on Twitter.
Full paper hosted on https://csc110.hydev.org.
Executable → Regular
+1 -3
@@ -3,10 +3,8 @@
# abort on errors
set -e
cp ../data/packed/processed.7z ./src/dist/processed-data.7z
# navigate into the build output directory
cd src/dist
cd dist
# if you are deploying to a custom domain
echo 'csc110.hydev.org' > CNAME
+8 -8
@@ -4,31 +4,31 @@
# Json5 is a human-readable json format that allows for things such as unquoted keys or comments.
json5~=0.9.6
# Tweepy is a python SDK for twitter
tweepy~=4.4.0
tweepy==4.4.0
# requests is for getting html from a website URL
requests~=2.26.0
requests==2.26.0
# beautifulsoup is used to extract data from html
beautifulsoup4~=4.10.0
beautifulsoup4==4.10.0
#####################
# Data Visualization
# Print table data
tabulate~=0.8.9
tabulate==0.8.9
# Draw local graphs
matplotlib~=3.5.1
matplotlib==3.5.0
# Calculate data statistics
numpy~=1.21.4
numpy==1.21.4
# Date utility for manipulating dates
python-dateutil~=2.8.2
# Scipy for transforming data. We used it for IIR filtering.
scipy~=1.7.3
# For serving the report website
flask~=2.0.2
flask==2.0.2
####################
# Data Packing
# 7zip packing utility for packing our processed data
py7zr~=0.17.0
py7zr==0.16.3
#####################
# Testing and code checking
-56
@@ -1,56 +0,0 @@
"""CSC110 Fall 2021 Project
This module uses web requests to collect and process other data we are using in our analysis.
"""
from dataclasses import dataclass
import requests
import python_ta
import python_ta.contracts
@dataclass
class CasesData:
"""
A dataclass that stores a mapping of date to cases on that day and a mapping of date to deaths
on that day.
Attributes:
- cases: cases[date in "YYYY-MM-DD"] = 7-day average of cases around that date
- deaths: deaths[date in "YYYY-MM-DD"] = 7-day average of deaths around that date
Representation Invariants:
- all(x >= 0 for x in self.cases.values())
- all(x >= 0 for x in self.deaths.values())
"""
cases: dict[str, float]
deaths: dict[str, float]
def get_covid_cases_us() -> CasesData:
"""
Get the US COVID-19 cases data from https://github.com/nytimes/covid-19-data by New York Times
:return: Cases data
"""
url = 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/rolling-averages/us.csv'
csv = requests.get(url).text.replace('\r\n', '\n').split('\n')[1:]
data = CasesData({}, {})
# Parse CSV
for line in csv:
split = line.split(',')
day, cases, deaths = split[0], split[2], split[6]
data.cases[day] = float(cases)
data.deaths[day] = float(deaths)
return data
if __name__ == '__main__':
python_ta.contracts.check_all_contracts()
python_ta.check_all(config={
'extra-imports': ['requests', 'dataclasses'], # the names (strs) of imported modules
'allowed-io': [], # the names (strs) of functions that call print/open/input
'max-line-length': 100,
'disable': ['R1705', 'C0200']
})
-8
@@ -1,8 +0,0 @@
{
// Twitter official V2 API keys
// Register for them on the Twitter api portal
consumer_key: 'Your_consumer_key',
consumer_secret: 'Your_consumer_secret',
access_token: 'Your_access_token',
access_secret: 'Your_access_secret',
}
+3 -28
@@ -1,35 +1,10 @@
"""CSC110 Fall 2021 Project
This module stores constant variables in our projects.
Instructors said that we can use global constants: https://piazza.com/class/ksovzjrlsye72f?cid=1664
Note: Paths should not end with "/"
"""
"""
File structure:
data - Processed and raw data
├── packed - Packed data
└── twitter - Data obtained from Twitter
├── user - Twitter user info data
│ ├── meta - Meta-data about the follows-chain downloading progress
│ ├── processed - Processed (filtered) user data.
│ └── users - Raw user info, each json contains info of one user.
└── user-tweets - Tweets data
├── processed - Processed tweets.
└── user - Raw tweets, each json contains all tweets from a user.
src - Source codes.
├── report - Report content generated by report_all() @ visualization.py
├── dist - Static website root generated by write_html() @ report.py
└── resources - HTML static resources, some are hand-written and some are imported libraries.
"""
# Constants (The instructors said that we can use global constants here:
# https://piazza.com/class/ksovzjrlsye72f?cid=1664
# They should not end with "/"
DATA_DIR = '../data'
TWEETS_DIR = f'{DATA_DIR}/twitter/user-tweets'
USER_DIR = f'{DATA_DIR}/twitter/user'
REPORT_DIR = './report'
RES_DIR = './resources'
# Debug mode, or developer mode. This affects two things:
# 1. Whether debug messages are outputted
+17 -21
@@ -1,22 +1,16 @@
"""CSC110 Fall 2021 Project
This module is the main module of our program which runs different functions in different modules
by steps.
"""
from tabulate import tabulate
from collect_twitter import *
from processing import *
from report import *
from process.twitter_process import *
from process.twitter_visualization import *
from raw_collect.twitter import *
from report.report import serve_report
from utils import *
from visualization import *
if __name__ == '__main__':
#####################
# Data collection - Step C1.0
# Load config and create API
# This is required if you would like to collect the data yourself.
# In that case, you also need to register for Twitter API keys and add them to config.json5
# conf = load_config('config.json5')
# api = tweepy_login(conf)
conf = load_config('config.json5')
api = tweepy_login(conf)
#####################
# Data collection - Step C1.1
@@ -25,7 +19,7 @@ if __name__ == '__main__':
# manually stop it when there are enough users)
# download_users_start(api, 'voxdotcom')
# This task will run for a very, very long time to obtain a large dataset of Twitter users. If
# This task will run for a very very long time to obtain a large dataset of twitter users. If
# you want to stop the process, you can resume it later using the following line:
# download_users_resume_progress(api)
@@ -36,7 +30,7 @@ if __name__ == '__main__':
#####################
# Data processing - Step P1
# (After step C1) Process the downloaded Twitter users, extract screen name, popularity, and
# (After step C1) Process the downloaded twitter users, extract screen name, popularity, and
# number of tweets data.
# process_users()
@@ -46,9 +40,13 @@ if __name__ == '__main__':
# criteria as our sample, also find news channels
# select_user_sample()
# Just curious, who are the 20 most popular individuals on twitter?
# print(tabulate(((u.username, u.popularity) for u in load_user_sample().most_popular[:20]),
# headers=['Name', 'Followers']))
#####################
# Data collection - Step C2.1
# (After step P2) Load the downloaded Twitter users by popularity, and start downloading all
# (After step P2) Load the downloaded twitter users by popularity, and start downloading all
# tweets from 500 of the most popular users. Takes around 2 hours.
# for u in load_user_sample().most_popular:
# download_all_tweets(api, u.username)
@@ -64,7 +62,7 @@ if __name__ == '__main__':
# (After step P2) Download all tweets from the news channels we selected.
# for u in load_user_sample().english_news:
# download_all_tweets(api, u)
# Filter out news channels that have been blocked by twitter or don't exist
# Filter out news channels that have been blocked by twitter or don't exist anymore
# filter_news_channels()
#####################
@@ -77,9 +75,7 @@ if __name__ == '__main__':
# Generate all visualization reports and graphs
report_all()
# Write HTML for deploying to GitHub Pages
write_html()
####################
# Serve webpage
serve_report()
@@ -1,49 +1,37 @@
"""CSC110 Fall 2021 Project
"""
Processes data downloaded from the Twitter API. Processing consists of calculating popularity of
users, creating samples of users, filtering news channels, and processing tweets for file storage.
"""
import json
import os
import random
import sys
import zipfile
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import NamedTuple
from dataclasses import dataclass
import dateutil.parser
import requests
from bs4 import BeautifulSoup
from py7zr import SevenZipFile
import python_ta
from constants import DATA_DIR, TWEETS_DIR, USER_DIR, RES_DIR
from utils import read, debug, write, json_stringify
from constants import DATA_DIR, TWEETS_DIR, USER_DIR
from utils import *
class ProcessedUser(NamedTuple):
"""
User and popularity.
We use NamedTuple instead of dataclass because named tuples are easier to serialize in JSON, and
We use NamedTuple instead of dataclass because named tuples are easier to serialize in JSON and
they require much less space in the stored json format because no key info is stored. For
example, using dataclass, the json for one UserPopularity object will be:
{"username": "a", "popularity": 1, "num_postings": 1}, while using NamedTuple, the json will be:
["a", 1, 1], which saves an entire 42 bytes for each user.
Attributes:
- username: The Twitter user's screen name
- popularity: A measurement of a user's popularity, such as followers count
- num_postings: Number of tweets
- language: Language code in Twitter's language code format
Representation Invariants:
- self.username != ''
"""
# Username
username: str
# A measurement of a user's popularity, such as followers count
popularity: int
# Number of tweets
num_postings: int
# Language
lang: str
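As a standalone illustration of the space argument above (this snippet is not part of the project code), the same fields serialize very differently depending on the container type:

import json
from dataclasses import dataclass, asdict
from typing import NamedTuple

@dataclass
class UserDC:
    username: str
    popularity: int
    num_postings: int

class UserNT(NamedTuple):
    username: str
    popularity: int
    num_postings: int

# A dataclass serializes as an object that repeats every key name...
print(json.dumps(asdict(UserDC('a', 1, 1))))  # {"username": "a", "popularity": 1, "num_postings": 1}
# ...while a NamedTuple is a tuple, so it serializes as a compact list.
print(json.dumps(UserNT('a', 1, 1)))          # ["a", 1, 1]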
@@ -61,7 +49,7 @@ def process_users() -> None:
# Loop through all the files
for filename in os.listdir(f'{USER_DIR}/users'):
# Only check json files and ignore macOS dot files
# Only check json files and ignore macos dot files
if filename.endswith('.json') and not filename.startswith('.'):
# Read
user = json.loads(read(f'{USER_DIR}/users/{filename}'))
@@ -115,14 +103,6 @@ def get_user_popularity_ranking(user: str) -> int:
class UserSample:
"""
This is a data class storing our different samples.
Attributes:
- most_popular: Our sample of the most popular users on Twitter
- random: Our sample of random users on Twitter
- english_news: Our sample of news media accounts on Twitter
Representation Invariants:
- all(news != '' for news in self.english_news)
"""
most_popular: list[ProcessedUser]
random: list[ProcessedUser]
@@ -153,8 +133,8 @@ def select_user_sample() -> None:
users = load_users()
# Filter by language first
users = [u for u in users if u.lang is not None
and any(lang in u.lang for lang in {'en', 'zh', 'ja'})]
users = [u for u in users if u.lang is not None and
any(lang in u.lang for lang in {'en', 'zh', 'ja'})]
# Find most popular, and exclude them from the random sample
most_popular = users[:500]
@@ -178,8 +158,8 @@ def get_english_news_channels() -> list[str]:
Run this after download_all_tweets(api, 'TwitterNews')
Preconditions:
- <tweets_dir>/user/TwitterNews.json exists.
Precondition:
- <tweets_dir>/user/TwitterNews.json exists.
:return: A list of news channel screen names
"""
@@ -196,7 +176,7 @@ def get_english_news_channels() -> list[str]:
soup = BeautifulSoup(requests.get(url).text, 'html.parser')
users = {h.text[1:] for h in soup.select('table tr td:nth-child(2) > a')}
# Combine two sets, ignoring case (since the ids in the 100 list are all lowercase)
# Combine two sets, ignoring case (since the ids in the 100 list are all lowercased)
news_channels_lower = {n.lower() for n in news_channels}
for u in users:
if u not in news_channels_lower:
@@ -209,8 +189,8 @@ def filter_news_channels() -> None:
"""
Filter out news channels that don't exist anymore or have been banned by Twitter.
Preconditions:
- Run this after downloading all tweets from the news channels in Step 2.3 in main.
Precondition:
- Run this after downloading all tweets from the news channels in Step 2.3 in main.
:return: None
"""
@@ -237,39 +217,34 @@ def load_user_sample() -> UserSample:
class Posting(NamedTuple):
"""
Posting data stores the processed tweets' data, and it contains info such as whether a tweet is
covid-related
Attributes:
- covid_related: True if the post is determined to be covid-related
- popularity: A measure of tweet popularity measured by comments + likes
- repost: Whether the post is a repost
- date: Posting date and time in ISO format ("YYYY-MM-DDThh-mm-ss")
Representation Invariants:
- self.popularity >= 0
Posting data stores the processed tweets data, and it contains info such as whether or not a
tweet is covid-related
"""
# Full text of the post's content
covid_related: bool
# Popularity of the post
popularity: int
# Is it a repost
repost: bool
# Date in ISO format
date: str
def process_tweets() -> None:
"""
Process tweets, reduce the tweets' data to only a few fields defined in the Posting class. These
include whether the tweet is covid-related, how popular is the tweet, if it is a repost, and its
date. The processed tweet does not contain its content.
Process tweets, reduce the tweets data to only a few fields defined in the Posting class. These
include whether or not the tweet is covid-related, how popular is the tweet, if it is a repost,
and its date. The processed tweet does not contain its content.
If a user's tweets is already processed, this function will skip over that user's data.
This function will save the processed tweets' data to <tweets_dir>/processed/<username>.json
This function will save the processed tweets data to <tweets_dir>/processed/<username>.json
:return: None
"""
# Loop through all the files
for filename in os.listdir(f'{TWEETS_DIR}/user'):
# Only check json files and ignore macOS dot files
# Only check json files and ignore macos dot files
if filename.endswith('.json') and not filename.startswith('.'):
# Check if already processed
if os.path.isfile(f'{TWEETS_DIR}/processed/{filename}'):
@@ -303,8 +278,8 @@ def load_tweets(username: str) -> list[Posting]:
def is_covid_related(text: str) -> bool:
"""
Is a tweet / article covid-related. Currently, this is done through keyword matching. Even
though we know that not all posts with covid-related words are covid-related posts, this is
currently our best method of classification.
though we know that not all posts with covid-related words are covid-related posts, this is our
current best method of classification.
:param text: Text content
:return: Whether the text is covid related
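The keyword list itself is outside this hunk; a minimal sketch of the matching approach, using a purely hypothetical keyword set, would be:

# Hypothetical keywords for illustration only; the project's real list is in the omitted body.
COVID_KEYWORDS = ['covid', 'coronavirus', 'pandemic', 'quarantine', 'vaccine']

def is_covid_related_sketch(text: str) -> bool:
    lower = text.lower()
    return any(keyword in lower for keyword in COVID_KEYWORDS)

print(is_covid_related_sketch('Just got my booster vaccine today!'))  # True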
@@ -328,68 +303,19 @@ def is_covid_related(text: str) -> bool:
def pack_data() -> None:
"""
This function packs processed data and raw data separately, and it also packs the data ready for
submission on MarkUs
This function packs processed data and raw data separately.
:return: None
"""
packed_dir = f'{DATA_DIR}/packed'
Path(packed_dir).mkdir(parents=True, exist_ok=True)
packed_data = f'{packed_dir}/processed.7z'
packed_res = f'{packed_dir}/resources.7z'
# Pack processed data (Since packing this takes a long time, we decided to not overwrite it for
# every run. This is also because the processed data hasn't changed since Nov 28 when the
# project is mostly finished, and there is no need to re-pack data every time, whereas the
# resources and sources might change after every update) So, delete the packed 7z file if you
# want to repack.
if not os.path.isfile(packed_data):
debug('Packing data...')
processed_dirs = ['/twitter/user/meta', '/twitter/user/processed',
'/twitter/user-tweets/processed']
with SevenZipFile(packed_data, 'w') as z:
z: SevenZipFile = z
for p in processed_dirs:
debug(f'- Packing {p}')
z.writeall(DATA_DIR + p)
# Pack resources
debug('Packing resources...')
with SevenZipFile(packed_res, 'w') as z:
# Pack data for processed.
debug('Packing data...')
processed_dirs = ['/twitter/user/meta', '/twitter/user/processed',
'/twitter/user-tweets/processed']
with SevenZipFile(f'{packed_dir}/processed.7z', 'w') as z:
z: SevenZipFile = z
z.writeall(RES_DIR)
# Pack MarkUs submission
# Even though 7zip has much better compression rate than zip, MarkUs only supports zip.
debug('Packing source code...')
with zipfile.ZipFile(f'{packed_dir}/markus.zip', 'w') as zf:
z: zipfile.ZipFile = zf
# Add sources
src_path = Path(os.path.realpath(__file__)).parent
for f in os.listdir(src_path):
if not os.path.isdir(f) and f != '.DS_Store' and not f.startswith('._'):
z.write(f)
# Add packed resource
z.write(packed_res, 'resources.7z')
# Add report tex
for file in ['project_report.tex', 'project_report.pdf']:
z.write(os.path.join(src_path, f'../writing/report/{file}'), file)
# Open packed location (Since there isn't a platform-independent way of doing this, we currently
# only support macOS)
if sys.platform == 'darwin':
os.system(f'open {Path(packed_dir).absolute()}')
if __name__ == '__main__':
python_ta.check_all(config={
'extra-imports': ['json', 'os', 'random', 'sys', 'zipfile', 'dataclasses', 'datetime',
'pathlib', 'typing', 'requests', 'bs4', 'py7zr', 'constants', 'utils'
], # the names (strs) of imported modules
'allowed-io': [], # the names (strs) of functions that call print/open/input
'max-line-length': 100,
'disable': ['R1705', 'C0200']
}, output='pyta_report.html')
for p in processed_dirs:
debug(f'- Packing {p}')
z.writeall(DATA_DIR + p)
@@ -1,28 +1,21 @@
"""CSC110 Fall 2021 Project
This module uses matplotlib to visualize processed data as graphs. The results are stored in
report directory.
"""
This module uses matplotlib to visualize processed data as graphs. The results are stored in report directory.
The graphs are created after processing the data, for example with filtering and removing outliers.
"""
from datetime import timedelta
from dataclasses import dataclass, field
from typing import Optional
import os.path
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Optional, Union
import matplotlib.dates as mdates
import matplotlib.ticker
import numpy as np
import requests
import scipy.signal
from matplotlib import pyplot as plt, font_manager
import matplotlib.dates as mdates
from matplotlib import cm
import python_ta
import python_ta.contracts
from collect_others import get_covid_cases_us
from constants import RES_DIR, REPORT_DIR
from processing import load_tweets, load_user_sample
from utils import debug, daterange, map_to_dates, filter_days_avg, Reporter, remove_outliers, \
tabulate_stats, get_statistics
from process.twitter_process import *
from raw_collect.others import get_covid_cases_us
@dataclass()
@@ -32,49 +25,24 @@ class UserFloat:
This is used for both COVID tweet frequency and popularity ratio data, because both of these
are floating point data.
Attributes:
- name: Twitter user's screen name
- data: The float data that's associated with this user
Representation Invariants:
- self.name != ''
"""
name: str
data: float
class Sample:
"""
A sample of many users, containing statistical data that will be used in graphs.
Attributes:
- name: Sample name
- users: List of user screen names in this sample
- user_freqs: Total frequencies of all posts for each user across all dates (sorted)
- user_pops: Total popularity ratios of all posts for each user across all dates (sorted)
- user_all_pop_avg: Average popularity of all u's posts
- user_date_covid_pop_avg: Average popularity of COVID tweets by a specific user on a date
(user_covid_tweets_pop[user][date] = Average popularity of COVID-posts by {user} on {date})
- date_covid_freq: Total COVID-tweets frequency on a specific date for all users.
- dates: dates[i] = The i-th day since the first tweet
- date_freqs: date_freqs[i] = COVID frequency of all posts from all sampled users on date[i]
- date_pops: date_pops[i] = Average pop-ratio of all posts from all sampled users on date[i]
Representation Invariants:
- self.name != ''
- all(name != '' for name in self.users)
"""
name: str
users: list[str]
# Total frequencies of all posts for each user across all dates (sorted)
user_freqs: list[UserFloat]
# Total popularity ratios of all posts for each user across all dates (sorted)
user_pops: list[UserFloat]
# Average popularity of all u's posts
user_all_pop_avg: dict[str, float]
# Average popularity of COVID tweets by a specific user on a specific date
# user_covid_tweets_pop[user][date] = Average popularity of COVID-posts by {user} on {date}
user_date_covid_pop_avg: dict[str, dict[str, float]]
# Total COVID-tweets frequency on a specific date for all users.
date_covid_freq: dict[str, float]
# dates[i] = The i-th day since the first tweet
dates: list[datetime]
@@ -83,7 +51,7 @@ class Sample:
# date_pops[i] = Average popularity ratio of all posts from all users in this sample on date[i]
date_pops: list[float]
def __init__(self, name: str, users: list[str]) -> None:
def __init__(self, name: str, users: list[str]):
self.name = name
self.users = users
self.calculate_sample_data()
@@ -110,16 +78,16 @@ class Sample:
To prevent divide-by-zero, we ignored everyone who didn't post about covid and who didn't
post at all.
Preconditions:
Precondition:
- Downloaded tweets data are sorted by date
"""
debug(f'Calculating sample tweets data for {self.name}...')
popularity = []
frequency = []
date_covid_count = {}
date_all_count = {}
self.user_all_pop_avg = {}
self.user_date_covid_pop_avg = {}
date_covid_count = dict()
date_all_count = dict()
self.user_all_pop_avg = dict()
self.user_date_covid_pop_avg = dict()
for i in range(len(self.users)):
u = self.users[i]
@@ -139,14 +107,15 @@ class Sample:
frequency.append(UserFloat(u, 0))
continue
# Calculate the frequency of COVID-related tweets
frequency.append(UserFloat(u, len(covid) / len(tweets)))
freq = len(covid) / len(tweets)
frequency.append(UserFloat(u, freq))
# Calculate date fields
# Assume tweets are sorted
# tweets.sort(key=lambda x: x.date)
# Calculate popularity by date
date_cp_sum = {}
date_cp_count = {}
date_cp_sum = dict()
date_cp_count = dict()
for t in tweets:
d = t.date[:10]
@@ -167,15 +136,15 @@ class Sample:
date_all_count[d] += 1
self.user_date_covid_pop_avg[u] = \
{date: date_cp_sum[date] / date_cp_count[date] for date in date_cp_sum}
{d: date_cp_sum[d] / date_cp_count[d] for d in date_cp_sum}
# Calculate total popularity ratio for a user
# To prevent divide by zero, ignore everyone who didn't post about covid
if len(covid) == 0:
continue
# Get the average popularity for COVID-related tweets
covid_pop_avg = sum(tweet.popularity for tweet in covid) / len(covid)
all_pop_avg = sum(tweet.popularity for tweet in tweets) / len(tweets)
covid_pop_avg = sum(t.popularity for t in covid) / len(covid)
all_pop_avg = sum(t.popularity for t in tweets) / len(tweets)
# Save global_avg
self.user_all_pop_avg[u] = all_pop_avg
# To prevent divide by zero, ignore everyone who literally have no likes on any post
@@ -185,8 +154,7 @@ class Sample:
popularity.append(UserFloat(u, covid_pop_avg / all_pop_avg))
# Calculate frequency on date
self.date_covid_freq = {date: date_covid_count[date] / date_all_count[date] for date in
date_covid_count}
self.date_covid_freq = {d: date_covid_count[d] / date_all_count[d] for d in date_covid_count}
# Sort by relative popularity or frequency
popularity.sort(key=lambda x: x.data, reverse=True)
@@ -222,8 +190,8 @@ class Sample:
self.dates.append(dt)
# Calculate date covid popularity ratio
users_posted_today = [u for u in self.users if u in self.user_date_covid_pop_avg
and ds in self.user_date_covid_pop_avg[u]]
users_posted_today = [u for u in self.users if u in self.user_date_covid_pop_avg and
ds in self.user_date_covid_pop_avg[u]]
if len(users_posted_today) == 0:
seven_days_user_prs.append([])
else:
@@ -232,10 +200,6 @@ class Sample:
seven_days_user_prs.append(user_prs)
# Average over seven days
# python_ta thinks user_prs is being shadowed here but it's not because the other
# instance is stuck in the else statement above
# python_ta also thinks that user_prs is possibly not defined here
# but it's in a comprehension so it is
seven_days_count = sum(len(user_prs) for user_prs in seven_days_user_prs)
if seven_days_count == 0:
pops_i = 1
@@ -271,7 +235,7 @@ def load_samples() -> list[Sample]:
keys = ['en', 'zh', 'ja']
pop_lang = [u.lang for u in users.most_popular]
rand_lang = [u.lang for u in users.random]
Reporter('sample-demographics.md') \
Reporter('sample-demographics.md')\
.table([['`500-pop`'] + [str(len(pop_lang))] + [str(pop_lang.count(k)) for k in keys],
['`500-rand`'] + [str(len(rand_lang))] + [str(rand_lang.count(k)) for k in keys]],
['Total', 'English', 'Chinese', 'Japanese'], False)
@@ -307,10 +271,10 @@ def report_ignored(samples: list[Sample]) -> None:
"""
# For frequencies, report who didn't post
table = [["Total users"] + [str(len(s.users)) for s in samples],
["Users who didn't post at all"]
+ [str(len([1 for a in s.user_freqs if a.data == 0])) for s in samples],
["Users who posted less than 1%"]
+ [str(len([1 for a in s.user_freqs if a.data < 0.01])) for s in samples]]
["Users who didn't post at all"] +
[str(len([1 for a in s.user_freqs if a.data == 0])) for s in samples],
["Users who posted less than 1%"] +
[str(len([1 for a in s.user_freqs if a.data < 0.01])) for s in samples]]
Reporter('freq/didnt-post.md').table(table, [s.name for s in samples], True)
@@ -323,7 +287,7 @@ def graph_load_font() -> None:
"""
Load iosevka font for matplotlib
"""
font = os.path.join(RES_DIR, 'iosevka-ss04-regular.ttf')
font = Path(os.path.realpath(__file__)).absolute().parent.joinpath('iosevka-ss04-regular.ttf')
fe = font_manager.FontEntry(font, 'iosevka')
font_manager.fontManager.ttflist.insert(0, fe)
plt.rcParams["font.family"] = "iosevka"
@@ -393,7 +357,9 @@ def graph_line_plot(x: list[datetime], y: Union[list[float], list[list[float]]],
"""
# Filter
if n > 0:
y = scipy.signal.lfilter([1.0 / n] * n, 1, y)
b = [1.0 / n] * n
a = 1
y = scipy.signal.lfilter(b, a, y)
border_color = '#5b3300'
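As an aside on what this filter call computes (a minimal sketch assuming scipy is installed): with b = [1.0 / n] * n and a = 1, scipy.signal.lfilter is a trailing n-sample moving average.

import scipy.signal

n = 3
y = [0.0, 3.0, 6.0, 9.0, 12.0]
# Each output sample is the mean of the current and previous n - 1 samples,
# with samples before the start treated as zero.
print(scipy.signal.lfilter([1.0 / n] * n, 1, y))  # [0. 1. 3. 6. 9.]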
@@ -440,7 +406,9 @@ def graph_line_plot(x: list[datetime], y: Union[list[float], list[list[float]]],
# Plotting frequency, add in the COVID cases data
if freq:
c = map_to_dates(get_covid_cases_us(), [d.isoformat()[:10] for d in x])
cases = get_covid_cases_us()
c = map_to_dates(cases.cases, [d.isoformat()[:10] for d in x])
# c = scipy.signal.savgol_filter(c, 45, 2)
c = filter_days_avg(c, 7)
c = scipy.signal.lfilter([1.0 / n] * n, 1, c)
@@ -515,20 +483,13 @@ def report_change_different_n(sample: Sample) -> None:
:param sample: Sample
:return: None
"""
for n in [5, 10, 15]:
for n in range(5, 16, 5):
graph_line_plot(sample.dates, sample.date_pops, f'change/n/{n}.png',
f'COVID-posting popularity ratio over time for {sample.name} IIR(n={n})',
False, n)
def report_change_graphs(sample: Sample) -> None:
"""
Report COVID-posting popularity ratio vs. time and COVID-posting frequency vs time,
both with IIR(10) filter
:param sample: Sample
:return: None
"""
graph_line_plot(sample.dates, sample.date_pops, f'change/pop/{sample.name}.png',
f'COVID-posting popularity ratio over time for {sample.name} IIR(10)',
False, 10)
@@ -540,9 +501,6 @@ def report_change_graphs(sample: Sample) -> None:
def report_all() -> None:
"""
Generate all reports
Preconditions:
- Twitter data have been downloaded and processed.
"""
graph_load_font()
@@ -563,8 +521,6 @@ def report_all() -> None:
report_change_graphs(s)
report_change_different_n(samples[0])
# python_ta thinks that s is shadowing again but the other instance is in the for loop above
# or in another comprehension so clearly there is no shadowing
graph_line_plot(samples[0].dates, [s.date_pops for s in samples], 'change/comb/pop.png',
'COVID-posting popularity ratio over time for all samples - IIR(10)', False, 10,
labels=[s.name for s in samples])
@@ -574,13 +530,12 @@ def report_all() -> None:
if __name__ == '__main__':
# python_ta.contracts.check_all_contracts()
python_ta.check_all(config={
'extra-imports': ['os.path', 'dataclasses', 'datetime', 'pathlib', 'typing', 'matplotlib',
'matplotlib.dates', 'matplotlib.ticker', 'scipy.signal', 'collect_others',
'processing', 'constants', 'utils'
], # the names (strs) of imported modules
'allowed-io': ['report_all'], # the names (strs) of functions that call print/open/input
'max-line-length': 100,
'disable': ['R1705', 'C0200', 'E9988', 'E9969', 'R0902', 'R1702', 'R0913']
}, output='pyta_report.html')
report_all()
# samples = load_user_sample()
# combine_tweets_for_sample([u.username for u in samples.most_popular], '500-pop')
# combine_tweets_for_sample([u.username for u in samples.random], '500-rand')
# combine_tweets_for_sample(samples.english_news, 'eng-news')
# tweets = load_combined_tweets('500-pop')
# print(len(tweets))
# view_covid_tweets_date(tweets)
+29
@@ -0,0 +1,29 @@
from dataclasses import dataclass
import requests
@dataclass
class CasesData:
# cases[date in "YYYY-MM-DD"] = 7-day average of cases around that date
cases: dict[str, float]
deaths: dict[str, float]
def get_covid_cases_us() -> CasesData:
"""
Get the US COVID-19 cases data from https://github.com/nytimes/covid-19-data by New York Times
:return: Cases data
"""
url = 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/rolling-averages/us.csv'
csv = requests.get(url).text.replace('\r\n', '\n').split('\n')[1:]
data = CasesData(dict(), dict())
# Parse CSV
for line in csv:
split = line.split(',')
day, cases, deaths = split[0], split[2], split[6]
data.cases[day] = float(cases)
data.deaths[day] = float(deaths)
return data
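A hypothetical usage of this new module (requires network access; the date below is illustrative):

from raw_collect.others import get_covid_cases_us

data = get_covid_cases_us()
# 7-day averages of new US cases and deaths around the given date
print(data.cases.get('2021-01-01'), data.deaths.get('2021-01-01'))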
@@ -1,25 +1,19 @@
"""CSC110 Fall 2021 Project
"""
This module interacts directly with the Twitter API to download tweets and users.
It contains functions related scraping users/tweets, including:
- getting the tweets of a user
- downloading many users by checking their followers and follower's followers, etc.
"""
import json
import math
import os
import random
import time
from typing import List, Union
from typing import List
import tweepy
from tweepy import API, TooManyRequests, User, Tweet, Unauthorized, NotFound
import python_ta
import python_ta.contracts
from constants import TWEETS_DIR, USER_DIR
from utils import Config, debug, calculate_rate_delay, write, json_stringify, read
from utils import *
def tweepy_login(conf: Config) -> tweepy.API:
@@ -63,15 +57,14 @@ def download_all_tweets(api: API, screen_name: str,
Twitter API Reference
--------
It will be using the API endpoint api.twitter.com/statuses/user_timeline (Documentation:
https://developer.twitter.com/en/docs/twitter-api/v1/tweets/timelines/api-reference/get
-statuses-user_timeline)
https://developer.twitter.com/en/docs/twitter-api/v1/tweets/timelines/api-reference/get-statuses-user_timeline)
This endpoint has a rate limit of 900 requests / 15-minutes = 60 rpm for user auth, and it has a
limit of 100,000 requests / 24 hours = 69.44 rpm independent of authentication method. To be
safe, this function uses a rate limit of 60 rpm.
:param api: Tweepy API object
:param screen_name: Screen name of that individual
:param download_if_exists: Whether to download if it already exists (Default: False)
:param download_if_exists: Whether or not to download if it already exists (Default: False)
:return: None
"""
# Ensure directories exist
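calculate_rate_delay is imported from utils but its body is not shown in this diff; assuming it simply converts a requests-per-minute limit into a per-request sleep, a sketch would be:

def calculate_rate_delay_sketch(rate_limit: float) -> float:
    """Seconds to sleep between requests to stay under rate_limit requests per minute."""
    return 60.0 / rate_limit

# 900 requests / 15 minutes = 60 rpm, so wait one second between requests
print(calculate_rate_delay_sketch(60))  # 1.0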
@@ -129,10 +122,10 @@ def download_all_tweets(api: API, screen_name: str,
def download_users_start(api: API, start_point: str, n: float = math.inf) -> None:
"""
This function downloads n Twitter users by using a friends-chain.
This function downloads n twitter users by using a friends-chain.
Since there isn't an API or a database with all Twitter users, we can't obtain a strict list
of all Twitter users, nor can we obtain a list of strictly random or most popular Twitter
Since there isn't an API or a database with all twitter users, we can't obtain a strict list
of all twitter users, nor can we obtain a list of strictly random or most popular twitter
users. Therefore, we use the method of follows chaining: we start from a specific individual,
obtain their followers, and pick 6 random individuals from the friends list. Then, we repeat
the process for the selected friends: we pick 6 random friends of the 6 random friends
@@ -152,7 +145,7 @@ def download_users_start(api: API, start_point: str, n: float = math.inf) -> Non
https://developer.twitter.com/en/docs/twitter-api/v1/accounts-and-users/follow-search-get-users/api-reference/get-friends-list)
This will limit the rate of requests to 15 requests in a 15-minute window, which is one request
per minute. But it is actually the fastest method of downloading a wide range of users on
Twitter because it can download a maximum of 200 users at a time while the API for downloading
twitter because it can download a maximum of 200 users at a time while the API for downloading
a single user is limited to only 900 queries per 15, which is only 60 users per minute.
There is another API endpoint that might do the job, which is api.twitter.com/friends/ids (Doc:
@@ -227,7 +220,7 @@ def download_users_execute(api: API, n: float,
print("Executing friends-chain download:")
print(f"- n: {n}")
print(f"- Requests per minute: {1}")
print(f"- Requests per minute: 1")
print(f"- Directory: {USER_DIR}")
print(f"- Downloaded: {len(downloaded)}")
print(f"- Current search set: {len(current_set)}")
@@ -268,7 +261,6 @@ def download_users_execute(api: API, n: float,
screen_names.sort(key=lambda x: x[1])
# Add 3 random users to the next set
# python_ta thinks that u is not indexable but it is, because it is a tuple of length 2
if len(screen_names) > 3:
samples = {u[0] for u in random.sample(screen_names, 3)}
else:
@@ -305,11 +297,12 @@ def download_users_execute(api: API, n: float,
if __name__ == '__main__':
python_ta.contracts.check_all_contracts()
python_ta.check_all(config={
'extra-imports': ['json', 'math', 'os', 'random', 'time', 'typing', 'tweepy', 'constants',
'utils'], # the names (strs) of imported modules
'allowed-io': ['download_users_execute'],
'max-line-length': 100,
'disable': ['R1705', 'C0200', 'R0913', 'W0212']
}, output='pyta_report.html')
# python_ta.check_all(config={
# 'max-line-length': 100,
# 'disable': ['R1705', 'C0200', 'E9998', 'E9999']
# })
config = load_config('config.json5')
tweepy_api = tweepy_login(config)
# download_users_start(tweepy_api, 'sauricat')
download_users_resume_progress(tweepy_api)
+18 -34
@@ -1,10 +1,5 @@
"""CSC110 Fall 2021 Project
This module generates report HTML and serves it in an HTTP server.
"""
import json
import os.path
import shutil
import traceback
import webbrowser
from distutils.dir_util import copy_tree
@@ -12,24 +7,21 @@ from pathlib import Path
from flask import Flask, send_from_directory, Response
import python_ta
import python_ta.contracts
from constants import REPORT_DIR, DEBUG, RES_DIR
from constants import REPORT_DIR, DEBUG
from utils import read, write
# Constants
src_dir = Path(os.path.realpath(__file__)).parent
def generate_report() -> str:
"""
Compile the report document and generate a markdown report
Preconditions:
- RES_DIR exists, and contains the necessary resources used in this project.
:return: Markdown report
"""
# Load markdown
md = read(os.path.join(RES_DIR, './report_document.md')).replace('\r\n', '\n').split('\n')
md = read(str(src_dir.joinpath('report_document.md'))).replace('\r\n', '\n').split('\n')
# Process line by line
for i in range(len(md)):
@@ -38,7 +30,6 @@ def generate_report() -> str:
continue
# Process @include statements
# noinspection PyBroadException
try:
path = line[line.index('`') + 1:]
path = path[:path.index('`')]
@@ -47,7 +38,7 @@ def generate_report() -> str:
# Cut lines
# Format: @include-cut `path` <start, inclusive> [end, not inclusive]
if line.startswith('@include-cut'):
args = [int(j) for j in line.split()[2:]]
args = [int(i) for i in line.split()[2:]]
if len(args) == 1:
md[i] = '\n'.join(md[i].split('\n')[args[0]:])
if len(args) == 2:
@@ -57,14 +48,14 @@ def generate_report() -> str:
# Format: @include-lines `path` <...lines>
# Example: @include-lines `path` 1 2 5
if line.startswith('@include-lines'):
args = [int(j) for j in line.split()[2:]]
args = [int(i) for i in line.split()[2:]]
lines = md[i].split('\n')
lines = [lines[ln] for ln in range(len(lines)) if ln in args]
md[i] = '\n'.join(lines)
# Handle errors. (It prompts "too broad an exception clause" but I actually need to catch
# every possible exception.)
except Exception:
except Exception as e:
md[i] = f"<pre class=\"error\">" \
f"\nInvalid @include statement. \n{traceback.format_exc()}</pre>"
@@ -77,10 +68,10 @@ def generate_html() -> str:
:return: HTML string
"""
# Generate markdown report and JSON encode it (which works as JS code! amazing)
# Generate markdown report and JSON encode it (which works as JS code! amazing
md_json = json.dumps({'content': generate_report()})
# Inject into HTML
html = read(os.path.join(RES_DIR, 'report_page.html')) \
html = read(str(src_dir.joinpath('report_page.html'))) \
.replace('`{{markdown}}`', md_json)
return html
@@ -92,11 +83,11 @@ def write_html() -> None:
:return: None
"""
if os.path.isdir('./dist'):
shutil.rmtree('./dist')
Path('./dist/html').mkdir(parents=True, exist_ok=True)
os.remove('./dist')
Path('./dist/resources').mkdir(parents=True, exist_ok=True)
write('./dist/index.html', generate_html())
copy_tree(os.path.join(RES_DIR, 'html/'), './dist/html')
copy_tree(str(src_dir.joinpath('resources/').absolute()), './dist/resources')
copy_tree(REPORT_DIR, './dist')
@@ -126,14 +117,14 @@ def serve_report() -> None:
@app.route('/<path:path>')
def res(path: str) -> Response:
"""
Resources endpoint. This function maps report queries to the report directory
Resources endpoint. This maps report queries to the report directory
:param path: Path of the resource
:return: File resource or 404
"""
return send_from_directory(Path(REPORT_DIR).absolute(), path)
@app.route('/html/<path:path>')
@app.route('/resources/<path:path>')
def js_res(path: str) -> Response:
"""
JS Resource endpoint. This maps JS and CSS queries to the resources directory
@@ -141,7 +132,7 @@ def serve_report() -> None:
:param path: Path of the resource
:return: File resource or 404
"""
return send_from_directory(os.path.join(RES_DIR, 'html'), path)
return send_from_directory(os.path.join(src_dir, 'resources'), path)
# Run app
webbrowser.open("http://localhost:8080")
@@ -149,12 +140,5 @@ def serve_report() -> None:
if __name__ == '__main__':
python_ta.contracts.check_all_contracts()
python_ta.check_all(config={
'extra-imports': ['json', 'os.path', 'shutil', 'traceback', 'webbrowser',
'distutils.dir_util', 'pathlib', 'flask', 'constants', 'utils'
], # the names (strs) of imported modules
'allowed-io': [], # the names (strs) of functions that call print/open/input
'max-line-length': 100,
'disable': ['R1705', 'C0200', 'R1702', 'W0703']
})
write_html()
serve_report()
@@ -25,7 +25,7 @@ We also counted the number of people speaking each language:
2. We also downloaded all tweets from our sampled users through the user-timeline API [(documentation)](https://developer.twitter.com/en/docs/twitter-api/v1/tweets/timelines/api-reference/get-statuses-user_timeline). Due to rate limiting, the program took around 16 hours to finish, and we obtained 7.7 GB of raw data (uncompressed). During processing, for each tweet, we extracted only its date, popularity (likes + retweets), whether it is a retweet, and whether it is COVID-related. The text of the tweets is not retained, and the processed data directory `data/twitter/user-tweets/processed/` is 141.6 MB in total.
3. We also used the COVID-19 daily cases data published by New York Times [[3]](#ref3) to compare with peaks and through in our frequency over date graph.
3. We also used the COVID-19 daily cases data published by the New York Times [[3]](#ref3) to compare with peaks and troughs in our frequency-over-date graph.
## Computation & Filtering
@@ -80,7 +80,7 @@ $$ \text{pop_ratio}_i = \frac{ \sum_{u \in \text{Users}} \left(\frac{\sum\text{P
After calculation, `freqs` and `pops` are plotted in line graphs against `dates`. Initially, we saw graphs with very high peaks such as the graph below. After some investigation, we found that these peaks are caused by not having enough tweets on each day to average out the random error of one single popular tweet. For example, in the graph below, we adjusted the program to print different users' popularity ratios whenever we found an average popularity ratio greater than 20, which produced the output on the right. As it turns out, on 2020-07-11, the user @juniorbachchan posted that he and his father tested positive, and that single post is 163.84 times more popular than the average of all his posts. (The post is linked [here](https://twitter.com/juniorbachchan/status/1282018653215395840); it has 235k likes, 25k comments, and 32k retweets.) Even though these data points are outliers, there isn't an effective way of removing them since we don't have enough tweet data from each user to calculate their range (for example, someone's COVID-related post might be the only one they've posted). So, we've decided to limit the viewing window to `y = [0, 2]` as shown in the graph on the right.
<div class="image-row">
<div><img src="html/peak-1.png" alt="graph"></div>
<div><img src="resources/peak-1.png" alt="graph"></div>
<div style="display: flex; flex-direction: column; justify-content: center"><pre>
Date: 2020-07-11
- JoeBiden 1.36
@@ -90,7 +90,7 @@ Date: 2020-07-11
- gucci 0.13
- StephenKing 0.61
</pre></div>
<div><img src="html/peak-2.png" alt="graph"></div>
<div><img src="resources/peak-2.png" alt="graph"></div>
</div>
Then, we encountered the issue of noise. When we plotted the graph without a filter, we found that it was very noisy. We decided to average the results over 7 days. We also experimented with different filters from the `scipy` library and different parameter values, and chose to use an IIR filter with `n = 10`.
@@ -187,11 +187,16 @@ In summary, key findings in our research include that while news channels post a
These findings might not be surprising, but they again demonstrate people's ability to adapt to new environments. The sensational effect of the start of COVID-19 might be similar to the grief of losing something important: both fade over time as we adapt. Even though people focused a lot of attention on COVID-19 when new information first became available in March 2020, their interest in these topics decreased within three months as they adapted to the new norm, as demonstrated by the quickly decreasing posting rates. Or, for the audience, rather than liking or commenting on COVID-19 posts, they might have quickly scrolled past them in favor of more interesting posts. It is fascinating that we can learn to adapt to such a devastating change in our environment in only three months.
## TODO
* [ ] Frequency/time: Maybe there's a reason for the May 2021 peak?
* [ ] Followers (x) vs COVID-related posts (y) scatter plot, each point is a user
# References
<a id="ref1"></a>
[1] Bremmen, N. (2010, September 3). The 100 most influential news media Twitter accounts. _Memeburn_. Retrieved November 27, 2021, from https://memeburn.com/2010/09/the-100-most-influential-news-media-twitter-accounts/.
[1] Bremmen, N. (2010, September 3). The 100 most influential news media twitter accounts. _Memeburn_. Retrieved November 27, 2021, from https://memeburn.com/2010/09/the-100-most-influential-news-media-twitter-accounts/.
<a id="ref2"></a>
@@ -3,16 +3,16 @@
<head>
<meta charset="UTF-8">
<title>CSC110 Report</title>
<link rel="stylesheet" href="html/style.css">
<link rel="stylesheet" href="resources/style.css">
</head>
<body>
<div id="content">
</div>
<script src="html/marked.min.js"></script>
<script src="html/jquery.min.js"></script>
<script src="html/polyfill.es6.min.js"></script>
<script src="html/mathjax-tex-mml-chtml.js"></script>
<script src="resources/marked.min.js"></script>
<script src="resources/jquery.min.js"></script>
<script src="resources/polyfill.es6.min.js"></script>
<script src="resources/mathjax-tex-mml-chtml.js"></script>
<script>

(Image diff omitted: two image files changed, 28 KiB and 60 KiB; each file has the same size before and after.)
+45
@@ -0,0 +1,45 @@
from process.twitter_process import *
from raw_collect.twitter import *
from utils import *
if __name__ == '__main__':
# conf = load_config('config.json5')
# api = tweepy_login(conf)
# print(json_stringify(api.get_user(screen_name="sauricat")._json, indent=2))
# keywords = '⚧; mtf; ftm; transgender; 药娘; 🍥; they/them'.split('; ')
#
# base_dir = './data/twitter/user'
#
# users = []
#
# # for f in ['NASAspaceplace.json']:
# for f in os.listdir(f'{base_dir}/users'):
# s = read(f'{base_dir}/users/{f}')
# j = json.loads(s)
# s = ''.join(j[k] for k in ['name', 'description'])
# if any(k in s.lower() for k in keywords):
# # print([k in s.lower() for k in keywords])
# print(f)
# users.append((j['screen_name'], j['name'], j['description'], j['followers_count']))
#
# write('trans.json', json_stringify(users, 2))
# print(len(users))
# time.sleep(5)
# print(get_user_popularity_ranking('danieltosh'))
# for f in os.listdir(f'{USER_DIR}/users'):
# os.rename(f, f.lower())
# combine_tweets_for_sample(['abc', 'wsj'], 'test')
start = time.time()
for i in range(1000000):
dateutil.parser.isoparse('2020-01-01T01:01:01')
print(f'dateutil.parser.isoparse took {time.time() - start:.2f} seconds')
start = time.time()
for i in range(1000000):
parse_date('2020-01-01T01:01:01')
print(f'parse_date took {time.time() - start:.2f} seconds')
+41 -110
@@ -1,28 +1,22 @@
"""CSC110 Fall 2021 Project
This module contains useful functions and classes, including:
"""This module contains useful functions and classes, including:
- debug messages
- file I/O
- statistics functions, removing outliers and averaging values over a period
- date-related functions
- classes for configs, reports, statistics, and JSON
"""
- classes for configs, reports, statistics, and JSON"""
import dataclasses
import doctest
import inspect
import json
import os
import statistics
import math # python_ta complains about unused import but it's used in a doctest
from dataclasses import dataclass
from datetime import datetime, date, timedelta
from pathlib import Path
from typing import Union, Any, Generator
from typing import Union, NamedTuple, Any, Generator
import json5
import numpy as np
import python_ta
import python_ta.contracts
from tabulate import tabulate
from constants import REPORT_DIR, DEBUG
@@ -34,16 +28,16 @@ class Config:
Secrets configuration for this program.
Attributes:
- consumer_key: The consumer key from the Twitter application portal
- consumer_secret: The consumer secret from the Twitter application portal
- access_token: The access token of an app from the Twitter application portal
- access_secret: The access secret of an app from the Twitter application portal
- consumer_key: The consumer key from the Twitter application portal
- consumer_secret: The consumer secret from the Twitter application portal
- access_token: The access token of an app from the Twitter application portal
- access_secret: The access secret of an app from the Twitter application portal
Representation Invariants:
- self.consumer_key != ''
- self.consumer_secret != ''
- self.access_token != ''
- self.access_secret != ''
- self.consumer_key != ''
- self.consumer_secret != ''
- self.access_token != ''
- self.access_secret != ''
"""
# Twitter's official API v1 keys
consumer_key: str
@@ -54,8 +48,7 @@ class Config:
def load_config(path: str = 'config.json5') -> Config:
"""
Load config using JSON5, from either the local file ~/config.json5 or from the environment
variable named config.
Load config using JSON5, from either the local file ~/config.json5 or from the environment variable named config.
:param path: Path of the config file (Default: config.json5)
:return: Config object
@@ -94,9 +87,6 @@ def write(file: str, text: str) -> None:
"""
Write text to a file
Preconditions:
- file != ''
:param file: File path (will be converted to lowercase)
:param text: Text
:return: None
@@ -114,9 +104,6 @@ def read(file: str) -> str:
"""
Read file content
Preconditions:
- file != ''
:param file: File path (will be converted to lowercase)
:return: None
"""
@@ -129,11 +116,8 @@ class Reporter:
Report file creator
Attributes:
- report: The string of the report
- file: Where the report is stored
Representation Invariants:
- self.file != ''
- report: The string of the report
- file: Where the report is stored
"""
report: str
file: str
@@ -159,11 +143,6 @@ class Reporter:
self.save()
def save(self) -> None:
"""
Save the report to the file
:return: None
"""
write(self.file, self.report)
def table(self, table: list[list[str]], headers: list[str], header_code: bool = False) -> None:
@@ -186,11 +165,8 @@ def remove_outliers(points: list[float], z_threshold: float = 3.5) -> list[float
Credit to: https://stackoverflow.com/a/11886564/7346633
Preconditions:
- len(points) > 0
:param points: Input points list
:param z_threshold: Z threshold for identifying whether a point is an outlier
:param z_threshold: Z threshold for identifying whether or not a point is an outlier
:return: List with outliers removed
"""
x = np.array(points)
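Only the first line of the body appears in this hunk; the linked Stack Overflow answer is the median-absolute-deviation ("modified z-score") method, so a hedged sketch of that approach looks like:

import numpy as np

def remove_outliers_sketch(points: list[float], z_threshold: float = 3.5) -> list[float]:
    x = np.array(points)
    median = np.median(x)
    mad = np.median(np.abs(x - median))       # median absolute deviation
    if mad == 0:
        return list(points)
    modified_z = 0.6745 * (x - median) / mad  # 0.6745 is the usual consistency constant
    return x[np.abs(modified_z) < z_threshold].tolist()

print(remove_outliers_sketch([1.0, 1.1, 0.9, 1.0, 50.0]))  # the 50.0 point is dropped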
@@ -214,12 +190,12 @@ class Stats:
Data class storing the statistics of a sample
Attributes:
- mean: The average of the sample
- stddev: The standard deviation
- median: The median value of the sample, or the 50th percentile
- iqr: The interquartile-range (75th percentile - 25th percentile)
- q25: The first quartile, or the 25th percentile
- q75: The third quartile, or the 75th percentile
- mean: The average of the sample
- stddev: The standard deviation
- median: The median value of the sample, or the 50th percentile
- iqr: The interquartile-range (75th percentile - 25th percentile)
- q25: The first quartile, or the 25th percentile
- q75: The third quartile, or the 75th percentile
"""
mean: float
stddev: float
@@ -233,9 +209,6 @@ def get_statistics(points: list[float]) -> Stats:
"""
Calculate statistics for a set of points
Preconditions:
- len(points) > 0
:param points: Input points
:return: Statistics
"""
@@ -253,7 +226,6 @@ def tabulate_stats(stats: list[Stats], percent: bool = False) -> list[list[str]]
:param percent: Whether the numbers are percentages
:return: Table for tabulate
"""
def num(n: float) -> str:
return f'{n:.2f}' if not percent else f'{n * 100:.1f}%'
@@ -272,8 +244,8 @@ def parse_date_time(iso: str) -> datetime:
python's built-in dateutil.parser.isoparse() function.
Preconditions:
- iso is the output of datetime.isoformat() (In a format like "2021-10-20T23:50:14")
- iso is a valid date (this function does not check for the validity of the input)
- iso is the output of datetime.isoformat() (In a format like "2021-10-20T23:50:14")
- iso is a valid date (this function does not check for the validity of the input)
:param iso: Input date
:return: Datetime object
@@ -287,8 +259,8 @@ def parse_date_only(iso: str) -> datetime:
Parse date faster.
Preconditions:
- iso starts with the format of "YYYY-MM-DD" (e.g. "2021-10-20" or "2021-10-20T10:04:14")
- iso is a valid date (this function does not check for the validity of the input)
- iso is in the format of "YYYY-MM-DD" (e.g. "2021-10-20")
- iso is a valid date (this function does not check for the validity of the input)
:param iso: Input date
:return: Datetime object
@@ -300,12 +272,8 @@ def daterange(start_date: str, end_date: str) -> Generator[tuple[str, datetime],
"""
Date range for looping, excluding the end date
Preconditions:
- start_date starts with the "YYYY-MM-DD" format
- end_date starts with the "YYYY-MM-DD" format
:param start_date: Start date in "YYYY-MM-DD" format
:param end_date: Ending date in "YYYY-MM-DD" format
:param end_date: End date in "YYYY-MM-DD" format
:return: Generator for looping through the dates one day at a time.
"""
start = parse_date_only(start_date)
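An illustrative loop over daterange (dates are arbitrary; the end date is excluded, as the docstring says, and the yielded pair is the date string with its datetime object):

for ds, dt in daterange('2020-03-01', '2020-03-04'):
    print(ds)  # presumably 2020-03-01, 2020-03-02, 2020-03-03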
@@ -317,12 +285,12 @@ def daterange(start_date: str, end_date: str) -> Generator[tuple[str, datetime],
def map_to_dates(y: dict[str, Union[int, float]], dates: list[str],
default: float = 0) -> list[float]:
"""
Takes y-axis data in the form of a mapping of dates to values, and returns a list of all the
Takes y-axis data in the form of a mapping of date to values, and returns a list of all the
values mapped to the date in dates. If a date in dates isn't in y, then the default values is
used instead.
Preconditions:
- The date in dates must be in the same format as the dates in the keys of y
- The date in dates must be in the same format as the dates in the keys of y
:param y: Y axis data (in the format y[date] = value)
:param dates: Dates
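A small illustration of the mapping described above (values are made up):

y = {'2020-03-01': 5.0, '2020-03-03': 2.0}
dates = ['2020-03-01', '2020-03-02', '2020-03-03']
print(map_to_dates(y, dates))  # [5.0, 0, 2.0] -- the missing date falls back to the default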
@@ -334,21 +302,11 @@ def map_to_dates(y: dict[str, Union[int, float]], dates: list[str],
def filter_days_avg(y: list[float], n: int) -> list[float]:
"""
Filter y by taking an average over an n-days window. If n = 0, then return y without processing.
Filter y by taking an average over a n-days window. If n = 0, then return y without processing.
Preconditions:
- n % 2 == 1
- len(y) > 0
>>> actual = filter_days_avg(list(range(10)), 3)
>>> expected = [1/3, 1, 2, 3, 4, 5, 6, 7, 8, 26/3]
>>> all(math.isclose(actual[i], expected[i]) for i in range(10))
True
>>> actual = filter_days_avg(list(range(10)), 5)
>>> expected = [0.6, 1.2, 2, 3, 4, 5, 6, 7, 7.8, 8.4]
>>> all(math.isclose(actual[i], expected[i]) for i in range(10))
True
Precondition:
- n % 2 == 1
- len(y) > 0
:param y: Values
:param n: Number of days, must be odd
@@ -369,18 +327,13 @@ def filter_days_avg(y: list[float], n: int) -> list[float]:
current_sum += y[i] # adding the values in y[1:r + 1]
ret = []
# python_ta says "unnecessary indexing" because the index only accesses items from 1 list.
# But a look at the code tells you that iterating is not sufficient for the sliding window,
# random access is actually better, because prior elements need to be accessed as well, and
# using a queue seems too silly. The other option is to literally extend the array but that
# takes extra space instead of O(1) space, so why do that?
for i in range(len(y)):
left, right = i - radius, i + radius
left = max(0, left) # avoid index out of bounds by "extending" first/last element
right = min(right, len(y) - 1)
current_sum += y[right] # extend sliding window
l, r = i - radius, i + radius
l = max(0, l) # avoid index out of bounds by "extending" first/last element
r = min(r, len(y) - 1)
current_sum += y[r] # extend sliding window
ret.append(current_sum / n)
current_sum -= y[left] # remove old values
current_sum -= y[l] # remove old values
return ret
@@ -389,7 +342,7 @@ def divide_zeros(numerator: list[float], denominator: list[float]) -> list[float
Divide two lists of floats, ignoring zeros (anything dividing by zero will produce zero)
Preconditions:
- len(numerator) == len(denominator)
- len(numerator) == len(denominator)
:param numerator: Numerator
:param denominator: Denominator
@@ -401,19 +354,13 @@ def divide_zeros(numerator: list[float], denominator: list[float]) -> list[float
output[i] = 0
else:
output[i] = numerator[i] / denominator[i]
# This marks it as incorrect type, but it's actually not incorrect type, just because numpy
# This marks it as incorrect type but it's actually not incorrect type, just because numpy
# doesn't specify its return types
# noinspection PyTypeChecker
return output.tolist()
class EnhancedJSONEncoder(json.JSONEncoder):
"""
An improvement to the json.JSONEncoder class, which supports:
encoding for dataclasses, encoding for datetime, and sets
"""
def default(self, o: object) -> object:
def default(self, o):
# Support encoding dataclasses
# https://stackoverflow.com/a/51286749/7346633
@@ -432,29 +379,13 @@ class EnhancedJSONEncoder(json.JSONEncoder):
return super().default(o)
def json_stringify(obj: object, indent: Union[int, None] = None) -> str:
def json_stringify(obj, indent: Union[int, None] = None) -> str:
"""
Serialize json string with support for dataclasses and datetime and sets and with custom
configuration.
Preconditions:
- obj != None
:param obj: Objects
:param indent: Indent size or none
:return: Json strings
"""
return json.dumps(obj, indent=indent, cls=EnhancedJSONEncoder, ensure_ascii=False)
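An illustrative call (the exact formatting of datetimes depends on the default method of EnhancedJSONEncoder above):

from dataclasses import dataclass
from datetime import datetime

@dataclass
class Point:
    x: int
    y: int

# Dataclasses, sets, and datetimes are all handled by the custom encoder.
print(json_stringify({'p': Point(1, 2), 'tags': {'a', 'b'}, 'when': datetime(2021, 12, 13)}, indent=2))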
if __name__ == '__main__':
doctest.testmod()
# python_ta.contracts.check_all_contracts()
python_ta.check_all(config={
'extra-imports': ['dataclasses', 'doctest', 'inspect', 'json', 'os', 'statistics', 'math',
'datetime', 'pathlib', 'typing', 'json5', 'numpy', 'tabulate',
'constants'], # the names (strs) of imported modules
'allowed-io': ['load_config', 'write', 'debug', 'read'],
'max-line-length': 100,
'disable': ['R1705', 'C0200', 'E9994', 'W0611']
}, output='pyta_report.html')
Binary file not shown.
+18 -69
View File
@@ -30,103 +30,52 @@ sorting=nyt
\indent
\begin{itemize}
\item[1.] A wide range of Twitter users: We used Twitter's get friends list API \href{https://developer.twitter.com/en/docs/twitter-api/v1/accounts-and-users/follow-search-get-users/api-reference/get-friends-list}{(documentation)} and the follows-chaining technique to obtain a wide range of Twitter users. This technique is explained in the Computational Overview section. Due to rate limiting, we ran the program for one day and obtained 224,619 users (852.3 MB decompressed). However, only the username, popularity, post count, and language data are used, and the processed (filtered) user dataset \C{data/twitter/user/processed/users.json} is only 7.9 MB in total.
\item[2.] All tweets from sampled users: We selected two samples of 500 users each (the sampling method is explained in the Computational Overview section), and we used the user-timeline API \href{https://developer.twitter.com/en/docs/twitter-api/v1/tweets/timelines/api-reference/get-statuses-user_timeline}{(documentation)} to obtain all of their tweets. Due to rate limiting, the program took around 16 hours to finish, and we obtained 6.07 GB of raw data (uncompressed). During processing, we reduced the data for each tweet to only its date, popularity (likes + retweets), whether it is a retweet, and whether it is COVID-related. The text of the tweets is not retained, and the processed data directory \C{data/twitter/user-tweets/processed} is only 107.9 MB in total.
\item[3.] Top 100 news Twitter accounts by Bremmen (https://memeburn.com/2010/09/the-100-most-influential-news-media-twitter-accounts/)
\item[4.] COVID-19 daily new cases data by the New York Times (https://github.com/nytimes/covid-19-data).
\end{itemize}
\section{Computational Overview}
\subsection*{Data Gathering \& Processing}
\indent
This section explains the data gathering and processing done in \verb|collect_twitter.py|, \verb|collect_others.py|, and \verb|processing.py|. In this step, raw data are collected and processed into the \verb|processed_data.7z| archive that we provided.
To create our samples, we collected a wide range of Twitter users using Twitter's get friends list API endpoint through \textbf{tweepy}, using the follows-chaining technique. We specified a single user as the starting point (in this case, we picked \verb|voxdotcom|). The program then obtains the user's friends list, picks 3 random users and the 3 most-followed users from the friend list, adds them to the queue, and starts the downloading process again from each of the six friends. Because of Twitter's rate limiting on the get friends list endpoint, we can only obtain a maximum of 200 users per minute, with many of them being duplicates. We ran the program continuously for one day and obtained 224,619 users (852.3 MB decompressed). However, only the username, popularity, post count, and language data are kept after processing (filtering). The processed user dataset \verb|data/twitter/user/processed/users.json| is 7.9 MB in total. We selected our samples by first filtering the results by language, taking the top 500 most-followed users as 500-pop, filtering the list again by post count (>1000) and follower count (>150), and then taking a random sample of 500 users as 500-rand.
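As an illustration, a condensed sketch of this follows-chaining crawl is shown below (it is not the exact code in \verb|collect_twitter.py|; it assumes an authenticated \verb|tweepy.API| object and \textbf{tweepy} 4.x, and omits rate-limit handling and deduplication):
\begin{verbatim}
import random
from collections import deque

import tweepy

def follows_chain(api: tweepy.API, start_user: str, max_users: int) -> dict:
    """Crawl users breadth-first: from each friend list, continue from the
    3 most-followed friends and 3 random friends."""
    users = {}
    queue = deque([start_user])
    while queue and len(users) < max_users:
        name = queue.popleft()
        # One page of up to 200 friends per request (heavily rate-limited)
        friends = api.get_friends(screen_name=name, count=200)
        for u in friends:
            users[u.screen_name] = {'followers': u.followers_count,
                                    'posts': u.statuses_count}
        most_followed = sorted(friends, key=lambda u: u.followers_count, reverse=True)
        queue.extend(u.screen_name for u in most_followed[:3])
        queue.extend(u.screen_name for u in random.sample(friends, min(3, len(friends))))
    return users
\end{verbatim}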
We plan to transform different platforms' user posting data, all with unique formats, into data in a platform-independent data model to store and compare. When processing social media data, we will convert platform-dependent keywords such as \texttt{favorites}, \texttt{retweets}, or \texttt{full\_text} on Twitter and \texttt{content}, \texttt{views}, or \texttt{comments} on Telegram into our unified platform-independent model with keywords such as \texttt{popularity} and \texttt{text}. We will store all processed data in \textbf{JSON} before analysis. As for the raw data from different social media platforms, we plan to gather Twitter data using the \textbf{Tweepy} library and Telegram channel data using \textbf{python-telegram-bot}. However, unfortunately, there are no known libraries for WeChat Moments. We will try to obtain WeChat data through packet capture using pyshark, but that might not be successful.
We also downloaded all tweets from our sampled users through the user-timeline API, also with \textbf{tweepy}. Due to rate limiting, the program took around 16 hours to finish, and we obtained 7.7 GB of raw data (uncompressed). During processing, for each tweet, we extracted only its date, popularity (likes + retweets), whether it is a retweet, and whether it is related to the COVID-19 pandemic. The text of the tweets is not retained, and the processed data directory \verb|data/twitter/user-tweets/processed/| is 141.6 MB in total.
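A similarly condensed sketch of the timeline download is shown below (again assuming an authenticated \verb|tweepy.API| object; the real code also handles rate limits and saves the raw responses to disk):
\begin{verbatim}
import tweepy

def download_all_tweets(api: tweepy.API, screen_name: str) -> list:
    """Page through one user's timeline, 200 tweets per request."""
    # tweet_mode='extended' returns the full, untruncated tweet text
    cursor = tweepy.Cursor(api.user_timeline, screen_name=screen_name,
                           count=200, tweet_mode='extended')
    return list(cursor.items())
\end{verbatim}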
For news outlet data, we plan to use \textbf{requests} to obtain raw HTML from different listing sites, extract news article titles, publishers, and publishing dates with \textbf{regex}, and store them using JSON. We will convert the different HTML formats from different news publishers' sites into our platform-independent news model.
To determine whether a post is COVID-related, we used keyword matching with three lists of COVID-related keywords, for English, Chinese, and Japanese. Tweets with content containing any of these keywords are marked as COVID-related.
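A minimal sketch of this check is shown below (the keyword list here is an illustrative placeholder; the project's actual lists are longer and stored separately for each language):
\begin{verbatim}
COVID_KEYWORDS = [
    'covid', 'coronavirus', 'pandemic',  # English
    '新冠', '疫情',                       # Chinese
    'コロナ',                             # Japanese
]

def is_covid_related(text: str) -> bool:
    """Return whether a post's text contains any COVID-related keyword."""
    lowered = text.lower()
    return any(keyword in lowered for keyword in COVID_KEYWORDS)
\end{verbatim}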
We also used the COVID-19 daily cases data published by the New York Times to compare with peaks and troughs in our frequency-over-date graphs, and the program gathered this data by sending an HTTP request to the New York Times' public GitHub repository using \textbf{requests} and then parsing the CSV.
For submission, we packed the processed data into a 7zip archive using \textbf{py7zr}. This is necessary because our processed data are placed very close to the raw data in the folder structure, and creating the archive manually requires separating the processed data from the raw data into two separate folders first. We also used py7zr to pack our HTML resources.
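A minimal sketch of the packing step with \textbf{py7zr} is shown below (the archive name and directory layout here are illustrative and may differ from the actual script):
\begin{verbatim}
import py7zr

with py7zr.SevenZipFile('processed-data.7z', 'w') as archive:
    # Pack only the processed directories, leaving the much larger raw data out
    archive.writeall('data/twitter/user/processed', arcname='twitter/user/processed')
    archive.writeall('data/twitter/user-tweets/processed',
                     arcname='twitter/user-tweets/processed')
\end{verbatim}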
We also used \textbf{json5} to store the configuration of this program, which contains Twitter API keys.
\subsection*{Statistical Visualization Generation}
\indent
This section explains the statistical report generation done in \verb|visualization.py|. In this step, specific "elements" used in our report are generated. For example, an element might be an image of the user frequency graph in one of our samples, and another element might be a Markdown table showing the number of users who posted less than 1\% or didn't post in our samples. Each element is stored in a separate file, which will be included in the visualized report explained in the next section.
We plan to use \textbf{matplotlib} to create data images or \textbf{plotly} to create websites for data visualization. We plan to use \textbf{NumPy} for statistical calculations.
Since the statistical computations behind the report generation are explained in the interactive report itself, this section only focuses on the technical aspects: which libraries we used to complete these computations and generate the statistical elements of the report.
To identify whether or not an article is about COVID, we currently use a keyword search. However, a keyword search might not be accurate when COVID has become such an essential background to our society (i.e., many articles with the word COVID in them are about something else). We might experiment with training a binary classification model with \textbf{Keras} and \textbf{scikit-learn} to better classify COVID articles. We might also experiment with training autoencoders with vectorized word occurrence data in a COVID-related article to find whether there are significant categories within COVID articles (i.e., some COVID articles might be about new COVID policies, and others might just be general updates relating to COVID; this might be an important insight because people's interest in these different types of COVID articles might differ).
We used \textbf{matplotlib} to generate the images displayed in our report, including histograms and line graphs. We used \textbf{scipy} for signal filtering and for smoothing the curves so that they are readable (specifically, \verb|scipy.signal.lfilter|). We used \textbf{numpy} in our statistical calculations to compute percentile points and remove outliers. We then used \textbf{tabulate} to generate Markdown-format tables for report elements.
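The sketch below illustrates the kinds of calls involved (window sizes, percentile cutoffs, and table contents are placeholders, not the parameters actually used in \verb|visualization.py|):
\begin{verbatim}
import numpy as np
from scipy.signal import lfilter
from tabulate import tabulate

def smooth(y: list, n: int = 7) -> np.ndarray:
    """Apply an n-point moving-average FIR filter with scipy.signal.lfilter."""
    return lfilter(np.ones(n) / n, [1.0], y)

def remove_outliers(y: list) -> np.ndarray:
    """Keep only the values between the 1st and 99th percentiles."""
    arr = np.asarray(y)
    low, high = np.percentile(arr, [1, 99])
    return arr[(arr >= low) & (arr <= high)]

# A Markdown-format table, saved as a separate file and included as a report element
print(tabulate([['500-pop', 500], ['500-rand', 500]],
               headers=['Sample', 'Users'], tablefmt='github'))
\end{verbatim}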
The primary type of graph we will use is a frequency histogram: an individual's or a group's frequency of mentioning COVID-related topics will be graphed against the date from January 1, 2020, to November 1, 2021. We will experiment with group sizes and classification methods to find which variables influence the frequency and which don't. (For example, we will group individuals by popularity and compare between groups to find whether popularity impacts the frequency with which they mention COVID-related topics.) We also plan to overlay these charts to better visualize the statistical differences.
\subsection*{Interactive Report Generation}
\indent
This section explains the interactive report creation in \verb|report.py|.
We wrote our report in Markdown format, located in \verb|resources/report_document.md|. However, the default Markdown format doesn't support including the contents of other markdown files generated in the previous step, so we extended the markdown format by adding \verb|@include|, \verb|@include-lines|, and \verb|@include-cut| functionality.
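A simplified sketch of the \verb|@include| expansion is shown below (it handles only the basic directive and is not the exact implementation in \verb|report.py|, which also supports \verb|@include-lines| and \verb|@include-cut|):
\begin{verbatim}
import re
from pathlib import Path

def expand_includes(markdown: str) -> str:
    """Replace each '@include <path>' directive with the contents of that file."""
    def include(match: re.Match) -> str:
        return Path(match.group(1)).read_text(encoding='utf-8')
    return re.sub(r'@include\s+(\S+)', include, markdown)
\end{verbatim}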
Then, to display the Markdown in a webpage, we created a template HTML (\verb|resources/report_page.html|) and used Python to inject the Markdown content into the HTML template. We used \textbf{Marked} (a JS library) to render the Markdown on the webpage; we did not use the Python Markdown library because it does not support the GitHub Markdown table format generated with \textbf{tabulate} in the previous step. Finally, we used the \textbf{Flask} framework to serve the webpage along with the referenced assets like images, JS, and fonts on an HTTP server.
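A minimal sketch of the serving step with \textbf{Flask} (the template placeholder name below is hypothetical, and the real server also serves the images, JS, and fonts referenced by the page):
\begin{verbatim}
from pathlib import Path
from flask import Flask

app = Flask(__name__)

@app.route('/')
def report_page() -> str:
    """Inject the report Markdown into the HTML template and serve it."""
    template = Path('resources/report_page.html').read_text(encoding='utf-8')
    # In the real pipeline the @include directives would be expanded first
    markdown = Path('resources/report_document.md').read_text(encoding='utf-8')
    return template.replace('{{report_markdown}}', markdown)

if __name__ == '__main__':
    app.run()
\end{verbatim}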
On the webpage, we used \textbf{jQuery} (a JS library) to make the images enlargeable. We also imported \textbf{MathJax} (a JS library) to automatically render LaTeX on the webpage (no code needed to reference this library).
Even though the handout required the project to be written purely in Python, the instructors on Piazza allowed us to use web languages as long as all data gathering, processing, computation, visualization, and image rendering are done in pure Python (@1704).
Another variant of the frequency histogram will be plotted not against the date but against the country's confirmed cases, since people's anxiety might be influenced by the growth or decline of confirmed cases. We will also graph some data using this variant to find more insights.
\section{Running Instructions}
\indent
\begin{itemize}
\item [1. ] Download the submitted files into a new folder called \verb|src|.
\item [2. ] Extract the archive \verb|src/resources.7z| into the folder \verb|src/resources|.
\item [3. ] Install \verb|src/requirements.txt|, either with PyCharm or with \texttt{pip install -r src/requirements.txt}.
\item [4. ] If you would like to test out our data collection code or collect data manually, do the following: (We do not recommend collecting all data manually because it took the program two days to gather our data due to rate limiting)
\begin{itemize}
\item [a. ] Register for Twitter API keys on their website. For more information, look at the following link: \href{https://developer.twitter.com/en/docs/twitter-api/getting-started/getting-access-to-the-twitter-api}{(Getting access to the Twitter API)}.
\item [b. ] Copy the Twitter API keys into the corresponding fields in \verb|src/config.json5|
\item [c. ] In \verb|src/main.py|, uncomment all the lines of code for data collection and processing: that is, the steps C1.0-C1.2, P1-P2, C2.1-C2.3, P3.
\end{itemize}
\item [5. ] If you would like to use our processed data, you can download the archive from \url{https://send.utoronto.ca} with the following code, which will expire on December 27.\\
Claim ID: 6PPMsHQNTV7TJRmu\\
Claim Passcode: 9VPba4YiYx2cetbU\\
Alternatively, you can download it from a permanent link: \url{https://csc110.hydev.org/processed-data.7z}\\
Extract the archive into a directory called \verb|data| at the same level as \verb|src|, that is, \verb|data| and \verb|src| should be in the same folder.\\
The file \verb|src/constants.py| contains a more detailed directory tree.
\item [6. ] Run \verb|src/main.py|, either in PyCharm or with \texttt{python3 main.py}. Note that the execution directory should be in \verb|src| and not the root directory. If you use PyCharm, you should open \verb|src| in PyCharm instead of the root directory. This file structure is intentionally designed to prevent PyCharm from indexing \verb|data|, which takes an extremely long time.
\end{itemize}
TODO
\section{Changes to Proposal}
\indent
First, we originally planned to include news reports from separate journal websites in our analysis as well. However, when we gathered the data, we found that there is no way to identify the popularity of a news report published on a journal website. So, we decided to gather the tweets of news accounts on Twitter instead, which also has the benefit of using the same data gathering and analysis process for each news channel.
Second, we originally planned to compare people's interest in posting COVID-related topics between different platforms because we thought Chinese people don't rely on Twitter as much, since Twitter is blocked in China. However, there isn't any publicly available WeChat API that we can use for analysis, and WeChat is also more private, with access to someone's posts limited to only their friends (it is as if everyone on Twitter had a locked account). Therefore, it is impractical to gather data from WeChat. As for Telegram channels, posts do not have a like feature and might not have a commenting feature unless the channel host specifically sets one up using a third-party bot, so there isn't a reliable way to obtain popularity data on Telegram either. So, instead of comparing between platforms, we compared different groups of people on the Twitter platform.
\section{Discussion}
\indent
To try to answer our research questions, we made 3 user samples: \texttt{500-pop}, composed of the 500 most popular users on Twitter, \texttt{500-rand}, 500 random users on Twitter, and \texttt{eng-news}, the top 100 English news channels on Twitter (3).
Our first research question asks: \textbf{how frequently do people post about COVID-related issues, and how interested are people in seeing COVID-related posts?}
The histograms showing the frequency of COVID-related posts all tend to be skewed right, i.e. decreasing. In all 3 samples of \texttt{500-pop}, \texttt{500-rand}, and \texttt{eng-news}, for every user, the majority of their posts were not COVID-related. However, news channels tended to post more about COVID than \texttt{500-pop}, who in turn tended to post more about COVID than \texttt{500-rand}. This suggests that the \texttt{500-pop} users post more than the average person about COVID, possibly because they have a large influence and want to share their opinion, but they post less about COVID than news channels, who have to report the latest news about the pandemic, regulations, and slowing the spread.
The histograms showing the popularity ratios are also skewed right, except for the news sample, which is almost centered. This shows that COVID-related posts outside of news channels usually receive poor engagement. News channels, however, receive almost as much engagement on COVID posts as on their other posts, suggesting that users treat COVID news much like regular news.
Our second research question tackled the same issues, but asked how they changed over time. From the line graphs showing COVID-posting frequency over time, the frequency tends to decrease for the \texttt{500-rand} and \texttt{eng-news} samples, although \texttt{500-pop} and \texttt{eng-news} shared a peak around December 2020 and a trough around June 2021, which seems to be related to the rise and fall of COVID cases in the US. Somewhat surprisingly, the news has stopped posting as many COVID-related posts, maybe because they are starting to use other keywords such as "Delta variant". In addition, there is a large spike in May 2021 for the \texttt{500-pop} sample, which is also around when the Delta variant started appearing. This could mean that the news started talking about the variant more, while popular Twitter users talked about its consequences. However, the \texttt{500-rand} sample has seen a decrease in frequency, suggesting that they post less about it, possibly because their COVID posts get less engagement, or because of a general decrease in interest in COVID as a topic. We speculate that people may be getting used to seeing COVID-related news all the time and, just like background noise, are starting to ignore it (a process called habituation).
The line graphs showing popularity ratio over time, however, were a lot messier. Despite attempts to filter out the noise, there were constant spikes throughout the entire graph, making it difficult to draw any conclusions. This could be because the sample size is too small, or because the method of determining popularity isn't sophisticated enough and is prone to random noise. Unfortunately, this means that further research is required to answer this question.
\nocite{*}
\printbibliography
TODO
\end{document}
-16
View File
@@ -1,16 +0,0 @@
@misc{matplotlib, title={Overview}, url={https://matplotlib.org/stable/index.html}, journal={Overview - Matplotlib 3.5.0 documentation}, author={Hunter, John and Droettboom, Michael and Firing, Eric and Dale, Darren}, year={2021}, month={Aug}}
@misc{json5, title={JSON5}, url={https://pypi.org/project/json5/}, journal={PyPI}, author={Pranke, Dirk}, year={2021}}
@misc{tweepy, title={Tweepy documentation}, url={https://docs.tweepy.org/en/stable/}, journal={Tweepy Documentation - tweepy 4.3.0 documentation}, author={Roesslein, Joshua}, year={2021}}
@misc{numpy, title={NumPy v1.21 manual}, url={https://numpy.org/doc/stable/}, journal={Overview - NumPy v1.21 Manual}, author={Numpy}, year={2021}}
@misc{tabulate, title={Tabulate}, url={https://pypi.org/project/tabulate/}, journal={PyPI}, author={Astanin, Sergey and Crespí, Pau Tallada and Marsi, Erwin and Kocikowski, Mik and Ryder, Bill and Dwiel, Zach}, year={0AD}}
@misc{py7zr, title={Py7zr}, url={https://pypi.org/project/py7zr/}, journal={PyPI}, author={Miura, Hiroshi}, year={0AD}}
@misc{requests, title={HTTP for Humans™}, url={https://docs.python-requests.org/en/master/index.html}, journal={Requests}, author={Reitz, Kenneth}, year={0AD}}
@misc{beautifulsoup, title={Beautiful Soup documentation}, url={https://beautiful-soup-4.readthedocs.io/en/latest/}, journal={Beautiful Soup Documentation - Beautiful Soup 4.4.0 documentation}, author={Richardson, Leonard}, year={0AD}}
@misc{flask, title={Flask}, url={https://pypi.org/project/Flask/}, journal={PyPI}, author={Ronacher, Armin}, year={0AD}}
@misc{scipy, title={SciPy documentation}, url={https://scipy.github.io/devdocs/index.html}, journal={SciPy documentation - SciPy v1.9.0.dev0+1070.09b8d94 Manual}, author={The SciPy Community}, year={0AD}}
@misc{twitter_friends, title={Get friends/list | docs | twitter developer platform}, url={https://developer.twitter.com/en/docs/twitter-api/v1/accounts-and-users/follow-search-get-users/api-reference/get-friends-list}, journal={Twitter}, publisher={Twitter}, author={Twitter}, year={0AD}}
@misc{twitter_timeline, title={Get statuses/user\_timeline | docs | twitter developer platform}, url={https://developer.twitter.com/en/docs/twitter-api/v1/tweets/timelines/api-reference/get-statuses-user_timeline}, journal={Twitter}, publisher={Twitter}, author={Twitter}, year={0AD}}
@misc{twitter_news, title={100 most Influential News Media Twitter Account Archives}, url={https://memeburn.com/motorburn/tag/100-most-influential-news-media-twitter-account/}, journal={Motorburn}, author={Bremmen, Nur}, year={2010}, month={Sep}}
@misc{covid_cases, title={Nytimes/COVID-19-data: An ongoing repository of data on coronavirus cases and deaths in the U.S.}, url={https://github.com/nytimes/covid-19-data}, journal={GitHub}, author={Almukhtar, Sarah and Aufrichtig, Aliza and Barnard, Anne and Bloch, Matthew}, year={2021}}