252 Commits

Author SHA1 Message Date
Hykilpikonna 39313d373e [+] Test py 2021-12-13 00:02:17 -05:00
MstrPikachu 13489fe2b0 Add gridlines 2021-12-12 00:48:27 -05:00
MstrPikachu 46a7497211 Fix typo in report.py 2021-12-11 23:05:20 -05:00
MstrPikachu 81da79c24a Amend module docstring of twitter_process.py 2021-12-11 22:45:55 -05:00
MstrPikachu dc91b607b0 Add module docstring to twitter_process.py 2021-12-09 21:36:53 -05:00
MstrPikachu 04a22c89ab Add module docstring to twitter.py 2021-12-09 20:27:12 -05:00
MstrPikachu b4c5fc254d Added module docstring to utils.py
Updated filter_days_avg and updated comments in utils.py
Fixed typos in twitter_process.py
2021-12-09 19:56:00 -05:00
Hykilpikonna ab8a685c8c [+] Deploy script 2021-11-28 17:49:40 -05:00
Hykilpikonna ac36f9c969 [U] Update wording 2021-11-27 23:17:04 -05:00
Hykilpikonna c9338f08df [+] Finalize project 2021-11-27 23:13:56 -05:00
Hykilpikonna 770d4345c4 [+] Debug mode 2021-11-27 21:44:18 -05:00
Hykilpikonna ecfe76a231 [O] Catch errors in markdown formatting 2021-11-27 21:39:44 -05:00
Hykilpikonna 647c59f13f [O] Use json encoding 2021-11-27 21:11:16 -05:00
Hykilpikonna 3519fb015e [S] Smooth scrolling 2021-11-27 21:03:16 -05:00
Hykilpikonna 07f479e0d4 [U] Update report 2021-11-27 17:26:15 -05:00
Hykilpikonna e79c086f2e [S] Style H3 2021-11-27 17:17:22 -05:00
Hykilpikonna ee8fc67bf0 [+] Reformat report 2021-11-27 17:16:53 -05:00
Hykilpikonna 5e5385e6af [O] Optimize visualization 2021-11-27 17:16:16 -05:00
Hykilpikonna 4b77b3e462 [+] Add demographics 2021-11-27 12:50:05 -05:00
Hykilpikonna e498509d2e [U] Update report 2021-11-27 11:24:38 -05:00
Hykilpikonna 0f98b70509 [U] Update code 2021-11-26 22:20:20 -05:00
Hykilpikonna 177b4aefec [+] Push 2021-11-26 19:13:07 -05:00
Hykilpikonna 4d831eaba0 [+] Update code 2021-11-26 18:43:19 -05:00
Hykilpikonna 7a5fb3b71e [U] Update formula 2021-11-26 16:20:57 -05:00
Hykilpikonna 54fb07fb6b [+] Change graphs 2021-11-25 20:46:16 -05:00
Hykilpikonna ff50af1f1a [-] Can't color by y-value, give up 2021-11-25 19:53:07 -05:00
Hykilpikonna 3958497bff [+] Create graph_line_plot() 2021-11-25 19:04:07 -05:00
Hykilpikonna 2f16fe2162 [+] Docstring 2021-11-25 18:48:59 -05:00
Hykilpikonna d539705bad [+] Calculate date freq and pop 2021-11-25 18:45:11 -05:00
Hykilpikonna 9976185182 [M] Rename fields, restructure 2021-11-25 18:39:38 -05:00
Hykilpikonna 175b3b615f [+] Add scipy and dateutil to requirements 2021-11-25 18:39:02 -05:00
Hykilpikonna d52fc2f200 [+] Division ignoring zeros 2021-11-25 18:17:43 -05:00
Hykilpikonna 987935edd5 [-] Remove path tricks 2021-11-25 17:55:45 -05:00
Hykilpikonna ecadbfa8d7 [+] Complete mathjax fonts 2021-11-25 17:23:47 -05:00
Hykilpikonna a031e9b53e [O] B64 encode markdown 2021-11-25 16:55:28 -05:00
Hykilpikonna 236360b19c [-] Remove combine tweets, didn't use that 2021-11-25 16:42:26 -05:00
Hykilpikonna 97b6d3603a [+] Add class docstrings 2021-11-25 16:12:18 -05:00
Hykilpikonna 0d3808ee68 [+] Report: change 2021-11-25 15:52:53 -05:00
Hykilpikonna a90f53f052 [F] Prevent path traversal attack 2021-11-25 15:51:16 -05:00
Hykilpikonna d738e9310e [+] Write report 2021-11-25 15:33:01 -05:00
Hykilpikonna 3b8709c3d8 [+] Proper mathjax fonts 2021-11-25 14:16:52 -05:00
Hykilpikonna 218d6d175d [+] Add latex support 2021-11-25 14:04:51 -05:00
Hykilpikonna 1cc3ac21fe [O] Localize js files 2021-11-25 13:56:41 -05:00
Hykilpikonna 73c1947ed8 [+] Add @include-lines 2021-11-25 12:45:26 -05:00
Hykilpikonna 4ac3b94c04 [+] Add Q25 Q75, IQR calculations 2021-11-25 12:41:01 -05:00
Hykilpikonna 9ff41d92b0 [+] Implement @include-cut 2021-11-25 11:56:20 -05:00
Hykilpikonna 82afe91d11 [+] Add frequency stats 2021-11-25 11:34:49 -05:00
Hykilpikonna 98b4d92781 [+] Hold down E to full screen image 2021-11-25 11:27:44 -05:00
Hykilpikonna 0b9838b51b [+] Make image clickable 2021-11-25 11:12:33 -05:00
Hykilpikonna ce720cea88 [+] Import jQuery 2021-11-25 11:10:49 -05:00
Hykilpikonna fdc430cc8e [S] Make images undraggable 2021-11-25 11:10:31 -05:00
Hykilpikonna eeda9624d7 [S] Three images side by side 2021-11-25 11:06:53 -05:00
Hykilpikonna 9613d87f13 [+] Code format tables 2021-11-25 00:19:54 -05:00
Hykilpikonna 0940d1442e [+] Pop stats 2021-11-24 23:56:38 -05:00
Hykilpikonna fcabf46f43 [+] Graph axvline 2021-11-24 23:42:32 -05:00
Hykilpikonna 99e94f2caa [+] report_histogram: docstring 2021-11-24 23:32:15 -05:00
Hykilpikonna f765751e17 [+] Load font, set color 2021-11-24 23:26:19 -05:00
Hykilpikonna c505565113 Update .gitignore 2021-11-24 23:26:00 -05:00
Hykilpikonna 88abd52239 [F] Fix load sample 2021-11-24 22:30:37 -05:00
Hykilpikonna 4d14aadc44 [O] Combine ignored 2021-11-24 22:20:45 -05:00
Hykilpikonna baa55cba2a [O] Separate didn't post tables and histograms 2021-11-24 22:14:29 -05:00
Hykilpikonna eb6bc88523 [+] Reporter autosave 2021-11-24 22:12:10 -05:00
Hykilpikonna 9527d0cfb8 [M] Split table creating into a separate function 2021-11-24 22:02:24 -05:00
Hykilpikonna ce825eb227 [S] Style links 2021-11-24 21:53:04 -05:00
Hykilpikonna 011007217a [+] Add sample description 2021-11-24 21:51:21 -05:00
Hykilpikonna 6c2e59ff66 [O] Restructure file 2021-11-24 21:41:43 -05:00
Hykilpikonna fdaebf7f52 [U] Update usage 2021-11-24 21:41:18 -05:00
Hykilpikonna d73aa25bd6 [M] Move Reporter to Utils 2021-11-24 21:38:03 -05:00
Hykilpikonna cc90af631d [U] Rename, restructure 2021-11-24 21:34:34 -05:00
Hykilpikonna 3c2ad5462a [O] Change analysis structure 2021-11-24 21:28:57 -05:00
Hykilpikonna 44f3dcb9d2 [S] Style headers 2021-11-24 21:06:36 -05:00
Hykilpikonna 1f6ede258a [O] Style webpage 2021-11-24 20:56:47 -05:00
Hykilpikonna 54b6ace414 [O] Docstrings 2021-11-24 20:50:39 -05:00
Hykilpikonna d8869db409 [+] Create server 2021-11-24 20:49:01 -05:00
Hykilpikonna 0954920b59 [+] Inject markdown 2021-11-24 20:48:54 -05:00
Hykilpikonna 12faec2a15 [+] Install flask 2021-11-24 19:54:20 -05:00
Hykilpikonna 7857f50eb7 [-] Remove markdown dependency, use browser marked instead 2021-11-24 19:53:32 -05:00
Hykilpikonna 411b50e793 [+] Process @include 2021-11-24 19:43:59 -05:00
Hykilpikonna 82207f951b [O] Use github table format for tabulate 2021-11-24 19:36:08 -05:00
Hykilpikonna b845762754 [+] Parse markdown 2021-11-24 19:34:50 -05:00
Hykilpikonna bae16fce53 [+] Create generate_report 2021-11-24 19:34:12 -05:00
Hykilpikonna 192d6d4760 [O] Ignore latex .out file 2021-11-24 19:29:58 -05:00
Hykilpikonna 4abfc80027 [+] Add SRC_DIR 2021-11-24 19:28:36 -05:00
Hykilpikonna 4aaaa3c349 [O] Use file.parent instead of splitting / 2021-11-24 19:28:25 -05:00
Hykilpikonna b39611c226 [F] Save report first so that directory exist 2021-11-24 18:54:55 -05:00
Hykilpikonna adcb707481 [F] Fix no report 2021-11-24 18:54:26 -05:00
Hykilpikonna 44b7757e50 [+] Import markdown 2021-11-24 18:32:51 -05:00
Hykilpikonna d2e2f4adbf [U] Use reporter 2021-11-24 18:32:45 -05:00
Hykilpikonna ebeba08c19 [+] Create report file 2021-11-24 18:26:46 -05:00
Hykilpikonna 48eabdef76 [+] Graph covid posts by date 2021-11-24 17:36:03 -05:00
Hykilpikonna 65de4faa59 [O] Split functions 2021-11-24 17:30:26 -05:00
Hykilpikonna 153e9e4ed6 [+] Add precondition 2021-11-24 17:25:04 -05:00
Hykilpikonna 5f07979140 [O] Optimize parse_date even more 2021-11-24 17:24:16 -05:00
Hykilpikonna d7e3aee0e6 [+] Add runtime test result 2021-11-24 17:21:51 -05:00
Hykilpikonna e20cce6f0d [+] Create load_combined_tweets 2021-11-24 17:16:01 -05:00
Hykilpikonna 53339e351c [+] Implement faster parse_date 2021-11-24 17:14:53 -05:00
Hykilpikonna 1cb1f58e1a [O] Filter news channels 2021-11-24 17:00:44 -05:00
Hykilpikonna c41931d3eb [+] Combine tweets for sample 2021-11-24 16:43:58 -05:00
Hykilpikonna eabeb08de3 [+] Output statistics 2021-11-24 16:25:28 -05:00
Hykilpikonna 00aa0a8674 [-] Remove debug print 2021-11-24 16:03:31 -05:00
Hykilpikonna acd053c303 [O] Reduce bin size 2021-11-24 16:02:49 -05:00
Hykilpikonna 94005cc9d9 [+] Remove outliers 2021-11-24 16:01:46 -05:00
Hykilpikonna a3f5fc4fc0 [F] Return list 2021-11-24 15:58:38 -05:00
Hykilpikonna 91e028eba0 [+] Create function to remove outliers 2021-11-24 15:57:28 -05:00
Hykilpikonna ea27d0fec2 [+] Visualize covid tweets popularity ratio 2021-11-24 15:44:08 -05:00
Hykilpikonna e5cee26d83 [+] Download twitter news channel tweets 2021-11-24 15:23:13 -05:00
Hykilpikonna 275efda0fe [+] Add news channels from memeburn 2021-11-24 15:22:54 -05:00
Hykilpikonna f9370aedb5 [O] Ensure all files are lowercased 2021-11-24 11:55:19 -05:00
Hykilpikonna 2548c26ecc [+] Import requests 2021-11-24 11:55:07 -05:00
Hykilpikonna 4be1f9d9aa [+] Import beautifulsoup4 2021-11-24 11:40:38 -05:00
Hykilpikonna ca05ca7abf [-] Remove pytz 2021-11-24 11:34:06 -05:00
Hykilpikonna 81c43dc06e [F] Fix load user sample 2021-11-24 11:22:58 -05:00
Hykilpikonna 4e84de3fca [+] Add news channels to sample 2021-11-24 11:19:59 -05:00
Hykilpikonna dfe4410e2b [+] Create function to find news channels 2021-11-24 11:17:05 -05:00
Hykilpikonna eda17f2ad0 [M] Move constants to constants.py 2021-11-24 10:58:15 -05:00
Hykilpikonna 815eab8cf1 [F] Format constants 2021-11-24 10:31:28 -05:00
Hykilpikonna df02c0ba51 [+] Add keywords 2021-11-24 10:27:22 -05:00
Hykilpikonna bba3858e8c [+] Add constants 2021-11-24 10:25:08 -05:00
Hykilpikonna 41b7df7090 [O] Make directories constant 2021-11-24 10:24:51 -05:00
Hykilpikonna 20e9805c6c [+] Add docstring 2021-11-24 10:15:10 -05:00
Hykilpikonna 698967a29b [-] Remove twitter-individual.py 2021-11-24 10:12:54 -05:00
Hykilpikonna a807055d3d [U] Update report 2021-11-23 22:24:21 -05:00
Hykilpikonna 28c9fcf2be [+] Add more keywords 2021-11-23 22:07:11 -05:00
Hykilpikonna 988e0cbb4b [+] Create function to pack data 2021-11-23 21:38:47 -05:00
Hykilpikonna ecd28f4c46 [+] Add docstring 2021-11-23 21:19:30 -05:00
Hykilpikonna 464ab9502e [+] Create visualization script 2021-11-23 21:14:17 -05:00
Hykilpikonna c91ac3433d [+] Create function to load tweet of a user 2021-11-23 20:34:48 -05:00
Hykilpikonna 65df10e3c2 [+] Add matplotlib 2021-11-23 20:30:19 -05:00
Hykilpikonna 4e265bf30d [O] Clean code 2021-11-23 19:51:58 -05:00
Hykilpikonna b81246e9f3 Revert "[+] Script to remove tweets not in sample"
This reverts commit f64dd2d95f.
2021-11-23 19:51:44 -05:00
Hykilpikonna f64dd2d95f [+] Script to remove tweets not in sample 2021-11-23 19:51:36 -05:00
Hykilpikonna 1d5b38d45f [O] Check file exists when generating sample 2021-11-23 19:33:26 -05:00
Hykilpikonna e6dd8a17a5 [+] Project report: Describe dataset 2021-11-23 14:40:57 -05:00
Hykilpikonna e51d681479 [+] Add Chinese and Japanese keywords 2021-11-23 14:40:00 -05:00
Hykilpikonna 1cd7a5ffd0 [F] Fix null in filtering by language 2021-11-23 12:11:58 -05:00
Hykilpikonna b6d0cda387 [+] Filter by language, reselect sample 2021-11-23 12:10:28 -05:00
Hykilpikonna 3333c5377b [O] Rename fields 2021-11-23 12:05:43 -05:00
Hykilpikonna b17df5dfa2 [+] Add language field to processed users 2021-11-23 12:02:29 -05:00
Hykilpikonna b2137e3bf1 [+] Give detailed explanation for named tuple 2021-11-23 11:41:34 -05:00
Hykilpikonna b5b0088c70 [U] Update main 2021-11-23 11:31:29 -05:00
Hykilpikonna 461f83918c [+] Load user sample 2021-11-23 11:25:21 -05:00
Hykilpikonna 04a2c0aea9 [+] Create function that creates a sample 2021-11-23 11:16:00 -05:00
Hykilpikonna 47255e9c46 [+] Add num postings data to processed user 2021-11-23 11:06:35 -05:00
Hykilpikonna 1efe0eface [-] Remove twitter_random_individuals file 2021-11-22 16:42:28 -05:00
Hykilpikonna 5adb2b17b5 [+] Create get user popularity ranking 2021-11-22 16:41:54 -05:00
Hykilpikonna 818c466a1e [F] Fix no tweets case 2021-11-22 16:24:58 -05:00
Hykilpikonna d7ebb9580b [O] Only get 500 top accounts 2021-11-22 16:24:45 -05:00
Hykilpikonna 0aa4db3718 [F] Fix covid detection 2021-11-22 15:34:59 -05:00
Hykilpikonna d9a78baa3d [O] Default to indent none to save space 2021-11-22 15:22:09 -05:00
Hykilpikonna 1c6629e504 [+] Add docstring 2021-11-22 14:35:23 -05:00
Hykilpikonna 3643800f8e [O] Process only if not processed 2021-11-22 14:31:14 -05:00
Hykilpikonna 6995481d0e [+] Process tweets in main 2021-11-22 14:28:47 -05:00
Hykilpikonna d16032aa71 [+] Process tweets 2021-11-22 14:28:36 -05:00
Hykilpikonna 0dc0688273 [F] Fix unauthorized 2021-11-22 14:24:22 -05:00
Hykilpikonna acfb397c9c [+] Create function to check if a post is covid-related 2021-11-22 14:11:02 -05:00
Hykilpikonna 9e27a3c725 [O] Use write function in data processing 2021-11-22 14:00:28 -05:00
Hykilpikonna a73a792189 [O] Optimize imports 2021-11-22 13:59:55 -05:00
Hykilpikonna 4c83dd07a2 [O] Use read and write functions 2021-11-22 13:55:18 -05:00
Hykilpikonna 343e432df9 [+] Create helper functions for reading and writing files 2021-11-22 13:55:03 -05:00
Hykilpikonna 0186aa185f [+] Step C2 of data collection 2021-11-22 12:36:20 -05:00
Hykilpikonna eacbe6b488 [+] Check exists 2021-11-22 12:35:11 -05:00
Hykilpikonna a6ac2f15ed [O] Optimize debug messages 2021-11-22 12:30:39 -05:00
Hykilpikonna 05c10bf19a [-] Remove custom tweet model, optimize imports 2021-11-22 12:27:28 -05:00
Hykilpikonna b5c02ef702 [+] Implement download all tweets 2021-11-22 12:20:14 -05:00
Hykilpikonna 4099b2d4fd [O] Make rate limit static 2021-11-22 11:51:28 -05:00
Hykilpikonna a95374302a [+] Create main 2021-11-22 11:33:08 -05:00
Hykilpikonna 12327d6aab [O] Use named tuple instead of data class 2021-11-22 11:32:58 -05:00
Hykilpikonna 14d3d84ed8 [F] Fix more pyta warnings 2021-11-22 10:57:25 -05:00
Hykilpikonna 8049a3604d [F] Fix name shadowing 2021-11-22 10:49:37 -05:00
Hykilpikonna 29270cff70 [F] Fix pyta errors 2021-11-22 10:48:00 -05:00
Hykilpikonna b3271a5c72 [+] Add python-ta in requirements 2021-11-22 10:43:23 -05:00
Hykilpikonna 44831c6b30 [+] Add caller info to debug 2021-11-22 10:36:26 -05:00
Hykilpikonna 46b8f80b70 [F] Fix load_users_popularity 2021-11-22 10:31:47 -05:00
Hykilpikonna 38c7a71860 [F] Fix imports 2021-11-22 10:24:43 -05:00
Hykilpikonna 68d61757f6 [O] Use generalusers class 2021-11-22 10:20:37 -05:00
Hykilpikonna 01235a003c [+] Create function to load users by popularity 2021-11-22 10:19:25 -05:00
Hykilpikonna d60d74def5 [O] Separate normalize_directory function 2021-11-22 10:14:56 -05:00
Hykilpikonna 77e0176dbc [M] Move data classes from utils to twitter_process 2021-11-22 10:09:03 -05:00
Hykilpikonna 991fef7e13 [+] Add twitter user data description 2021-11-22 09:56:29 -05:00
Hykilpikonna 7c421730df [+] Create project report 2021-11-22 09:48:35 -05:00
Hykilpikonna 5220b9e2a1 [M] Move proposal to writing folder 2021-11-22 09:30:15 -05:00
Hykilpikonna e4b8bc3b20 [F] Fix imports 2021-11-22 09:27:27 -05:00
Hykilpikonna a7acdf9062 [O] Reformat docstring 2021-11-22 09:27:06 -05:00
Hykilpikonna 028003e838 [+] Add comments in requirements 2021-11-22 09:14:10 -05:00
Hykilpikonna 7e19250fe9 [O] Handle too many requests exception 2021-11-22 00:36:33 -05:00
Hykilpikonna b55b4f3e4c Revert "[-] Remove testing script"
This reverts commit 963256ce16.
2021-11-22 00:24:56 -05:00
Hykilpikonna 459e1c6912 [+] Display requests per minute 2021-11-22 00:24:12 -05:00
Hykilpikonna 794008b182 [O] Make indent customizable 2021-11-22 00:12:22 -05:00
Hykilpikonna 248ce6c7c8 [O] Use dumps for meta 2021-11-22 00:07:57 -05:00
Hykilpikonna f0286e0b13 [O] Print metainfo 2021-11-22 00:05:25 -05:00
Hykilpikonna 428ead35ef [O] Implement resume 2021-11-21 23:53:16 -05:00
Hykilpikonna 51366074bf [O] Separate resume function 2021-11-21 23:44:22 -05:00
Hykilpikonna b2edfc2003 [O] Move rate delay to the end 2021-11-21 23:34:22 -05:00
Hykilpikonna cbff127611 [-] Remove protected users 2021-11-21 23:34:10 -05:00
Hykilpikonna 053e7ad503 [F] Oops, the rate limit is 15 per 15 minutes, not 15 per minute 2021-11-21 23:32:19 -05:00
Hykilpikonna b4ca5f65af [F] Fix rate limit 2021-11-21 23:22:34 -05:00
Hykilpikonna ad3868406b [F] Fix "TypeError: unhashable type: 'set'" 2021-11-21 23:20:40 -05:00
Hykilpikonna 5e2f0b7234 [O] Use json.stringify instead of json.dumps 2021-11-21 23:18:42 -05:00
Hykilpikonna 236b1a352c [+] Mark src directory 2021-11-21 23:17:45 -05:00
Hykilpikonna e95253d5ca [+] Ignore data files 2021-11-21 23:17:35 -05:00
Hykilpikonna 23cbab046c [+] Support stringify sets 2021-11-21 23:15:54 -05:00
Hykilpikonna f81643aca5 [F] mkdirs 2021-11-21 23:13:29 -05:00
Hykilpikonna e59ba314d0 [F] Fix rate limit 2021-11-21 23:10:49 -05:00
Hykilpikonna 963256ce16 [-] Remove testing script 2021-11-21 23:09:34 -05:00
Hykilpikonna 032c7caffd [U] Update requirements 2021-11-21 23:09:10 -05:00
Hykilpikonna 30aee55a7d [O] Make config path configurable 2021-11-21 23:07:13 -05:00
Hykilpikonna b3e7e3ee38 [F] Fix imports 2021-11-21 23:05:38 -05:00
Hykilpikonna 2be16d2dfb [+] Forgot to add api as a param 2021-11-21 23:05:06 -05:00
Hykilpikonna f1a4c95fae [-] Remove get_user_following_data 2021-11-21 23:03:49 -05:00
Hykilpikonna 44ef88c420 [+] Implement rate limit 2021-11-21 23:03:10 -05:00
Hykilpikonna 181a89688b [+] Write meta info 2021-11-21 22:52:33 -05:00
Hykilpikonna 30a78e6b0e [+] Implement friends-chain 2021-11-21 22:46:25 -05:00
Hykilpikonna f3760f6c6b [+] Create download_users function signature 2021-11-21 22:13:16 -05:00
Hykilpikonna 907b464d3f [M] Move code to src 2021-11-21 22:03:12 -05:00
Hykilpikonna c5b0aba8cf [M] Change the folder name collect to raw_collect 2021-11-21 22:02:37 -05:00
Hykilpikonna ba7fb3919b [O] Allow non-ascii in json 2021-11-21 21:34:43 -05:00
Hykilpikonna 523dc813a5 [U] Update bib 2021-11-05 00:57:29 -04:00
Hykilpikonna 2a88ff8bf7 [+] Add some ambitious goals 2021-11-05 00:42:22 -04:00
Hykilpikonna 91ca53147c [+] Add voxdotcom sample data 2021-11-05 00:42:12 -04:00
Hykilpikonna 68985c67fb [F] Correctly fix repost detection 2021-11-05 00:23:08 -04:00
Hykilpikonna 4ff72a47d7 [O] Optimize network usage by trimming username 2021-11-05 00:19:52 -04:00
Hykilpikonna 7868b1ac07 [F] Fix retweet detection 2021-11-05 00:19:24 -04:00
Hykilpikonna 6383605491 [+] Convert tweets to generic format 2021-11-05 00:15:46 -04:00
Hykilpikonna 0751324fd6 [+] Create json encoder that supports dataclasses and datetime 2021-11-05 00:15:31 -04:00
Hykilpikonna 39559bbf3a [-] Ignore config file 2021-11-04 23:53:59 -04:00
Hykilpikonna 407a20abd7 [+] Add more ignores 2021-11-04 23:48:29 -04:00
MstrPikachu 7c81ac1365 Create references.bib 2021-11-04 21:55:31 -04:00
MstrPikachu d766732d9b Add references to project_proposal.tex 2021-11-04 21:54:41 -04:00
Hykilpikonna 059ddcbb1b [+] Italic and bold 2021-11-03 19:14:40 -04:00
Hykilpikonna f059764c8a [S] Indent after section 2021-11-03 19:10:50 -04:00
Hykilpikonna b7e53748f7 [+] Ignore latex intermediate files 2021-11-03 19:10:33 -04:00
Hykilpikonna 9cc7fd8968 [+] Draft 2021-11-03 19:09:11 -04:00
Hykilpikonna 0cc1583908 [+] Add project proposal 2021-11-03 19:00:45 -04:00
Hykilpikonna 3e74ee84b1 [+] Test download tweets 2021-11-03 18:59:28 -04:00
Hykilpikonna 2086c5199d [+] Download tweets function 2021-11-03 18:59:15 -04:00
Hykilpikonna dcb912fae1 [F] Fix max_id 2021-11-03 16:51:39 -04:00
Hykilpikonna 880805c1ff [+] Add date param to posting class 2021-11-03 16:47:08 -04:00
Hykilpikonna 78a2b91b9d [O] Optimize imports 2021-11-03 16:46:29 -04:00
Hykilpikonna 5a42bded0f [+] Create debug function 2021-11-03 16:45:54 -04:00
Hykilpikonna 879aa3e5ef [+] Create platform-independent user and posting classes 2021-11-03 16:45:46 -04:00
Hykilpikonna 37bbe4f5b0 [+] Get all tweets from a specific user in a loop 2021-11-03 16:45:30 -04:00
Hykilpikonna ae885317a1 [+] Create function to get user tweets 2021-11-03 16:33:54 -04:00
Hykilpikonna 05e511c743 [+] Create user and tweet dataclasses 2021-11-03 16:22:20 -04:00
Hykilpikonna af203e4c2f [+] Create separate twitter file 2021-11-03 16:22:07 -04:00
Hykilpikonna 1af5182f91 [+] Get tweets from individual 2021-11-03 16:00:05 -04:00
Hykilpikonna 00be8d7d40 [U] Update usage in tweepy login 2021-11-03 15:59:44 -04:00
Hykilpikonna 9ff31adbcd [+] Parse config data class 2021-11-03 15:59:31 -04:00
Hykilpikonna 53fe18049b [+] Create config data class 2021-11-03 15:59:19 -04:00
Hykilpikonna db6c5328ca [+] Create function for tweepy login 2021-11-03 15:42:18 -04:00
Hykilpikonna 102cad055e [M] Move create_config to a separate file 2021-11-03 15:36:42 -04:00
Hykilpikonna c248e500a2 [+] Create load config function 2021-11-03 15:34:32 -04:00
Hykilpikonna ad09a7fd04 Initial commit 2021-11-03 15:30:38 -04:00
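Commits `0751324fd6` ("[+] Create json encoder that supports dataclasses and datetime") and `23cbab046c` ("[+] Support stringify sets") describe a custom JSON encoder, though the repository's actual implementation is not shown on this page. A minimal stdlib-only sketch of that technique might look like the following; the names `EnhancedJSONEncoder` and `Tweet` are illustrative, not taken from the project:

```python
import dataclasses
import json
from datetime import datetime


class EnhancedJSONEncoder(json.JSONEncoder):
    """Hypothetical encoder extending json.JSONEncoder to also handle
    dataclasses, datetime, and set objects, as the commits describe."""

    def default(self, o):
        if dataclasses.is_dataclass(o):
            return dataclasses.asdict(o)  # serialize a dataclass as a plain dict
        if isinstance(o, datetime):
            return o.isoformat()          # ISO-8601 timestamp string
        if isinstance(o, (set, frozenset)):
            return sorted(o)              # "stringify sets" as sorted lists
        return super().default(o)


@dataclasses.dataclass
class Tweet:  # illustrative stand-in for the project's tweet model
    user: str
    date: datetime
    tags: set


t = Tweet("Hykilpikonna", datetime(2021, 11, 5), {"covid"})
print(json.dumps(t, cls=EnhancedJSONEncoder))
# {"user": "Hykilpikonna", "date": "2021-11-05T00:00:00", "tags": ["covid"]}
```

Passing `cls=EnhancedJSONEncoder` to `json.dumps` routes any type the default encoder cannot handle through `default()`, which is why one encoder covers all three cases.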
94 changed files with 2631 additions and 232 deletions
+2
@@ -0,0 +1,2 @@
# Auto detect text files and perform LF normalization
* text=auto
+129
@@ -0,0 +1,129 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# Latex intermediate files
*.aux
*.fdb_latexmk
*.fls
*.gz
*.bbl
*.blg
*.bcf
*.xml
*.out
config.json5
data/
/report/
+8
@@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/
+44
@@ -0,0 +1,44 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="JavaDoc" enabled="true" level="WARNING" enabled_by_default="true">
<option name="TOP_LEVEL_CLASS_OPTIONS">
<value>
<option name="ACCESS_JAVADOC_REQUIRED_FOR" value="none" />
<option name="REQUIRED_TAGS" value="" />
</value>
</option>
<option name="INNER_CLASS_OPTIONS">
<value>
<option name="ACCESS_JAVADOC_REQUIRED_FOR" value="none" />
<option name="REQUIRED_TAGS" value="" />
</value>
</option>
<option name="METHOD_OPTIONS">
<value>
<option name="ACCESS_JAVADOC_REQUIRED_FOR" value="none" />
<option name="REQUIRED_TAGS" value="@return@param@throws or @exception" />
</value>
</option>
<option name="FIELD_OPTIONS">
<value>
<option name="ACCESS_JAVADOC_REQUIRED_FOR" value="none" />
<option name="REQUIRED_TAGS" value="" />
</value>
</option>
<option name="IGNORE_DEPRECATED" value="false" />
<option name="IGNORE_JAVADOC_PERIOD" value="true" />
<option name="IGNORE_DUPLICATED_THROWS" value="false" />
<option name="IGNORE_POINT_TO_ITSELF" value="false" />
<option name="myAdditionalJavadocTags" value="date" />
</inspection_tool>
<inspection_tool class="JpaDataSourceORMInspection" enabled="false" level="ERROR" enabled_by_default="false" />
<inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredIdentifiers">
<list>
<option value="bins" />
</list>
</option>
</inspection_tool>
</profile>
</component>
+6
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9" project-jdk-type="Python SDK">
<output url="file://$PROJECT_DIR$/out" />
</component>
</project>
+8
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/CSC110-Project.iml" filepath="$PROJECT_DIR$/CSC110-Project.iml" />
</modules>
</component>
</project>
+10
@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="RunConfigurationProducerService">
<option name="ignoredProducers">
<set>
<option value="com.android.tools.idea.compose.preview.runconfiguration.ComposePreviewRunConfigurationProducer" />
</set>
</option>
</component>
</project>
+6
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>
+12
@@ -0,0 +1,12 @@
cff-version: 1.2.0
message: "If you use this software, please cite it as below."
authors:
- family-names: Gui
given-names: Azalea
orcid: https://orcid.org/0000-0002-6141-5926
- family-names: Lin
given-names: Peter
title: "COVID-19 Twitter Posting Frequency and Popularity Insights"
version: 1.0.0
doi: TODO
date-released: TODO
-1
@@ -1 +0,0 @@
csc110.hydev.org
+14
@@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="true">
<exclude-output />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" />
<excludeFolder url="file://$MODULE_DIR$/data" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" name="marked" level="application" />
<orderEntry type="library" name="jquery" level="application" />
</component>
</module>
+21
@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2021 Hykilpikonna
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
+19
@@ -0,0 +1,19 @@
#!/usr/bin/env sh
# abort on errors
set -e
# navigate into the build output directory
cd dist
# if you are deploying to a custom domain
echo 'csc110.hydev.org' > CNAME
git init
git add -A
git commit -m 'deploy'
# if you are deploying to https://<USERNAME>.github.io/<REPO>
git push -f git@github.com:Hykilpikonna/CSC110-Project.git master:gh-pages
cd -
-22
@@ -1,22 +0,0 @@
| Username | Frequency |
|-----------------|-------------|
| Beyonce | 50.0% |
| UNICEF | 38.9% |
| WHO | 31.3% |
| EmmaWatson | 26.3% |
| BillGates | 19.6% |
| UN | 19.5% |
| SenWarren | 18.4% |
| XHNews | 14.7% |
| cnnbrk | 14.3% |
| PDChina | 13.7% |
| BBCBreaking | 13.1% |
| BreakingNews | 11.4% |
| AP | 11.3% |
| Caradelevingne | 11.1% |
| FLOTUS45 | 10.7% |
| sardesairajdeep | 10.7% |
| hazardeden10 | 10.5% |
| VP | 10.4% |
| POTUS | 10.0% |
| WSJ | 10.0% |
-22
@@ -1,22 +0,0 @@
| Username | Frequency |
|-----------------|-------------|
| JHUCAIH | 54.8% |
| DrJudyMonroe | 49.6% |
| PoltergeistTC | 41.6% |
| _FatmaAhmed | 31.6% |
| OUCitizenGovern | 28.6% |
| btolchin | 27.0% |
| AusHCPNG | 26.4% |
| UNECEHLM | 23.6% |
| PIBFactCheck | 20.6% |
| gospeakyourmind | 20.1% |
| RepHarley | 20.0% |
| susancolehaley | 19.2% |
| SNHDflu | 18.4% |
| UrbanScholar1 | 18.2% |
| william_mcinnes | 18.2% |
| chrisfradkin | 18.1% |
| USEmbassyBW | 18.1% |
| carlosex | 17.7% |
| tobiaskurth | 17.4% |
| georgesoros | 17.2% |
-5
@@ -1,5 +0,0 @@
| | `500-pop` | `500-rand` | `eng-news` |
|-------------------------------|-------------|--------------|--------------|
| Total users | 500 | 500 | 310 |
| Users who didn't post at all | 117 | 205 | 26 |
| Users who posted less than 1% | 288 | 313 | 57 |
-22
@@ -1,22 +0,0 @@
| Username | Frequency |
|----------------|-------------|
| big_picture | 77.8% |
| NBCNewsHealth | 71.9% |
| Circa | 50.0% |
| caitlinnowens | 41.4% |
| msnbc_breaking | 40.0% |
| UnivisionNews | 34.6% |
| firstdraftnews | 27.7% |
| NBCNewsNow | 27.0% |
| itvpeston | 26.9% |
| FaceTheNation | 26.1% |
| telegraphnews | 25.8% |
| LesterHoltNBC | 23.4% |
| nytimesphoto | 22.2% |
| boomlive_in | 22.2% |
| mckquarterly | 22.1% |
| straits_times | 19.4% |
| cnbcevents | 18.5% |
| TwitterMoments | 17.6% |
| sciam | 17.5% |
| LiceMovono | 17.3% |
-8
@@ -1,8 +0,0 @@
| | `500-pop` | `500-rand` | `eng-news` |
|----------|-------------|--------------|--------------|
| Mean | 3.0% | 4.4% | 8.6% |
| StdDev | 4.9% | 7.2% | 8.9% |
| Median | 1.3% | 1.5% | 7.6% |
| IQR | 3.5% | 4.3% | 7.8% |
| Q1 (25%) | 0.4% | 0.5% | 3.1% |
| Q3 (75%) | 3.9% | 4.8% | 10.9% |
-63
File diff suppressed because one or more lines are too long
-22
@@ -1,22 +0,0 @@
| Username | Popularity Ratio |
|-----------------|--------------------|
| juniorbachchan | 3370.4% |
| Google | 983.3% |
| JeremyClarkson | 847.2% |
| Ibra_official | 814.8% |
| SteveMartinToGo | 510.5% |
| khloekardashian | 453.6% |
| thetanmay | 425.9% |
| Sethrogen | 423.3% |
| Jacksepticeye | 392.9% |
| JeffreeStar | 386.7% |
| lukebryan | 359.6% |
| BCCI | 352.9% |
| 50cent | 346.4% |
| RockstarGames | 339.9% |
| mipaltan | 331.7% |
| ashleytisdale | 331.4% |
| jk_rowling | 328.6% |
| Ninja | 320.6% |
| AmazingPhil | 313.3% |
| jamieoliver | 311.1% |
-22
@@ -1,22 +0,0 @@
| Username | Popularity Ratio |
|-----------------|--------------------|
| CarlosTF50 | 1868.2% |
| QueenHote1 | 1390.3% |
| L_Cook865 | 1220.1% |
| 1chiarajolie | 1136.9% |
| jolun | 1025.4% |
| Numb3z | 970.7% |
| GEFURST | 828.3% |
| AtkinsQC | 814.6% |
| JZerucelli | 753.3% |
| theAMshakeout | 673.6% |
| shauna_louise0 | 614.5% |
| camillembaker | 587.5% |
| HappyWarriorP | 558.2% |
| LouisaJamesITV | 556.8% |
| wyomingwormboy | 544.0% |
| dafydd_llewelyn | 506.5% |
| _angemccormack | 446.9% |
| DenchiSoft | 441.4% |
| jennakubsnc | 424.8% |
| damashreal | 420.3% |
@@ -1,22 +0,0 @@
| Username | Popularity Ratio |
|----------------|--------------------|
| TwitterData | 702.6% |
| empiremagazine | 371.4% |
| CNBCPolitics | 370.1% |
| TB_Times | 342.8% |
| instyle | 342.7% |
| weatherchannel | 314.1% |
| NAHJ | 295.8% |
| navikakumar | 278.9% |
| Telegraph | 277.4% |
| TwitterDC | 269.1% |
| thedailybeast | 256.7% |
| karaswisher | 242.7% |
| dallasnews | 240.2% |
| jbouie | 233.1% |
| jonfortt | 229.9% |
| DavidBegnaud | 216.3% |
| NickKristof | 213.9% |
| LiceMovono | 204.3% |
| washingtonpost | 198.1% |
| VICENews | 196.2% |
@@ -1,3 +0,0 @@
| | `500-pop` | `500-rand` | `eng-news` |
|---------|-------------|--------------|--------------|
| Ignored | 117 | 205 | 28 |
@@ -1,8 +0,0 @@
| | `500-pop` | `500-rand` | `eng-news` |
|----------|-------------|--------------|--------------|
| Mean | 1.08 | 1.48 | 1.02 |
| StdDev | 1.99 | 2.15 | 0.72 |
| Median | 0.73 | 0.93 | 0.88 |
| IQR | 0.74 | 1.13 | 0.6 |
| Q1 (25%) | 0.41 | 0.42 | 0.62 |
| Q3 (75%) | 1.15 | 1.55 | 1.22 |
@@ -1,8 +0,0 @@
| | `500-pop` | `500-rand` | `eng-news` |
|----------|-------------|--------------|--------------|
| Mean | 0.78 | 0.98 | 0.91 |
| StdDev | 0.52 | 0.8 | 0.46 |
| Median | 0.69 | 0.87 | 0.87 |
| IQR | 0.65 | 0.96 | 0.57 |
| Q1 (25%) | 0.38 | 0.34 | 0.61 |
| Q3 (75%) | 1.03 | 1.3 | 1.18 |
@@ -0,0 +1,36 @@
####################
# Data Collection
# JSON5 is a human-readable JSON format that allows things such as unquoted keys or comments.
json5~=0.9.6
# Tweepy is a Python SDK for the Twitter API
tweepy==4.4.0
# requests is used for getting HTML from a website URL
requests==2.26.0
# beautifulsoup is used to extract data from HTML
beautifulsoup4==4.10.0
#####################
# Data Visualization
# Print table data
tabulate==0.8.9
# Draw local graphs
matplotlib==3.5.0
# Calculate data statistics
numpy==1.21.4
# Date utility for manipulating dates
python-dateutil~=2.8.2
# Scipy for transforming data. We used it for IIR filtering.
scipy~=1.7.3
# For serving the report website
flask==2.0.2
####################
# Data Packing
# 7zip packing utility for packing our processed data
py7zr==0.16.3
#####################
# Testing and code checking
pytest
python-ta
@@ -1,4 +0,0 @@
| | Total | English | Chinese | Japanese |
|------------|---------|-----------|-----------|------------|
| `500-pop` | 500 | 495 | 0 | 5 |
| `500-rand` | 500 | 393 | 15 | 92 |
@@ -0,0 +1,12 @@
# Constants (The instructors said that we can use global constants here:
# https://piazza.com/class/ksovzjrlsye72f?cid=1664
# They should not end with "/"
DATA_DIR = '../data'
TWEETS_DIR = f'{DATA_DIR}/twitter/user-tweets'
USER_DIR = f'{DATA_DIR}/twitter/user'
REPORT_DIR = './report'
# Debug mode, or developer mode. This affects two things:
# 1. Whether debug messages are printed
# 2. Whether the web server regenerates the HTML page on every request
DEBUG = True
@@ -0,0 +1,85 @@
from tabulate import tabulate
from process.twitter_process import *
from process.twitter_visualization import *
from raw_collect.twitter import *
from report.report import serve_report
from utils import *
if __name__ == '__main__':
# Load config and create API
conf = load_config('config.json5')
api = tweepy_login(conf)
#####################
# Data collection - Step C1.1
# Download a wide range of users from Twitter using follow-chaining starting from a single user.
# (This task will never stop before it downloads every single user from twitter, so we need to
# manually stop it when there are enough users)
# download_users_start(api, 'voxdotcom')
# This task will run for a very long time to obtain a large dataset of twitter users. If
# you stopped the process partway, you can resume it later using the following line:
# download_users_resume_progress(api)
####################
# Data collection - Step C1.2
# Download all tweets from TwitterNews
# download_all_tweets(api, 'TwitterNews')
#####################
# Data processing - Step P1
# (After step C1) Process the downloaded twitter users, extract screen name, popularity, and
# number of tweets data.
# process_users()
#####################
# Data processing - Step P2
# (After step P1) Select the 500 most popular users and 500 random users who meet particular
# criteria as our sample, and also find the news channels
# select_user_sample()
# Just curious, who are the 20 most popular individuals on twitter?
# print(tabulate(((u.username, u.popularity) for u in load_user_sample().most_popular[:20]),
# headers=['Name', 'Followers']))
#####################
# Data collection - Step C2.1
# (After step P2) Load the downloaded twitter users by popularity, and start downloading all
# tweets from 500 of the most popular users. Takes around 2 hours.
# for u in load_user_sample().most_popular:
# download_all_tweets(api, u.username)
#####################
# Data collection - Step C2.2
# (After step P2) Download all tweets from the 500 randomly selected users, takes around 2 hours
# for u in load_user_sample().random:
# download_all_tweets(api, u.username)
#####################
# Data collection - Step C2.3
# (After step P2) Download all tweets from the news channels we selected.
# for u in load_user_sample().english_news:
# download_all_tweets(api, u)
# Filter out news channels that have been blocked by twitter or don't exist anymore
# filter_news_channels()
#####################
# Data processing - Step P3
# (After step C2) Process the downloaded tweets, determine whether they are covid-related
# process_tweets()
####################
# Data Visualization - Step V1
# Generate all visualization reports and graphs
report_all()
####################
# Serve webpage
serve_report()
####################
# Finalize the program for submission.
# Pack processed and unprocessed data:
# pack_data()
@@ -0,0 +1,321 @@
"""
Processes data downloaded from the Twitter API. Processing consists of calculating popularity of
users, creating samples of users, filtering news channels, and processing tweets for file storage.
"""
import random
from typing import NamedTuple
from dataclasses import dataclass
import dateutil.parser
import requests
from bs4 import BeautifulSoup
from py7zr import SevenZipFile
from constants import DATA_DIR, TWEETS_DIR, USER_DIR
from utils import *
class ProcessedUser(NamedTuple):
"""
User and popularity.
We use NamedTuple instead of dataclass because named tuples are easier to serialize in JSON and
they require much less space in the stored json format because no key info is stored. For
example, using dataclass, the json for one ProcessedUser object would be:
{"username": "a", "popularity": 1, "num_postings": 1}, while using NamedTuple, the json will be:
["a", 1, 1], which saves 42 bytes for each user.
"""
# Username
username: str
# A measurement of a user's popularity, such as followers count
popularity: int
# Number of tweets
num_postings: int
# Language
lang: str
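The space-saving claim in the docstring above can be checked directly by serializing a named tuple as a bare JSON array versus a keyed object. This is a minimal standalone sketch; the project's real write path goes through `json_stringify` in utils.py:

```python
import json
from typing import NamedTuple

class ProcessedUser(NamedTuple):
    username: str
    popularity: int
    num_postings: int
    lang: str

u = ProcessedUser('a', 1, 1, 'en')
as_array = json.dumps(list(u))       # how a NamedTuple ends up stored
as_object = json.dumps(u._asdict())  # what a keyed (dataclass-style) dump would produce
# The keyed form repeats every field name for every single user
print(len(as_object) - len(as_array))
```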
def process_users() -> None:
"""
After downloading a wide range of users using download_users_start in raw_collect/twitter.py,
this function will read the user files, extract only relevant information defined in the
ProcessedUser class, and rank the users by popularity.
This function will save the processed user data to <user_dir>/processed/users.json
:return: None
"""
users = []
# Loop through all the files
for filename in os.listdir(f'{USER_DIR}/users'):
# Only check json files and ignore macos dot files
if filename.endswith('.json') and not filename.startswith('.'):
# Read
user = json.loads(read(f'{USER_DIR}/users/{filename}'))
# Get user language (the problem is that most users' lang field is null, so we have to
# look at the language of their latest status as well, and they might not even have a
# status field!)
lang = user['lang']
status_lang = user['status']['lang'] if 'status' in user else None
if lang is None:
lang = status_lang
users.append(ProcessedUser(user['screen_name'], user['followers_count'],
user['statuses_count'], lang))
# Log progress
if len(users) % 2000 == 0:
debug(f'Loaded {len(users)} users.')
# Sort by followers count, descending
users.sort(key=lambda x: x.popularity, reverse=True)
# Save data
write(f'{USER_DIR}/processed/users.json', json_stringify(users))
def load_users() -> list[ProcessedUser]:
"""
Load processed user data after process_users
:return: List of processed users, sorted descending by popularity.
"""
return [ProcessedUser(*u) for u in json.loads(read(f'{USER_DIR}/processed/users.json'))]
def get_user_popularity_ranking(user: str) -> int:
"""
Get a user's popularity ranking. This is not used in data analysis, just for curiosity.
:param user: Username
:return: User's popularity ranking
"""
pop = load_users()
for i in range(len(pop)):
if pop[i].username == user:
return i + 1
return -1
@dataclass()
class UserSample:
"""
This is a data class storing our different samples.
"""
most_popular: list[ProcessedUser]
random: list[ProcessedUser]
english_news: list[str]
def select_user_sample() -> None:
"""
Select our sample of the 500 most popular users and 500 random users who meet the criteria. The
criteria we use is that the user must have at least 150 followers, and must have a number of
postings between 1000 and 3250. Analyzing someone who doesn't post or who doesn't have
enough followers for interaction might not reveal useful information. We also filter based on
language, because we only know how to identify COVID-related posts in a few languages.
The result will be stored in <user_dir>/processed/sample.json
:return: None
"""
file = f'{USER_DIR}/processed/sample.json'
# Exists
if os.path.isfile(file):
debug(f'There is already a sample generated at {file}. If you want to reselect the '
f'sample, please delete the existing sample file.')
return
# Load users
users = load_users()
# Filter by language first
users = [u for u in users if u.lang is not None and
any(lang in u.lang for lang in {'en', 'zh', 'ja'})]
# Find most popular, and exclude them from the random sample
most_popular = users[:500]
users = users[500:]
# Filter by criteria
filtered = [u for u in users if 150 < u.popularity and 1000 < u.num_postings < 3250]
debug(f'There are {len(filtered)} users who meet the criteria.')
# Sample (random.sample requires a sequence, not a set, on newer Python versions)
sample = random.sample(filtered, 500)
# Save
write(file, json_stringify(UserSample(most_popular, sample, get_english_news_channels())))
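One detail of the sampling step worth noting: `random.sample` only accepts sequences on recent Python versions (sampling from a set was deprecated in 3.9 and removed later), so the filtered pool should be a list. A minimal sketch with made-up users:

```python
import random

random.seed(0)  # deterministic for the example
# (username, popularity, num_postings) -- made-up data
users = [(f'user{i}', i * 10, 1500) for i in range(1, 200)]
# Same filter shape as in select_user_sample
filtered = [u for u in users if 150 < u[1] and 1000 < u[2] < 3250]
sample = random.sample(filtered, 5)
print(len(sample))
```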
def get_english_news_channels() -> list[str]:
"""
Find news channels that post in English from retweets of TwitterNews, combined with an
established list of 100 most influential news channels reported by Nur Bermmen from memeburn.com
Run this after download_all_tweets(api, 'TwitterNews')
Precondition:
- <tweets_dir>/user/TwitterNews.json exists.
:return: A list of news channel screen names
"""
# Find news channels in retweets from TwitterNews
news_channels = {'TwitterNews'}
for tweet in json.loads(read(f'{TWEETS_DIR}/user/TwitterNews.json')):
text: str = tweet['full_text']
if text.startswith('RT @'):
user = text[4:].split(':')[0]
news_channels.add(user)
# Find news channels from top 100 list on memeburn.com
url = 'https://memeburn.com/2010/09/the-100-most-influential-news-media-twitter-accounts/'
soup = BeautifulSoup(requests.get(url).text, 'html.parser')
users = {h.text[1:] for h in soup.select('table tr td:nth-child(2) > a')}
# Combine two sets, ignoring case (since the ids in the 100 list are all lowercased)
news_channels_lower = {n.lower() for n in news_channels}
for u in users:
if u not in news_channels_lower:
news_channels.add(u)
return list(news_channels)
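The `RT @user:` prefix parsing used above can be exercised in isolation (the tweet text below is hypothetical):

```python
def retweeted_user(full_text: str):
    """Extract the original poster's screen name from a retweet's text, or None."""
    if full_text.startswith('RT @'):
        return full_text[4:].split(':')[0]
    return None

print(retweeted_user('RT @BBCWorld: Some headline here'))  # BBCWorld
print(retweeted_user('Just a normal tweet'))               # None
```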
def filter_news_channels() -> None:
"""
Filter out news channels that don't exist anymore or have been banned by Twitter.
Precondition:
- Run this after downloading all tweets from the news channels in Step 2.3 in main.
:return: None
"""
sample = load_user_sample()
for u in list(sample.english_news):
lower = u.lower()
if not (os.path.isfile(f'{TWEETS_DIR}/processed/{lower}.json')
or os.path.isfile(f'{TWEETS_DIR}/user/{lower}.json')):
sample.english_news.remove(u)
write(f'{USER_DIR}/processed/sample.json', json_stringify(sample))
def load_user_sample() -> UserSample:
"""
Load the selected sample
:return: The loaded UserSample
"""
j = json.loads(read(f'{USER_DIR}/processed/sample.json'))
return UserSample([ProcessedUser(*u) for u in j['most_popular']],
[ProcessedUser(*u) for u in j['random']],
j['english_news'])
class Posting(NamedTuple):
"""
Posting stores the processed data of a tweet, including info such as whether the
tweet is covid-related
"""
# Whether the post is COVID-related
covid_related: bool
# Popularity of the post
popularity: int
# Is it a repost
repost: bool
# Date in ISO format
date: str
def process_tweets() -> None:
"""
Process tweets, reducing the tweets data to only the few fields defined in the Posting class:
whether the tweet is covid-related, how popular the tweet is, whether it is a repost,
and its date. The processed tweet does not contain its content.
If a user's tweets are already processed, this function will skip over that user's data.
This function will save the processed tweets data to <tweets_dir>/processed/<username>.json
:return: None
"""
# Loop through all the files
for filename in os.listdir(f'{TWEETS_DIR}/user'):
# Only check json files and ignore macos dot files
if filename.endswith('.json') and not filename.startswith('.'):
# Check if already processed
if os.path.isfile(f'{TWEETS_DIR}/processed/{filename}'):
continue
# Read
tweets = json.loads(read(f'{TWEETS_DIR}/user/{filename}'))
p = [Posting(is_covid_related(t['full_text']),
t['favorite_count'] + t['retweet_count'],
'retweeted_status' in t,
datetime.strptime(t['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
.isoformat())
for t in tweets]
# Save data
write(f'{TWEETS_DIR}/processed/{filename}', json_stringify(p))
debug(f'Processed: {filename}')
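The `created_at` format string used above matches Twitter API v1.1 timestamps. Converting one to ISO 8601 (the sample timestamp is illustrative):

```python
from datetime import datetime

created_at = 'Wed Oct 10 20:19:24 +0000 2018'  # Twitter v1.1-style timestamp
iso = datetime.strptime(created_at, '%a %b %d %H:%M:%S +0000 %Y').isoformat()
print(iso)  # 2018-10-10T20:19:24
```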
def load_tweets(username: str) -> list[Posting]:
"""
Load tweets for a specific user
:param username: User's screen name
:return: User's processed tweets
"""
return [Posting(*p) for p in json.loads(read(
os.path.join(TWEETS_DIR, f'processed/{username}.json')))]
def is_covid_related(text: str) -> bool:
"""
Is a tweet / article covid-related. Currently, this is done through keyword matching. Even
though we know that not all posts with covid-related words are covid-related posts, this is our
current best method of classification.
:param text: Text content
:return: Whether the text is covid related
"""
# English
# We're hesitant to include words like "pandemic" or "vaccine" because they might refer to other
# pandemics or other vaccines. However, I think we need to include "the pandemic" because many
# posts refer to covid only as "the pandemic."
keywords = ['covid', 'the pandemic', 'lockdown', 'spikevax', 'comirnaty', 'vaxzevria',
'coronavirus', 'moderna', 'pfizer', 'quarantine', 'vaccine', 'social distancing',
'booster shot']
# Chinese
keywords += ['新冠', '疫情', '感染', '疫苗', '隔离']
# Japanese
keywords += ['コロナ', '検疫', '三密']
return any(k in text.lower() for k in keywords)
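A quick sanity check of the keyword matcher, re-declared minimally (with only a subset of the full keyword list) so the snippet runs standalone:

```python
# Subset of the English/Chinese/Japanese keyword list used above
KEYWORDS = ['covid', 'the pandemic', 'lockdown', 'vaccine', 'quarantine',
            '疫情', 'コロナ']

def is_covid_related(text: str) -> bool:
    # Case-insensitive substring match against the keyword list
    return any(k in text.lower() for k in KEYWORDS)

print(is_covid_related('Day 3 of lockdown, send snacks'))  # True
print(is_covid_related('I love pandas'))                   # False
```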
def pack_data() -> None:
"""
This function packs processed data and raw data separately.
:return: None
"""
packed_dir = f'{DATA_DIR}/packed'
Path(packed_dir).mkdir(parents=True, exist_ok=True)
# Pack data for processed.
debug('Packing data...')
processed_dirs = ['/twitter/user/meta', '/twitter/user/processed',
'/twitter/user-tweets/processed']
with SevenZipFile(f'{packed_dir}/processed.7z', 'w') as z:
z: SevenZipFile = z
for p in processed_dirs:
debug(f'- Packing {p}')
z.writeall(DATA_DIR + p)
@@ -0,0 +1,541 @@
"""
This module uses matplotlib to visualize processed data as graphs. The results are stored in the
report directory. The graphs are created after further processing the data, for example by
filtering and removing outliers.
from datetime import timedelta
from dataclasses import dataclass, field
from typing import Optional
import matplotlib.ticker
import numpy as np
import requests
import scipy.signal
from matplotlib import pyplot as plt, font_manager
import matplotlib.dates as mdates
from matplotlib import cm
from process.twitter_process import *
from raw_collect.others import get_covid_cases_us
@dataclass()
class UserFloat:
"""
Model for which a floating point data is assigned to each user
This is used for both COVID tweet frequency and popularity ratio data, because both of these
are floating point data.
"""
name: str
data: float
class Sample:
name: str
users: list[str]
# Total frequencies of all posts for each user across all dates (sorted)
user_freqs: list[UserFloat]
# Total popularity ratios of all posts for each user across all dates (sorted)
user_pops: list[UserFloat]
# Average popularity of all of a user's posts
user_all_pop_avg: dict[str, float]
# Average popularity of COVID tweets by a specific user on a specific date
# user_covid_tweets_pop[user][date] = Average popularity of COVID-posts by {user} on {date}
user_date_covid_pop_avg: dict[str, dict[str, float]]
# Total COVID-tweets frequency on a specific date for all users.
date_covid_freq: dict[str, float]
# dates[i] = The i-th day since the first tweet
dates: list[datetime]
# date_freqs[i] = COVID frequency of all posts from all users in this sample on date[i]
date_freqs: list[float]
# date_pops[i] = Average popularity ratio of all posts from all users in this sample on date[i]
date_pops: list[float]
def __init__(self, name: str, users: list[str]):
self.name = name
self.users = users
self.calculate_sample_data()
self.calculate_change_data()
def calculate_sample_data(self) -> None:
"""
This function loads and calculates the frequency that a list of user posts about COVID, and
also calculates their relative popularity of COVID posts.
This function also creates a combined list of all users in a sample.
Frequency: the frequency that the sampled users post about COVID. For example, someone who
posted every single tweet about COVID will have a frequency of 1, and someone who doesn't
post about COVID will have a frequency of 0.
Popularity ratio: the relative popularity of the sampled users' posts about COVID. If one
person posted a COVID post and got 1000 likes, while their other posts (including this
one) got an average of 1 like, they will have a relative popularity of 1000. If,
on the other hand, one person posted a COVID post and got 1 like, while their other posts
(including this one) got an average of 1000 likes, they will have a relative popularity
of 1/1000.
To prevent divide-by-zero, we ignored everyone who didn't post about covid or who didn't
post at all.
Precondition:
- Downloaded tweets data are sorted by date
"""
debug(f'Calculating sample tweets data for {self.name}...')
popularity = []
frequency = []
date_covid_count = dict()
date_all_count = dict()
self.user_all_pop_avg = dict()
self.user_date_covid_pop_avg = dict()
for i in range(len(self.users)):
u = self.users[i]
# Show progress
if i != 0 and i % 100 == 0:
debug(f'- Calculated {i} users.')
# Load processed tweet
tweets = load_tweets(u)
# Ignore retweets, and ignore tweets that are earlier than the start of COVID
tweets = [t for t in tweets if not t.repost and t.date > '2020-01-01T01:01:01']
# Filter covid tweets
covid = [t for t in tweets if t.covid_related]
# To prevent divide by zero, ignore people who didn't post at all
if len(tweets) == 0:
frequency.append(UserFloat(u, 0))
continue
# Calculate the frequency of COVID-related tweets
freq = len(covid) / len(tweets)
frequency.append(UserFloat(u, freq))
# Calculate date fields
# Assume tweets are sorted
# tweets.sort(key=lambda x: x.date)
# Calculate popularity by date
date_cp_sum = dict()
date_cp_count = dict()
for t in tweets:
d = t.date[:10]
# For covid popularity on date
if t.covid_related:
if d not in date_cp_sum:
date_cp_sum[d] = 0
date_cp_count[d] = 0
date_cp_sum[d] += t.popularity
date_cp_count[d] += 1
# For frequency on date
if d not in date_covid_count:
date_covid_count[d] = 0
date_all_count[d] = 0
if t.covid_related:
date_covid_count[d] += 1
date_all_count[d] += 1
self.user_date_covid_pop_avg[u] = \
{d: date_cp_sum[d] / date_cp_count[d] for d in date_cp_sum}
# Calculate total popularity ratio for a user
# To prevent divide by zero, ignore everyone who didn't post about covid
if len(covid) == 0:
continue
# Get the average popularity for COVID-related tweets
covid_pop_avg = sum(t.popularity for t in covid) / len(covid)
all_pop_avg = sum(t.popularity for t in tweets) / len(tweets)
# Save global_avg
self.user_all_pop_avg[u] = all_pop_avg
# To prevent divide by zero, ignore everyone who literally have no likes on any post
if all_pop_avg == 0:
continue
# Get the relative popularity
popularity.append(UserFloat(u, covid_pop_avg / all_pop_avg))
# Calculate frequency on date
self.date_covid_freq = {d: date_covid_count[d] / date_all_count[d] for d in date_covid_count}
# Sort by relative popularity or frequency
popularity.sort(key=lambda x: x.data, reverse=True)
frequency.sort(key=lambda x: x.data, reverse=True)
# Assign to sample
self.user_freqs = frequency
self.user_pops = popularity
debug('- Done.')
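The popularity-ratio definition in the docstring reduces to a one-line computation. A worked example with made-up like counts:

```python
# Like counts for one user's posts; the first post is the only COVID-related one
covid_posts = [1000]
all_posts = [1000, 1, 1, 1]  # includes the COVID post

covid_avg = sum(covid_posts) / len(covid_posts)  # 1000.0
all_avg = sum(all_posts) / len(all_posts)        # 250.75
ratio = covid_avg / all_avg                      # > 1: COVID posts are more popular
print(round(ratio, 2))  # 3.99
```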
def calculate_change_data(self) -> None:
"""
This function calculates self.date_freqs and self.date_pops, which are lists that stores the
frequencies and popularity ratios on each date since the first tweet. This calculation
ignores users, but instead combines the tweets of the entire sample in the calculation.
More details about the calculations can be found in the report, or report_document.md
Preconditions:
- len(self.tweets) > 0
- self.tweets != None
:return: None
"""
self.dates = []
self.date_pops = []
# Average popularity ratio results over 7 days
seven_days_user_prs = []
# Loop through all dates from the start of COVID to when the data is obtained
for (ds, dt) in daterange('2020-01-01', '2021-11-25'):
self.dates.append(dt)
# Calculate date covid popularity ratio
users_posted_today = [u for u in self.users if u in self.user_date_covid_pop_avg and
ds in self.user_date_covid_pop_avg[u]]
if len(users_posted_today) == 0:
seven_days_user_prs.append([])
else:
user_prs = [self.user_date_covid_pop_avg[u][ds] / self.user_all_pop_avg[u]
for u in users_posted_today if self.user_all_pop_avg[u] != 0]
seven_days_user_prs.append(user_prs)
# Average over seven days
seven_days_count = sum(len(user_prs) for user_prs in seven_days_user_prs)
if seven_days_count == 0:
pops_i = 1
else:
user_pop_ratio_sum = sum(sum(user_prs) for user_prs in seven_days_user_prs)
pops_i = user_pop_ratio_sum / seven_days_count
# More than seven days, remove one
if len(seven_days_user_prs) > 7:
seven_days_user_prs.pop(0)
self.date_pops.append(pops_i)
# Date frequencies
self.date_freqs = map_to_dates(self.date_covid_freq,
[x.isoformat()[:10] for x in self.dates])
self.date_freqs = filter_days_avg(self.date_freqs, 3)
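`daterange` comes from utils.py, which is not part of this diff; a compatible sketch, assuming it yields (ISO date string, datetime) pairs one day at a time, inclusive of both endpoints:

```python
from datetime import datetime, timedelta

def daterange(start: str, end: str):
    """Yield (ISO date string, datetime) for every day from start to end inclusive."""
    d = datetime.fromisoformat(start)
    stop = datetime.fromisoformat(end)
    while d <= stop:
        yield d.isoformat()[:10], d
        d += timedelta(days=1)

days = list(daterange('2020-01-01', '2020-01-03'))
print([ds for ds, _ in days])  # ['2020-01-01', '2020-01-02', '2020-01-03']
```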
def load_samples() -> list[Sample]:
"""
Load samples, and report demographics
:return: Samples
"""
# Load sample, convert format
users = load_user_sample()
samples = [Sample('500-pop', [u.username for u in users.most_popular]),
Sample('500-rand', [u.username for u in users.random]),
Sample('eng-news', list(users.english_news))]
# Report demographics
keys = ['en', 'zh', 'ja']
pop_lang = [u.lang for u in users.most_popular]
rand_lang = [u.lang for u in users.random]
Reporter('sample-demographics.md')\
.table([['`500-pop`'] + [str(len(pop_lang))] + [str(pop_lang.count(k)) for k in keys],
['`500-rand`'] + [str(len(rand_lang))] + [str(rand_lang.count(k)) for k in keys]],
['Total', 'English', 'Chinese', 'Japanese'], False)
return samples
def report_top_20_tables(sample: Sample) -> None:
"""
Get top-20 most frequent or most relatively popular users and store them in a table.
:param sample: Sample
:return: None
"""
Reporter(f'freq/{sample.name}-top-20.md').table(
[[u.name, f'{u.data * 100:.1f}%'] for u in sample.user_freqs[:20]],
['Username', 'Frequency'])
Reporter(f'pop/{sample.name}-top-20.md').table(
[[u.name, f'{u.data * 100:.1f}%'] for u in sample.user_pops[:20]],
['Username', 'Popularity Ratio'])
def report_ignored(samples: list[Sample]) -> None:
"""
Report how many people didn't post about COVID, or posted about it less than 1% of the time,
across the different samples.
For popularity ratios, also report how many people were ignored because they didn't post.
:param samples: Samples
:return: None
"""
# For frequencies, report who didn't post
table = [["Total users"] + [str(len(s.users)) for s in samples],
["Users who didn't post at all"] +
[str(len([1 for a in s.user_freqs if a.data == 0])) for s in samples],
["Users who posted less than 1%"] +
[str(len([1 for a in s.user_freqs if a.data < 0.01])) for s in samples]]
Reporter('freq/didnt-post.md').table(table, [s.name for s in samples], True)
# For popularity ratio, report ignored
table = [["Ignored"] + [str(len(s.users) - len(s.user_pops)) for s in samples]]
Reporter('pop/ignored.md').table(table, [s.name for s in samples], True)
def graph_load_font() -> None:
"""
Load iosevka font for matplotlib
"""
font = Path(os.path.realpath(__file__)).absolute().parent.joinpath('iosevka-ss04-regular.ttf')
fe = font_manager.FontEntry(font, 'iosevka')
font_manager.fontManager.ttflist.insert(0, fe)
plt.rcParams["font.family"] = "iosevka"
def graph_histogram(x: list[float], path: str, title: str, freq: bool, clear_outliers: bool = False,
bins: int = 20) -> None:
"""
Plot a histogram
:param x: X axis data
:param path: Output image path (should end in .png)
:param title: Title
:param freq: Whether we are graphing frequencies data instead of popularity ratios
:param clear_outliers: Remove outliers or not
:param bins: Number of bins
:return: None
"""
if clear_outliers:
title = title + ' - No Outliers'
x = remove_outliers(x)
border_color = '#5b3300'
# Create fig ax
fig: plt.Figure
ax: plt.Axes
fig, ax = plt.subplots()
ax.margins(x=0, y=0)
# Plot
ax.set_title(title, color=border_color)
ax.hist(x, bins=bins, color='#ffcccc')
if freq:
ax.xaxis.set_major_formatter(matplotlib.ticker.PercentFormatter(1))
else:
ax.axvline(1, color='#DACAA9')
# Colors
ax.tick_params(color=border_color, labelcolor=border_color)
for spine in ax.spines.values():
spine.set_edgecolor(border_color)
# Grid
ax.grid(visible=True, axis='both')
# Save
fig.savefig(os.path.join(REPORT_DIR, path))
fig.clf()
plt.close(fig)
def graph_line_plot(x: list[datetime], y: Union[list[float], list[list[float]]], path: str,
title: str, freq: bool, n: int = 0, labels: Optional[list[str]] = None) -> None:
"""
Plot a line plot, and reduce noise using an IIR filter
:param x: X axis data
:param y: Y axis data (or Y axis data lines)
:param n: IIR filter parameter (Ignored if n <= 0)
:param path: Output image path (should end in .png)
:param freq: Whether you are graphing frequencies data instead of popularity ratios
:param title: Title
:param labels: Labels or none
:return: None
"""
# Filter
if n > 0:
b = [1.0 / n] * n
a = 1
y = scipy.signal.lfilter(b, a, y)
border_color = '#5b3300'
# Create fig ax
fig: plt.Figure
ax: plt.Axes
fig, ax = plt.subplots()
ax.margins(x=0, y=0)
# Date format
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=3))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%m\n%Y'))
ax.xaxis.set_minor_locator(mdates.MonthLocator(interval=1))
ax.xaxis.set_minor_formatter(mdates.DateFormatter('%m'))
if freq:
# Y axis percent format
ax.yaxis.set_major_formatter(matplotlib.ticker.PercentFormatter(1))
# Plot
ax.set_title(title, color=border_color)
# Plotting single data line
if isinstance(y[0], float):
ax.plot(x, y, color='#d4b595')
if freq:
# Color below curve
ax.fill_between(x, y, color='#d4b595')
else:
ax.axhline(1, color=border_color)
ax.set_ylim(0, 2)
# Plotting multiple data lines
else:
fig.set_size_inches(16, 9)
plt.tight_layout()
for i in range(len(y)):
line, = ax.plot(x, y[i])
if labels is not None and len(labels) > i:
line.set_label(labels[i])
ax.legend()
# Plotting frequency, add in the COVID cases data
if freq:
cases = get_covid_cases_us()
c = map_to_dates(cases.cases, [d.isoformat()[:10] for d in x])
# c = scipy.signal.savgol_filter(c, 45, 2)
c = filter_days_avg(c, 7)
c = scipy.signal.lfilter([1.0 / n] * n, 1, c)
twin: plt.Axes = ax.twinx()
twin.plot(x, c, color='#d4b595', label='US COVID-19 Cases')
twin.set_ylim(bottom=0)
# Plotting popularity
else:
ax.axhline(1, color=border_color)
ax.set_ylim(0, 2)
# Colors
ax.tick_params(color=border_color, labelcolor=border_color)
ax.tick_params(which='minor', colors='#e1ad6b', labelcolor='#e1ad6b')
for spine in ax.spines.values():
spine.set_edgecolor(border_color)
# Grid
ax.grid(visible=True, axis='both')
# Save
path = Path(os.path.join(REPORT_DIR, path))
path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(str(path))
fig.clf()
plt.close(fig)
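The IIR filter used above, with b = [1/n] * n and a = 1, is just an n-tap moving average; on a small input the smoothing is easy to verify by hand:

```python
import scipy.signal

n = 3
b = [1.0 / n] * n  # numerator taps: average of the last n samples
# y[k] = (x[k] + x[k-1] + x[k-2]) / 3, with x treated as 0 before the start
y = scipy.signal.lfilter(b, 1, [3.0, 3.0, 3.0, 9.0])
print([round(float(v), 2) for v in y])  # [1.0, 2.0, 3.0, 5.0]
```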
def report_histograms(sample: Sample) -> None:
"""
Report histograms of COVID posting frequencies and popularity ratios
:param sample: Sample
:return: None
"""
x = [f.data for f in sample.user_freqs]
title = f'COVID-related posting frequency for {sample.name}'
graph_histogram(x, f'freq/{sample.name}-hist-outliers.png', title, True, False, 100)
x = [p for p in x if p > 0.001]
graph_histogram(x, f'freq/{sample.name}-hist.png', title, True, True)
x = [f.data for f in sample.user_pops]
title = f'Popularity ratio of COVID posts for {sample.name}'
graph_histogram(x, f'pop/{sample.name}-hist.png', title, False, True)
def report_stats(samples: list[Sample]) -> None:
"""
Report frequencies and popularity ratios' statistics
:param samples: Samples
:return: None
"""
xs = [[d.data for d in s.user_pops] for s in samples]
table = tabulate_stats([get_statistics(x) for x in xs])
Reporter('pop/stats-with-outliers.md').table(table, [s.name for s in samples], True)
table = tabulate_stats([get_statistics(remove_outliers(x)) for x in xs])
Reporter('pop/stats.md').table(table, [s.name for s in samples], True)
xs = [[d.data for d in s.user_freqs if d.data > 0.0005] for s in samples]
table = tabulate_stats([get_statistics(x) for x in xs], percent=True)
Reporter('freq/stats.md').table(table, [s.name for s in samples], True)
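`remove_outliers` and `get_statistics` live in utils.py, outside this diff; a typical IQR-based outlier filter is sketched below as an assumption about its behavior, not the project's exact code:

```python
import numpy as np

def remove_outliers(x: list) -> list:
    """Drop values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] (an assumed implementation)."""
    q1, q3 = np.percentile(x, [25, 75])
    iqr = q3 - q1
    lo, hi = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    return [v for v in x if lo <= v <= hi]

print(remove_outliers([1, 2, 3, 4, 100]))  # [1, 2, 3, 4]
```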
def report_change_different_n(sample: Sample) -> None:
"""
Experiment with different n values for the IIR filter
:param sample: Sample
:return: None
"""
for n in range(5, 16, 5):
graph_line_plot(sample.dates, sample.date_pops, f'change/n/{n}.png',
f'COVID-posting popularity ratio over time for {sample.name} IIR(n={n})',
False, n)
def report_change_graphs(sample: Sample) -> None:
"""
Graph popularity ratio and frequency over time for a sample
:param sample: Sample
:return: None
"""
graph_line_plot(sample.dates, sample.date_pops, f'change/pop/{sample.name}.png',
f'COVID-posting popularity ratio over time for {sample.name} IIR(10)',
False, 10)
graph_line_plot(sample.dates, sample.date_freqs, f'change/freq/{sample.name}.png',
f'COVID-posting frequency over time for {sample.name} IIR(10)',
True, 10)
def report_all() -> None:
"""
Generate all reports
"""
graph_load_font()
Path(f'{REPORT_DIR}/freq').mkdir(parents=True, exist_ok=True)
Path(f'{REPORT_DIR}/pop').mkdir(parents=True, exist_ok=True)
debug('Loading samples...')
samples = load_samples()
print()
debug('Creating reports...')
report_ignored(samples)
report_stats(samples)
for s in samples:
report_top_20_tables(s)
report_histograms(s)
report_change_graphs(s)
report_change_different_n(samples[0])
graph_line_plot(samples[0].dates, [s.date_pops for s in samples], 'change/comb/pop.png',
'COVID-posting popularity ratio over time for all samples - IIR(10)', False, 10,
labels=[s.name for s in samples])
graph_line_plot(samples[0].dates, [s.date_freqs for s in samples], 'change/comb/freq.png',
'COVID-posting frequency over time for all samples - IIR(10)', True, 10,
labels=[s.name for s in samples])
if __name__ == '__main__':
report_all()
# samples = load_user_sample()
# combine_tweets_for_sample([u.username for u in samples.most_popular], '500-pop')
# combine_tweets_for_sample([u.username for u in samples.random], '500-rand')
# combine_tweets_for_sample(samples.english_news, 'eng-news')
# tweets = load_combined_tweets('500-pop')
# print(len(tweets))
# view_covid_tweets_date(tweets)
@@ -0,0 +1,29 @@
from dataclasses import dataclass
import requests
@dataclass
class CasesData:
    # cases[date in "YYYY-MM-DD"] = 7-day average of new cases around that date
    cases: dict[str, float]
    # deaths[date in "YYYY-MM-DD"] = 7-day average of new deaths around that date
    deaths: dict[str, float]
def get_covid_cases_us() -> CasesData:
"""
Get the US COVID-19 cases data from https://github.com/nytimes/covid-19-data by New York Times
:return: Cases data
"""
url = 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/rolling-averages/us.csv'
csv = requests.get(url).text.replace('\r\n', '\n').split('\n')[1:]
data = CasesData(dict(), dict())
# Parse CSV
    for line in csv:
        # Skip trailing blank lines to avoid an IndexError on split
        if not line:
            continue
        split = line.split(',')
day, cases, deaths = split[0], split[2], split[6]
data.cases[day] = float(cases)
data.deaths[day] = float(deaths)
return data
@@ -0,0 +1,308 @@
"""
This module interacts directly with the Twitter API to download tweets and users.
It contains functions related to scraping users/tweets, including:
- getting the tweets of a user
- downloading many users by checking their followers, their followers' followers, etc.
"""
import math
import random
import time
from typing import List
import tweepy
from tweepy import API, TooManyRequests, User, Tweet, Unauthorized, NotFound
from constants import TWEETS_DIR, USER_DIR
from utils import *
def tweepy_login(conf: Config) -> tweepy.API:
"""
Login to tweepy
:param conf: Config from load_config()
:return: Tweepy API object
"""
auth = tweepy.OAuthHandler(conf.consumer_key, conf.consumer_secret)
auth.set_access_token(conf.access_token, conf.access_secret)
api: tweepy.API = tweepy.API(auth)
return api
def get_tweets(api: API, name: str, rate_delay: float, max_id: Union[int, None]) -> List[Tweet]:
"""
Get tweets and wait for delay
:param api: Tweepy API object
:param name: Screen name
:param rate_delay: Seconds of delay per request
:param max_id: Max id of the tweet or none
:return: Tweets list
"""
tweets = api.user_timeline(screen_name=name, count=200, tweet_mode='extended', trim_user=True,
max_id=max_id)
time.sleep(rate_delay)
return tweets
def download_all_tweets(api: API, screen_name: str,
download_if_exists: bool = False) -> None:
"""
Download all tweets from a specific individual to a local folder.
Data Directory
--------
It will download all tweets to ./data/twitter/user-tweets/user/<screen_name>.json
Twitter API Reference
--------
It will be using the API endpoint api.twitter.com/statuses/user_timeline (Documentation:
https://developer.twitter.com/en/docs/twitter-api/v1/tweets/timelines/api-reference/get-statuses-user_timeline)
This endpoint has a rate limit of 900 requests / 15-minutes = 60 rpm for user auth, and it has a
limit of 100,000 requests / 24 hours = 69.44 rpm independent of authentication method. To be
safe, this function uses a rate limit of 60 rpm.
:param api: Tweepy API object
:param screen_name: Screen name of that individual
:param download_if_exists: Whether or not to download if it already exists (Default: False)
:return: None
"""
# Ensure directories exist
file = f'{TWEETS_DIR}/user/{screen_name}.json'
# Check if user already exists
if os.path.isfile(file):
if download_if_exists:
debug(f'!!! User tweets data for {screen_name} already exists, but overwriting.')
else:
debug(f'User tweets data for {screen_name} already exists, skipping.')
return
debug(f'Downloading user tweets for {screen_name}')
# Rate limit for this endpoint is 60 rpm for user auth and 69.44 rpm for app auth.
rate_delay = calculate_rate_delay(60)
# Get initial 200 tweets
try:
tweets = get_tweets(api, screen_name, rate_delay, None)
except Unauthorized:
debug(f'- {screen_name}: Unauthorized. Probably a private account, ignoring.')
return
except NotFound:
debug(f'- {screen_name}: Not found. Probably a deleted account, ignoring.')
return
# This person has no tweets, done. (By the way, we discovered that @lorde has no tweets but has
# 7 million followers... wow!)
if len(tweets) == 0:
write(file, '[]')
return
# Get additional tweets
while True:
# Try to get more tweets
debug(f'- {screen_name}: {len(tweets)} tweets...')
additional_tweets = get_tweets(api, screen_name, rate_delay, int(tweets[-1].id_str) - 1)
# No more tweets
if len(additional_tweets) == 0:
debug(f'- {screen_name}: {len(tweets)} tweets, no more tweets are available.\n')
break
# Add tweets to the list
tweets.extend(additional_tweets)
# Store in file
    # Even though we are not supposed to use internal fields, there isn't an efficient way of
    # obtaining the JSON without this field. Using t.__dict__ would include the API object, which
    # is not serializable.
write(file, json_stringify([t._json for t in tweets]))
def download_users_start(api: API, start_point: str, n: float = math.inf) -> None:
"""
This function downloads n twitter users by using a friends-chain.
Since there isn't an API or a database with all twitter users, we can't obtain a strict list
of all twitter users, nor can we obtain a list of strictly random or most popular twitter
    users. Therefore, we use the method of follows-chaining: we start from a specific individual,
    obtain their friends list, and pick 6 individuals (3 random, 3 most followed) from it. Then,
    we repeat the process for each of the selected friends.
In reality, this method will be biased toward individuals that are worthy of following since
"friends" are the list of users that someone followed.
Data Directory
--------
It will download all user data to ./data/twitter/user/users/<screen_name>.json
It will save meta info to ./data/twitter/user/meta/
Twitter API Reference
--------
It will be using the API endpoint api.twitter.com/friends/list (Documentation:
https://developer.twitter.com/en/docs/twitter-api/v1/accounts-and-users/follow-search-get-users/api-reference/get-friends-list)
This will limit the rate of requests to 15 requests in a 15-minute window, which is one request
per minute. But it is actually the fastest method of downloading a wide range of users on
    twitter because it can download a maximum of 200 users at a time, while the API for
    downloading a single user is limited to 900 queries per 15 minutes, which is only 60 users
    per minute.
There is another API endpoint that might do the job, which is api.twitter.com/friends/ids (Doc:
https://developer.twitter.com/en/docs/twitter-api/v1/accounts-and-users/follow-search-get-users/api-reference/get-friends-ids)
However, even though this endpoint has a much higher request rate limit, it only returns user
ids and not full user info.
Parameters
--------
:param api: Tweepy's API object
:param start_point: Starting user's screen name.
:param n: How many users do you want to download? (Default: math.inf)
:return: None
"""
# Set of all the downloaded users' screen names
downloaded = set()
# The set of starting users that are queried.
done_set = set()
# The set of starting users currently looping through
current_set = {start_point}
# The next set of starting users
next_set = set()
# Start download
download_users_execute(api, n, downloaded,
done_set, current_set, next_set)
def download_users_resume_progress(api: API) -> None:
"""
Resume from started progress
:param api: Tweepy's API object
:return: None
"""
# Open file and read
meta = json.loads(read(f'{USER_DIR}/meta/meta.json'))
# Resume
download_users_execute(api, meta['n'],
set(meta['downloaded']), set(meta['done_set']),
set(meta['current_set']), set(meta['next_set']))
def download_users_execute(api: API, n: float,
downloaded: set[str], done_set: set[str],
current_set: set[str], next_set: set[str]) -> None:
"""
Execute download from the given parameters. The download method is defined in the document for
the download_users function.
    Resume functionality is necessary because twitter limits the rate of get friends list to 15
    requests in a 15-minute window (1 request per minute), so it takes a long time to gather
    enough data, and we don't want to start over from the beginning when something goes wrong.
:param api: Tweepy's API object
:param n: How many users do you want to download?
:param downloaded: Set of all the downloaded users' screen names
:param done_set: The set of starting users that are queried
:param current_set: The set of starting users currently looping through
:param next_set: The next set of starting users
:return: None
"""
# Rate limit for this API endpoint is 1 request per minute, and rate delay defines how many
# seconds to sleep for each request.
rate_delay = calculate_rate_delay(1) + 1
print("Executing friends-chain download:")
print(f"- n: {n}")
print(f"- Requests per minute: 1")
print(f"- Directory: {USER_DIR}")
print(f"- Downloaded: {len(downloaded)}")
print(f"- Current search set: {len(current_set)}")
print(f"- Next search set: {len(next_set)}")
print()
# Loop until there are enough users
while len(downloaded) < n:
# Take a screen name from the current list
screen_name = current_set.pop()
try:
# Get a list of friends.
friends: List[User] = api.get_friends(screen_name=screen_name, count=200)
except TooManyRequests:
# Rate limited, sleep and try again
debug('Caught TooManyRequests exception: Rate limited, sleep and try again.')
time.sleep(rate_delay)
current_set.add(screen_name)
continue
# Save users
for user in friends:
            # This user was not saved yet, save the user. (Compare screen names, since
            # `downloaded` is a set of screen names, not User objects.)
            if user.screen_name not in downloaded:
# Save user json
write(f'{USER_DIR}/users/{user.screen_name}.json', json_stringify(user._json))
# Add to set
downloaded.add(user.screen_name)
# debug(f'- Downloaded {user.screen_name}')
# Get users and their popularity that we haven't downloaded
screen_names = [(u.screen_name, u.followers_count) for u in friends
if u.screen_name not in done_set and not u.protected]
# Sort by followers count, from least popular to most popular
screen_names.sort(key=lambda x: x[1])
# Add 3 random users to the next set
if len(screen_names) > 3:
samples = {u[0] for u in random.sample(screen_names, 3)}
else:
samples = {u[0] for u in screen_names}
# Add 3 most popular users that we haven't downloaded to the next set
while len(screen_names) > 0 and len(samples) < 6:
most_popular = screen_names.pop()[0]
if most_popular not in done_set and most_popular not in samples:
samples.add(most_popular)
# Add the selected users to the next set
for s in samples:
next_set.add(s)
# Change name lists
if len(current_set) == 0:
current_set = next_set
next_set = set()
# This one is done
done_set.add(screen_name)
# Update meta info so that downloading can be continued
        meta = {'downloaded': list(downloaded), 'done_set': list(done_set),
                'current_set': list(current_set), 'next_set': list(next_set), 'n': n}
write(f'{USER_DIR}/meta/meta.json', json_stringify(meta))
debug(f'Finished saving friends of {screen_name}')
debug(f'============= Total {len(downloaded)} saved =============')
# Rate limit
time.sleep(rate_delay)
if __name__ == '__main__':
# python_ta.check_all(config={
# 'max-line-length': 100,
# 'disable': ['R1705', 'C0200', 'E9998', 'E9999']
# })
config = load_config('config.json5')
tweepy_api = tweepy_login(config)
# download_users_start(tweepy_api, 'sauricat')
download_users_resume_progress(tweepy_api)
@@ -0,0 +1,144 @@
import json
import os.path
import traceback
import webbrowser
from distutils.dir_util import copy_tree
from pathlib import Path
from flask import Flask, send_from_directory, Response
from constants import REPORT_DIR, DEBUG
from utils import read, write
# Constants
src_dir = Path(os.path.realpath(__file__)).parent
def generate_report() -> str:
"""
Compile the report document and generate a markdown report
:return: Markdown report
"""
# Load markdown
md = read(str(src_dir.joinpath('report_document.md'))).replace('\r\n', '\n').split('\n')
# Process line by line
for i in range(len(md)):
line = md[i]
if not line.startswith('@include'):
continue
# Process @include statements
try:
path = line[line.index('`') + 1:]
path = path[:path.index('`')]
md[i] = read(REPORT_DIR + path)
# Cut lines
            # Format: @include-cut `path` <start, inclusive> [end, exclusive]
if line.startswith('@include-cut'):
args = [int(i) for i in line.split()[2:]]
if len(args) == 1:
md[i] = '\n'.join(md[i].split('\n')[args[0]:])
if len(args) == 2:
md[i] = '\n'.join(md[i].split('\n')[args[0]:args[1]])
# Specific lines
# Format: @include-lines `path` <...lines>
# Example: @include-lines `path` 1 2 5
if line.startswith('@include-lines'):
args = [int(i) for i in line.split()[2:]]
lines = md[i].split('\n')
lines = [lines[ln] for ln in range(len(lines)) if ln in args]
md[i] = '\n'.join(lines)
# Handle errors. (It prompts "too broad an exception clause" but I actually need to catch
# every possible exception.)
        except Exception:
md[i] = f"<pre class=\"error\">" \
f"\nInvalid @include statement. \n{traceback.format_exc()}</pre>"
return '\n'.join(md)
def generate_html() -> str:
"""
Generate report then put it into the HTML template
:return: HTML string
"""
    # Generate the markdown report and JSON-encode it (a JSON string literal is also valid JS!)
md_json = json.dumps({'content': generate_report()})
# Inject into HTML
html = read(str(src_dir.joinpath('report_page.html'))) \
.replace('`{{markdown}}`', md_json)
return html
def write_html() -> None:
"""
Write HTML and copy files to ./dist
:return: None
"""
    if os.path.isdir('./dist'):
        # os.remove fails on directories; use shutil.rmtree to delete the old build
        import shutil
        shutil.rmtree('./dist')
Path('./dist/resources').mkdir(parents=True, exist_ok=True)
write('./dist/index.html', generate_html())
copy_tree(str(src_dir.joinpath('resources/').absolute()), './dist/resources')
copy_tree(REPORT_DIR, './dist')
def serve_report() -> None:
"""
Serve report page in an http server.
:return: None
"""
# Create flask app
app = Flask(__name__)
html = generate_html()
@app.route('/')
def root() -> str:
"""
Root webpage. If debug mode is enabled, generate new HTML every time the web page is
accessed. Else, serve the generated HTML.
:return: HTML report
"""
if DEBUG:
return generate_html()
else:
return html
@app.route('/<path:path>')
def res(path: str) -> Response:
"""
Resources endpoint. This maps report queries to the report directory
:param path: Path of the resource
:return: File resource or 404
"""
return send_from_directory(Path(REPORT_DIR).absolute(), path)
@app.route('/resources/<path:path>')
def js_res(path: str) -> Response:
"""
JS Resource endpoint. This maps JS and CSS queries to the resources directory
:param path: Path of the resource
:return: File resource or 404
"""
return send_from_directory(os.path.join(src_dir, 'resources'), path)
# Run app
webbrowser.open("http://localhost:8080")
app.run(port=8080)
if __name__ == '__main__':
write_html()
serve_report()
@@ -0,0 +1,213 @@
# Shifting Interest in COVID-19 Twitter Posts
# Introduction
We have observed increasingly more voices talking about COVID-19 since the start of the pandemic. However, different groups of people might view the importance of discussing the pandemic differently. For example, we don't know whether the most popular people on Twitter are more or less inclined to post COVID-related content than the average Twitter user. Also, while some audiences find this content interesting, others quickly scroll through it. **So, we aim to compare people's interest in posting coronavirus content, and the audience's interest in viewing it, between different groups.** Also, with recent developments and policy changes toward COVID-19, it is unclear how people's discussions will react. Some people might believe that the pandemic is starting to end, so discussing it seems increasingly unnecessary, while others might find these policy changes controversial and want to voice their opinions even more. And even though COVID-related topics are almost always on the news, some news outlets might intentionally cover them more frequently than others. Among people watching the news, some find these reports interesting, while others can't help but switch channels. So how people's interest in discussing or listening to COVID-related topics changes over time is not very clear. **Our second goal is to analyze how people's interest in COVID-related topics has changed and how frequently people have discussed COVID-related issues in the two years since the pandemic started.**
# Method
## Demographics
Our data come from three samples:
* `500-pop`: The list of 500 most followed users on Twitter who post in English, Chinese, or Japanese.
* `500-rand`: A sample of 500 random users on Twitter who post in English, Chinese, or Japanese with at least 1000 posts and at least 150 followers.
* `eng-news`: A list of 100 top news Twitter accounts by Nur Bremmen [[1]](#ref1), combined with all news accounts which TwitterNews reposted. All of them post in English, and most of them target audiences in North America.
We also counted the number of people speaking each language:
@include `/sample-demographics.md`
## Data Collection
1. To create our samples, we collected a wide range of Twitter users using Twitter's get friends list API endpoint [(documentation)](https://developer.twitter.com/en/docs/twitter-api/v1/accounts-and-users/follow-search-get-users/api-reference/get-friends-list) and the follows-chaining technique. We specified a single user as the starting point, obtained the user's friends list, picked 3 random users and the 3 most followed users from that list, added them to the queue, and repeated the process from each of them. Because of Twitter's rate limiting on the get friends list endpoint, we could only obtain a maximum of 200 users per minute, many of them duplicates. We ran the program for one day and obtained 224,619 users (852.3 MB decompressed). However, only the username, popularity, post count, and language data are kept after processing (filtering). The processed user dataset `data/twitter/user/processed/users.json` is 7.9 MB in total. We selected our samples by first filtering the results based on language, selecting the top 500 most followed users as `500-pop`, filtering the list again based on post count (>1000) and follower count (>150), and then selecting a random sample of 500 users as `500-rand`.
2. We also downloaded all tweets from our sampled users through the user-timeline API [(documentation)](https://developer.twitter.com/en/docs/twitter-api/v1/tweets/timelines/api-reference/get-statuses-user_timeline). Due to rate limiting, the program took around 16 hours to finish, and we obtained 7.7 GB of raw data (uncompressed). During processing, for each tweet, we extracted only its date, popularity (likes + retweets), whether it is a retweet, and whether it is COVID-related. The text of the tweets is not retained, and the processed data directory `data/twitter/user-tweets/processed/` is 141.6 MB in total.
3. We also used the COVID-19 daily cases data published by The New York Times [[3]](#ref3) to compare against the peaks and troughs in our frequency-over-date graphs.
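The rate limits described in steps 1 and 2 are enforced in code by sleeping a fixed delay after every request. A minimal sketch of that conversion (the project's actual `calculate_rate_delay` helper in `utils.py` is not shown in this excerpt, so this is an assumed implementation):

```python
def calculate_rate_delay(rpm: float) -> float:
    """Convert a requests-per-minute budget into seconds of delay per request.

    Assumed implementation; the project's utils.calculate_rate_delay is not
    shown in this excerpt.
    """
    return 60.0 / rpm

print(calculate_rate_delay(60))  # user-timeline endpoint: 1.0 s between requests
print(calculate_rate_delay(1))   # friends/list endpoint: 60.0 s between requests
```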
## Computation & Filtering
To analyze the frequencies and relative popularity of COVID-related posting, either across all posts from a specific user or across many users in a sample on a specific date, we defined several formulas. First, we define the terms used in the following sections:
* **Frequency**: The percentage of COVID-related posts among all posts, showing how frequently COVID-related content is posted.
* **Popularity**: The integer value representing the popularity of a post, measured by the total number of user interactions on a post, which is the number of likes and comments on a tweet combined.
* **Popularity Ratio**: A relative popularity value between 0 and infinity measuring how popular a user's COVID-posts are compared to all of the user's posts, computed as the ratio of the average popularity of COVID-posts to the average popularity of all posts. If COVID-posts are more popular, this value is greater than 1; if they are less popular, it is less than 1. Since follower counts and interaction rates differ wildly between users, we cannot assume that popularity is comparable between users, so raw popularity is only compared within a user, while the popularity ratio can be compared across users.
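These definitions can be sketched in code (a minimal illustration using hypothetical `(popularity, is_covid)` tuples per post, not the project's actual implementation):

```python
from typing import Optional

def user_freq(posts: list[tuple[int, bool]]) -> float:
    """Fraction of a user's posts that are COVID-related.

    Each post is a hypothetical (popularity, is_covid) tuple; users with
    no posts get a frequency of 0.
    """
    if not posts:
        return 0.0
    return sum(1 for _, covid in posts if covid) / len(posts)

def user_pop_ratio(posts: list[tuple[int, bool]]) -> Optional[float]:
    """Average popularity of COVID-posts over average popularity of all posts.

    Returns None (user ignored) when any denominator would be zero.
    """
    covid = [pop for pop, is_covid in posts if is_covid]
    if not posts or not covid or sum(pop for pop, _ in posts) == 0:
        return None
    avg_covid = sum(covid) / len(covid)
    avg_all = sum(pop for pop, _ in posts) / len(posts)
    return avg_covid / avg_all

posts = [(100, False), (300, True), (200, False), (100, True)]
print(user_freq(posts))       # 0.5
print(user_pop_ratio(posts))  # (400/2) / (700/4) ≈ 1.14
```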
### 1. Computation - User Analysis
In the first section, we used the following formulas to calculate statistical distributions of the frequencies and popularity ratios of users in a sample:
<blockquote>
$$ \text{freq}_{u} = \frac{|\text{COVID-posts by } u|}{|\text{All posts by } u|} $$
</blockquote>
<blockquote>
$$ \text{pop_ratio}_{u} = \left(\frac{\sum\text{Popularity of COVID-posts by } u}{|\text{COVID-posts by } u|}\right) / \left(\frac{\sum \text{Popularity of all posts by } u}{|\text{All posts by } u|}\right) $$
</blockquote>
The frequency equation can divide by zero if the user has zero posts, and it is logical to assign a frequency of 0 when the user didn't post anything. However, it is not sensible to assign a popularity ratio of zero when the pop_ratio equation divides by zero. There are three divisions in the pop_ratio equation, so there are three places where it might divide by zero. To prevent division by zero, users who didn't post about COVID-19, who didn't post anything at all, or who have literally 0 popularity across all of their posts are ignored. In our data, the following numbers of users are ignored for each sample:
@include `/pop/ignored.md`
Then, the users' results are graphed in one histogram per sample to gain some insight into the distribution of user frequencies. However, for two of our samples there are many outliers, and more than half of the users posted below 0.1%, making the graphs unreadable: (You can click on the images to enlarge them, and hold down E to view full screen)
<div class="image-row">
<div><img src="/freq/500-pop-hist-outliers.png" alt="hist"></div>
<div><img src="/freq/500-rand-hist-outliers.png" alt="hist"></div>
<div><img src="/freq/eng-news-hist-outliers.png" alt="hist"></div>
</div>
For example, even though most of `500-rand` are concentrated below 10%, the x-axis scale is stretched to 50% by many outliers who post more than 40%:
@include-cut `/freq/500-rand-top-20.md` 0 8
To resolve this, the outliers are removed, both for frequencies and popularity ratios, using the method proposed by Boris Iglewicz and David Hoaglin (1993) [[2]](#ref2), and for frequencies, everyone who posted below 0.1% is ignored when graphing the histograms. These users are still included in the statistics calculations.
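The Iglewicz and Hoaglin method flags values whose modified Z-score, based on the median absolute deviation (MAD), exceeds 3.5. A minimal sketch of the idea, independent of the project's actual implementation:

```python
import statistics

def remove_outliers(values: list[float], threshold: float = 3.5) -> list[float]:
    """Drop outliers using the modified Z-score of Iglewicz & Hoaglin (1993).

    M_i = 0.6745 * (x_i - median) / MAD; values with |M_i| > threshold are
    treated as outliers. Illustrative sketch, not the project's exact code.
    """
    med = statistics.median(values)
    mad = statistics.median(abs(x - med) for x in values)
    if mad == 0:
        return values  # no spread, nothing can be flagged
    return [x for x in values if abs(0.6745 * (x - med) / mad) <= threshold]

freqs = [0.01, 0.02, 0.015, 0.018, 0.012, 0.45]
print(remove_outliers(freqs))  # the 0.45 outlier is removed
```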
### 2. Computation - Change Analysis
The second section analyzes data separately for each of our samples, just like the first section. However, unlike the first section, which separates calculations by user, the second section separates calculations by date and combines the users in a sample. We defined the start of COVID-19 as _2020-01-01_ and ignored all posts prior to this date. Then, the average frequency and popularity ratio are calculated for every day since _2020-01-01_. This calculation gives us a list `freqs` and a list `pops` where, for every date `dates[i]`,
<blockquote>
$$ \text{freq}_i = \frac{|\text{COVID-posts on date}_{i}|}{|\text{All posts on date}_{i}|} $$
</blockquote>
<blockquote>
$$ \text{pop_ratio}_i = \frac{ \sum_{u \in \text{Users}} \left(\frac{\sum\text{Popularity of u's COVID-posts on date}_i}{|\text{u's COVID-posts on date}_i| \cdot (\text{Average popularity of all u's posts})}\right)}{(\text{Number of users posted on date}_i)} $$
</blockquote>
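The per-date frequency formula above can be sketched as follows (hypothetical `("YYYY-MM-DD", is_covid)` post tuples are assumed for illustration; the project's real code works on its processed tweet records):

```python
from collections import defaultdict

def daily_freqs(posts: list[tuple[str, bool]]) -> dict[str, float]:
    """Per-date fraction of COVID-posts: |COVID-posts on date| / |all posts on date|.

    Posts are hypothetical ("YYYY-MM-DD", is_covid) tuples; dates before
    2020-01-01 are ignored, as described in the text.
    """
    total: dict[str, int] = defaultdict(int)
    covid: dict[str, int] = defaultdict(int)
    for date, is_covid in posts:
        if date < '2020-01-01':
            continue
        total[date] += 1
        covid[date] += is_covid
    return {d: covid[d] / total[d] for d in sorted(total)}

posts = [('2020-03-01', True), ('2020-03-01', False),
         ('2020-03-02', True), ('2019-12-31', True)]
print(daily_freqs(posts))  # {'2020-03-01': 0.5, '2020-03-02': 1.0}
```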
After calculation, `freqs` and `pops` are plotted in line graphs against `dates`. Initially, we saw graphs with very high peaks, such as the graph below. After some investigation, we found that these peaks are caused by not having enough tweets on each day to average out the random error of one single popular tweet. For example, in the graph below, we adjusted the program to print different users' popularity ratios whenever we found an average popularity ratio greater than 20, which produced the output on the right. As it turns out, on 2020-07-11, the user @juniorbachchan posted that he and his father tested positive, and that single post is 163.84 times more popular than the average of all his posts. (The post is linked [here](https://twitter.com/juniorbachchan/status/1282018653215395840); it has 235k likes, 25k comments, and 32k retweets.) Even though these data points are outliers, there isn't an effective way of removing them, since we don't have enough tweet data from each user on each day to calculate their range (for example, someone's COVID-related post might be the only one they've posted). So, we decided to limit the viewing window to `y = [0, 2]`, as shown in the graph on the right.
<div class="image-row">
<div><img src="resources/peak-1.png" alt="graph"></div>
<div style="display: flex; flex-direction: column; justify-content: center"><pre>
Date: 2020-07-11
- JoeBiden 1.36
<span class="highlight">- juniorbachchan 163.84</span>
- victoriabeckham 0.80
- anandmahindra 7.66
- gucci 0.13
- StephenKing 0.61
</pre></div>
<div><img src="resources/peak-2.png" alt="graph"></div>
</div>
Then, we encountered the issue of noise. When we plotted the graph without a filter, we found it very noisy, so we decided to average the results over 7 days. We also experimented with different filters from the `scipy` library and different parameter values, and chose an IIR filter with `n = 10`.
<div class="image-row">
<div><img src="/change/n/5.png" alt="graph"></div>
<div><img src="/change/n/10.png" alt="graph"></div>
<div><img src="/change/n/15.png" alt="graph"></div>
</div>
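As a rough illustration of the smoothing step (the exact scipy filter design and the meaning of its `n` parameter are not shown in this excerpt, so this is an assumed stand-in), a simple single-pole IIR smoother whose strength grows with `n` behaves similarly:

```python
def iir_smooth(values: list[float], n: int) -> list[float]:
    """Single-pole IIR smoother: y[i] = a*x[i] + (1 - a)*y[i-1], with a = 1/n.

    Illustrative stand-in for the scipy IIR filter used in the report;
    larger n means heavier smoothing (and more lag).
    """
    if not values:
        return []
    a = 1.0 / n
    out = [values[0]]
    for x in values[1:]:
        out.append(a * x + (1 - a) * out[-1])
    return out

# A noisy alternating series is damped toward its running average
noisy = [0.0, 1.0] * 10
smoothed = iir_smooth(noisy, 10)
print(max(smoothed) - min(smoothed))  # much smaller than the input's range of 1.0
```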
# Results
## User Analysis
This section ignores dates and focuses on user differences within our samples, answering the first part of our research question: **how frequently do people post about COVID-related issues, and how interested are people in seeing COVID-related posts?**
### 1. User Posting Frequency
First, the users' COVID-related posting frequencies in these three datasets are analyzed. Initially, we expected that most people would post coronavirus content because this pandemic is relevant to everyone. However, many people in our samples didn't post about COVID-19 at all. The following table shows how many people in each sample didn't post about COVID-19, or posted about it less than 1% of the time:
@include `/freq/didnt-post.md`
The `eng-news` sample has the lowest number of users without COVID-related posts, the `500-rand` sample has the highest, and `500-pop` sits in between. The large difference between `eng-news` and the rest can be explained by the news channels' obligation to report news, which includes news about new outbreaks, the progress of vaccination, new cross-border policies, etc. Also, `500-pop` has many more users who posted COVID-related content than `500-rand`, while the two have similar numbers of users posting less than 1%. This finding might be explained by influential people having more incentive to express their support for slowing the spread of the pandemic than regular users, which doesn't require posting as frequently as news channels do.
Then, the calculated frequency data for each user in a sample are graphed in histograms:
<div class="image-row">
<div><img src="/freq/500-pop-hist.png" alt="hist"></div>
<div><img src="/freq/500-rand-hist.png" alt="hist"></div>
<div><img src="/freq/eng-news-hist.png" alt="hist"></div>
</div>
As expected, the distributions look right-skewed, with most people not posting very much. One interesting distinction is that, even though the distributions follow similar shapes, the x-axis ticks of `eng-news` are actually ten times larger than those of the other two, which means that `eng-news` accounts post a lot more about COVID-19 on average than the other two samples. Statistics of the samples are calculated to further verify these insights:
@include-lines `/freq/stats.md` 0 1 4 5 6 7
Since there are many outliers, the medians and IQRs more accurately represent the center and spread of these distributions. As these numbers show, `eng-news` does post much more (a 6.1 percentage point increase in posting frequency, or a 406.7% relative increase) than the other two samples. Again, this can be explained by the news channels' obligation to report news related to COVID-19 and to promote methods to slow the spread of the pandemic. These medians also show that half of average Twitter users dedicate less than 1.5% of their timeline to COVID-related posts.
### 2. User Popularity Ratios
Similar histograms are graphed, and statistics are calculated, for users' popularity ratios in each sample, using the formula described in the methods section:
<div class="image-row">
<div><img src="/pop/500-pop-hist.png" alt="hist"></div>
<div><img src="/pop/500-rand-hist.png" alt="hist"></div>
<div><img src="/pop/eng-news-hist.png" alt="hist"></div>
</div>
Looking at the histograms, while `eng-news` is roughly symmetric, the other two distributions are right-skewed.
@include-lines `/pop/stats.md` 0 1 4 5 6 7
The calculated medians show that, for all three groups, audiences normally don't like or comment on COVID-related posts as much as on other posts, which implies that people aren't as interested in these posts. The average Twitter user's and the average English news channel's COVID-posts have only 87% of the popularity of their other posts, while the average `500-pop` user's have only 69%. This difference is possibly because the most popular users' audiences followed them for the specific types of content that only they can post, not for general COVID-related content that anyone can post.
Also, even though the medians for `500-rand` and `eng-news` are the same, since the `500-rand` distribution is right-skewed, its 25th percentile is much lower: 25% of average Twitter users' COVID-posts are only 34% as popular as their other posts.
## Change Analysis
After answering how frequently people posted about COVID-19 and how interested people are in viewing these posts, we analyze our data over posting dates to answer the second part of our research question: **how did posting frequency and people's interest in COVID-19 posts change from the beginning of the pandemic to now?**
### 1. Posting Frequency Over Time
We graphed the posting frequencies of our three samples in line graphs with the x-axis being the date with labels representing the month, which gave us the following graphs:
<div class="image-row">
<div><img src="/change/freq/500-pop.png" alt="graph"></div>
<div><img src="/change/freq/500-rand.png" alt="graph"></div>
<div><img src="/change/freq/eng-news.png" alt="graph"></div>
</div>
Looking at the three graphs individually, posting rates were almost zero for all three samples during the first two months after COVID-19 first appeared, which is expected because no one knew how devastating it would be at the time. Then, all three samples had a peak in posting frequency from March to June 2020. After June 2020, the posting rate for both `500-rand` and `eng-news` declined to around 1/3 of the peak, with `500-pop` declining slightly as well. While the reason for this decline is unclear, we speculate that it might be caused by people's loss of interest in the topic as they realized COVID-19 wasn't going to be a disaster that fades away quickly, or as the news became less "breaking" and information started to repeat. According to the selective attention theory of cognitive psychology, attention to one thing comes at the expense of others, since our attention is very limited. So people might have chosen to direct more attention to daily life rather than to a coronavirus that didn't seem to be going away soon. Also, similar to how people unconsciously learn to ignore repeated background noise after moving to a new environment (a process called habituation), they might have learned to ignore the repeated information about COVID-19, which would lead to less COVID-related posting. Further research can determine whether this three-month attention span generalizes to long-term disasters other than COVID-19.
After June 2020, `500-rand` continued declining steadily without major peaks, while `eng-news` had a smaller peak around Dec 2020 and a trough after June 2021, and `500-pop` had many peaks and troughs afterward. In an effort to interpret these peaks, we overlaid the three charts with the data on new COVID-19 cases in the U.S. published by The New York Times [[3]](#ref3), which gave us the following graph:
<div class="image-row">
<div><img src="/change/comb/freq.png" alt="graph" class="large"></div>
</div>
In this graph, we can see that the peak around Dec 2020 and the trough around Jun 2021 in `eng-news` and `500-pop` correspond very closely with the rise and fall of new cases in the U.S., which is reasonable because there is more sensational news to report, and more COVID-related events happening to popular individuals, when cases are high. However, even though the first peak in cases around August 2020 did correlate with a peak in `500-rand`, the rise and fall of cases in the U.S. doesn't seem to affect `500-rand` overall. This is possibly because we included three languages in the population of our random sample, which means that `500-rand` isn't limited to English-speaking accounts that mostly target a U.S. audience, as `eng-news` is.
### 2. Popularity Ratio Over Time
We plotted a similar graph with popularity ratio on the y-axis and date on the x-axis, as shown below:
<div class="image-row">
<div><img src="/change/comb/pop.png" alt="graph" class="large"></div>
</div>
Despite the efforts to filter out noise and normalize the graph discussed in the [method](#method) section, we did not find any patterns in the resulting graph. The peaks and troughs of each line seem random, and the three lines do not share common peaks or troughs that might reveal meaningful insights. The raw data looks very much like random noise as well. This lack of meaningful information is possibly because our sample size is comparatively small: even though we have 500 users in our `500-pop` sample, the number of tweets by these users on any specific day is very small. For example, only 6 users in `500-pop` posted on `2020-07-11`. This lack of samples amplifies the effect of randomness, and more data may be needed to reduce the effect of a single tweet on the popularity ratio for a given date. Unfortunately, we have to conclude that more data is needed to reveal interesting findings.
# Conclusion
In summary, key findings of our research include that while news channels post about COVID-19 frequently (Median = 7.6%), average Twitter users and the most popular users don't post about it very much (Median ≤ 1.5%). And while the COVID-posting frequencies of `eng-news` and `500-pop` fluctuate with the number of new cases in the U.S., average Twitter users' COVID-posting frequency dropped after June 2020 and has continued to decrease since. These posts were also not as popular (not liked or commented on as much) as the users' other posts (Median ≤ 0.87).
These findings might not be surprising, but they again demonstrate people's ability to adapt to new environments. The sensational effect of the start of COVID-19 might be similar to the grief of losing something important: both fade over time as we adapt. Even though people focused a great deal of attention on COVID-19 when new information first became available in March 2020, their interest decreased within about three months as they adapted to the new norm, as demonstrated by the quickly decreasing posting rates. Or, from the audience's side, rather than liking or commenting on COVID-19 posts, they might have quickly scrolled past them in favor of more interesting posts. It is fascinating that we can adapt to such a devastating change in our environment in only three months.
## TODO
* [ ] Frequency/time: Maybe there's a reason for the May 2021 peak?
* [ ] Followers (x) vs COVID-related posts (y) scatter plot, each point is a user
# References
<a id="ref1"></a>
[1] Bremmen, N. (2010, September 3). The 100 most influential news media twitter accounts. _Memeburn_. Retrieved November 27, 2021, from https://memeburn.com/2010/09/the-100-most-influential-news-media-twitter-accounts/.
<a id="ref2"></a>
[2] Iglewicz, B., & Hoaglin, D. (1993). Volume 16: How to detect and handle outliers. In E. F. Mykytka (Ed.), _The ASQC Basic References in Quality Control: Statistical Techniques_.
<a id="ref3"></a>
[3] The New York Times. (2021). Coronavirus (Covid-19) Data in the United States. Retrieved November 27, 2021, from https://github.com/nytimes/covid-19-data.
<a id="ref4"></a>
[4] WHO. (n.d.) _Listings of WHO's Response to COVID-19._ World Health Organization. Retrieved November 27, 2021, from https://www.who.int/news/item/29-06-2020-covidtimeline.
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>CSC110 Report</title>
<link rel="stylesheet" href="resources/style.css">
</head>
<body>
<div id="content">
</div>
<script src="resources/marked.min.js"></script>
<script src="resources/jquery.min.js"></script>
<script src="resources/polyfill.es6.min.js"></script>
<script src="resources/mathjax-tex-mml-chtml.js"></script>
<script>
// Python will inject the markdown code here.
const markdown = `{{markdown}}`
document.getElementById('content').innerHTML =
    marked.parse(markdown);
// Make images clickable
// Improved from: https://stackoverflow.com/a/50430187/7346633
const body = $('body')
$('img').addClass('clickable').click(function() {
const src = $(this).attr('src');
let modal;
function removeModal() {
modal.remove();
body.off('keyup.modal-close');
}
modal = $('<div id="modal">').css({
background: 'RGBA(0,0,0,.5) url(' + src + ') no-repeat center',
backgroundSize: $(this).hasClass('large') ? 'contain' : 'auto',
width: '100vw',
height: '100vh',
position: 'fixed',
zIndex: '100',
top: '0',
left: '0',
cursor: 'zoom-out'
}).click(function() {
removeModal();
}).appendTo('body');
// Handling keyboard shortcuts
body.on('keyup.modal-close', (e) => {
if (e.key === 'Escape') removeModal();
if (e.key === 'e') modal.removeClass('zoom')
});
body.on('keydown.modal-close', (e) => {
if (e.key === 'e') modal.addClass('zoom')
})
});
</script>
</body>
</html>

import time

import dateutil.parser

from process.twitter_process import *
from raw_collect.twitter import *
from utils import *
if __name__ == '__main__':
# conf = load_config('config.json5')
# api = tweepy_login(conf)
# print(json_stringify(api.get_user(screen_name="sauricat")._json, indent=2))
# keywords = '⚧; mtf; ftm; transgender; 药娘; 🍥; they/them'.split('; ')
#
# base_dir = './data/twitter/user'
#
# users = []
#
# # for f in ['NASAspaceplace.json']:
# for f in os.listdir(f'{base_dir}/users'):
# s = read(f'{base_dir}/users/{f}')
# j = json.loads(s)
# s = ''.join(j[k] for k in ['name', 'description'])
# if any(k in s.lower() for k in keywords):
# # print([k in s.lower() for k in keywords])
# print(f)
# users.append((j['screen_name'], j['name'], j['description'], j['followers_count']))
#
# write('trans.json', json_stringify(users, 2))
# print(len(users))
# time.sleep(5)
# print(get_user_popularity_ranking('danieltosh'))
# for f in os.listdir(f'{USER_DIR}/users'):
# os.rename(f, f.lower())
# combine_tweets_for_sample(['abc', 'wsj'], 'test')
start = time.time()
for i in range(1000000):
dateutil.parser.isoparse('2020-01-01T01:01:01')
print(f'dateutil.parser.isoparse took {time.time() - start:.2f} seconds')
start = time.time()
for i in range(1000000):
    parse_date_time('2020-01-01T01:01:01')
print(f'parse_date_time took {time.time() - start:.2f} seconds')
"""This module contains useful functions and classes, including:
- debug messages
- file I/O
- statistics functions, removing outliers and averaging values over a period
- date-related functions
- classes for configs, reports, statistics, and JSON"""
import dataclasses
import inspect
import json
import os
import statistics
from dataclasses import dataclass
from datetime import datetime, date, timedelta
from pathlib import Path
from typing import Union, NamedTuple, Any, Generator
import json5
import numpy as np
from tabulate import tabulate
from constants import REPORT_DIR, DEBUG
@dataclass
class Config:
"""
Secrets configuration for this program.
Attributes:
- consumer_key: The consumer key from the Twitter application portal
- consumer_secret: The consumer secret from the Twitter application portal
- access_token: The access token of an app from the Twitter application portal
- access_secret: The access secret of an app from the Twitter application portal
Representation Invariants:
- self.consumer_key != ''
- self.consumer_secret != ''
- self.access_token != ''
- self.access_secret != ''
"""
# Twitter's official API v1 keys
consumer_key: str
consumer_secret: str
access_token: str
access_secret: str
def load_config(path: str = 'config.json5') -> Config:
"""
Load config using JSON5, from either the local file ~/config.json5 or from the environment variable named config.
:param path: Path of the config file (Default: config.json5)
:return: Config object
"""
if os.path.isfile(path):
with open(path, 'r', encoding='utf-8') as f:
conf = json5.load(f)
else:
conf = json5.loads(os.getenv('config'))
return Config(**conf)
def debug(msg: object) -> None:
"""
Output a debug message, usually from another function
:param msg: Message
"""
if DEBUG:
caller = inspect.stack()[1].function
print(f'[DEBUG] {caller}: {msg}')
def calculate_rate_delay(rate_limit: float) -> float:
"""
Calculate the rate delay for each request given rate limit in request per minute
:param rate_limit: Rate limit in requests per minute
:return: Rate delay in seconds per request
"""
return 1 / rate_limit * 60
def write(file: str, text: str) -> None:
"""
Write text to a file
:param file: File path (will be converted to lowercase)
:param text: Text
:return: None
"""
file = file.lower().replace('\\', '/')
if '/' in file:
Path(file).parent.mkdir(parents=True, exist_ok=True)
with open(file, 'w', encoding='utf-8') as f:
f.write(text)
def read(file: str) -> str:
"""
Read file content
:param file: File path (will be converted to lowercase)
    :return: File content
"""
with open(file.lower(), 'r', encoding='utf-8') as f:
return f.read()
class Reporter:
"""
Report file creator
Attributes:
- report: The string of the report
- file: Where the report is stored
"""
report: str
file: str
def __init__(self, file: str) -> None:
self.report = ''
self.file = os.path.join(REPORT_DIR, file)
def print(self, line: str = '', arg: Any = None, autosave: bool = True) -> None:
"""
Add a line to the report
:param line: Line content
:param arg: Additional argument
:param autosave: Save automatically
        :return: None
"""
self.report += line
if arg is not None:
self.report += str(arg)
self.report += '\n'
if autosave:
self.save()
def save(self) -> None:
write(self.file, self.report)
def table(self, table: list[list[str]], headers: list[str], header_code: bool = False) -> None:
"""
Report a table
:param table: Table data
:param headers: Headers
:param header_code: Whether the headers should be code-formatted
:return: None
"""
if header_code:
headers = [f'`{s}`' for s in headers]
self.print(tabulate(table, headers, tablefmt='github'))
def remove_outliers(points: list[float], z_threshold: float = 3.5) -> list[float]:
"""
Create list with outliers removed for graphing
Credit to: https://stackoverflow.com/a/11886564/7346633
:param points: Input points list
:param z_threshold: Z threshold for identifying whether or not a point is an outlier
:return: List with outliers removed
"""
x = np.array(points)
if len(x.shape) == 1:
x = x[:, None]
median = np.median(x, axis=0)
diff = np.sum((x - median) ** 2, axis=-1)
diff = np.sqrt(diff)
med_abs_deviation = np.median(diff)
modified_z_score = 0.6745 * diff / med_abs_deviation
is_outlier = modified_z_score > z_threshold
return [points[v] for v in range(len(x)) if not is_outlier[v]]
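As a sanity check, the modified z-score rule above can be reproduced in isolation (`mad_outlier_mask` is a hypothetical helper for illustration, not part of this module):

```python
import numpy as np

def mad_outlier_mask(points: list[float], z_threshold: float = 3.5) -> np.ndarray:
    """Flag points whose modified z-score (0.6745 * |x - median| / MAD) exceeds the threshold."""
    x = np.asarray(points, dtype=float)
    diff = np.abs(x - np.median(x))
    mad = np.median(diff)  # median absolute deviation
    return 0.6745 * diff / mad > z_threshold

points = [1.0, 2.0, 3.0, 100.0]
kept = [p for p, out in zip(points, mad_outlier_mask(points)) if not out]
# 100.0 is flagged as an outlier; the rest survive
```

With `z_threshold = 3.5` this matches the recommendation of Iglewicz and Hoaglin cited above.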
@dataclass()
class Stats:
"""
Data class storing the statistics of a sample
Attributes:
- mean: The average of the sample
- stddev: The standard deviation
- median: The median value of the sample, or the 50th percentile
- iqr: The interquartile-range (75th percentile - 25th percentile)
- q25: The first quartile, or the 25th percentile
- q75: The third quartile, or the 75th percentile
"""
mean: float
stddev: float
median: float
iqr: float
q25: float
q75: float
def get_statistics(points: list[float]) -> Stats:
"""
Calculate statistics for a set of points
:param points: Input points
:return: Statistics
"""
q75, q25 = np.percentile(points, [75, 25])
iqr = q75 - q25
return Stats(statistics.mean(points), statistics.stdev(points), statistics.median(points),
iqr, q25, q75)
def tabulate_stats(stats: list[Stats], percent: bool = False) -> list[list[str]]:
"""
Create a table structure from statistics for tabulate
:param stats: Statistics
:param percent: Whether the numbers are percentages
:return: Table for tabulate
"""
def num(n: float) -> str:
return f'{n:.2f}' if not percent else f'{n * 100:.1f}%'
return [['Mean'] + [num(s.mean) for s in stats],
['StdDev'] + [num(s.stddev) for s in stats],
['Median'] + [num(s.median) for s in stats],
['IQR'] + [num(s.iqr) for s in stats],
['Q1 (25%)'] + [num(s.q25) for s in stats],
['Q3 (75%)'] + [num(s.q75) for s in stats],
]
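For a concrete sense of the quartile arithmetic behind these tables, here is a minimal computation on a tiny sample (using numpy's default linear interpolation):

```python
import numpy as np

# Quartiles of a 4-point sample; with linear interpolation the 25th
# percentile lands 0.75 of the way between 1 and 2, i.e. 1.75.
sample = [1.0, 2.0, 3.0, 4.0]
q75, q25 = np.percentile(sample, [75, 25])
iqr = q75 - q25
```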
def parse_date_time(iso: str) -> datetime:
"""
    Parse dates faster. Over 1,000,000 trials, this function is 4.03 times faster than
    dateutil.parser.isoparse().
Preconditions:
- iso is the output of datetime.isoformat() (In a format like "2021-10-20T23:50:14")
- iso is a valid date (this function does not check for the validity of the input)
:param iso: Input date
:return: Datetime object
"""
return datetime(int(iso[:4]), int(iso[5:7]), int(iso[8:10]),
int(iso[11:13]), int(iso[14:16]), int(iso[17:19]))
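The slicing approach above skips the general ISO-8601 machinery entirely; here is a standalone sketch, valid only for second-precision `datetime.isoformat()`-style strings:

```python
from datetime import datetime

def fast_isoparse(iso: str) -> datetime:
    """Parse 'YYYY-MM-DDTHH:MM:SS' by fixed-position slicing; no validation."""
    return datetime(int(iso[:4]), int(iso[5:7]), int(iso[8:10]),
                    int(iso[11:13]), int(iso[14:16]), int(iso[17:19]))

dt = fast_isoparse('2021-10-20T23:50:14')
# round-trips with datetime.isoformat() for second-precision timestamps
```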
def parse_date_only(iso: str) -> datetime:
"""
Parse date faster.
Preconditions:
- iso is in the format of "YYYY-MM-DD" (e.g. "2021-10-20")
- iso is a valid date (this function does not check for the validity of the input)
:param iso: Input date
:return: Datetime object
"""
return datetime(int(iso[:4]), int(iso[5:7]), int(iso[8:10]))
def daterange(start_date: str, end_date: str) -> Generator[tuple[str, datetime], None, None]:
"""
Date range for looping, excluding the end date
:param start_date: Start date in "YYYY-MM-DD" format
:param end_date: End date in "YYYY-MM-DD" format
:return: Generator for looping through the dates one day at a time.
"""
start = parse_date_only(start_date)
for n in range(int((parse_date_only(end_date) - start).days)):
dt = start + timedelta(n)
yield dt.strftime('%Y-%m-%d'), dt
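A standalone sketch of the same end-exclusive iteration, collected into a list (`day_strings` is a hypothetical name for illustration):

```python
from datetime import datetime, timedelta

def day_strings(start: str, end: str) -> list[str]:
    """All 'YYYY-MM-DD' strings from start (inclusive) to end (exclusive)."""
    s = datetime(int(start[:4]), int(start[5:7]), int(start[8:10]))
    e = datetime(int(end[:4]), int(end[5:7]), int(end[8:10]))
    return [(s + timedelta(days=n)).strftime('%Y-%m-%d')
            for n in range((e - s).days)]

days = day_strings('2021-01-30', '2021-02-02')
# the month boundary is crossed correctly and the end date is excluded
```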
def map_to_dates(y: dict[str, Union[int, float]], dates: list[str],
default: float = 0) -> list[float]:
"""
Takes y-axis data in the form of a mapping of date to values, and returns a list of all the
    values mapped to the dates in dates. If a date in dates isn't in y, then the default value is
    used instead.
Preconditions:
- The date in dates must be in the same format as the dates in the keys of y
:param y: Y axis data (in the format y[date] = value)
:param dates: Dates
:param default: Default data if y doesn't exist on that date
:return: A list of y data, one over each day in dates
"""
return [y[d] if d in y else default for d in dates]
def filter_days_avg(y: list[float], n: int) -> list[float]:
"""
    Filter y by taking an average over an n-day window. If n <= 1, return y without processing.
    Preconditions:
    - n <= 1 or n % 2 == 1
- len(y) > 0
:param y: Values
:param n: Number of days, must be odd
:return: Averaged data
"""
if n <= 1:
return y
    if n % 2 != 1:
        raise ValueError(f'n must be odd (you entered {n})')
# Sliding window; maintain a sum of an interval centered around i
# if the interval exceeds the beginning/end, pretend that the first/last elements are "extended"
radius = n // 2
    # initialize with the i = 0 window minus its rightmost element:
    # (radius + 1) copies of y[0] (the "extended" left edge plus the center) and y[1:radius]
    current_sum = (radius + 1) * y[0]
    for i in range(1, radius):
        current_sum += y[min(i, len(y) - 1)]  # the main loop adds y[radius] itself
ret = []
for i in range(len(y)):
l, r = i - radius, i + radius
l = max(0, l) # avoid index out of bounds by "extending" first/last element
r = min(r, len(y) - 1)
current_sum += y[r] # extend sliding window
ret.append(current_sum / n)
current_sum -= y[l] # remove old values
return ret
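The sliding window above can be cross-checked against a direct O(n·r) implementation that materializes the edge extension (`naive_window_avg` is a hypothetical reference, not part of this module):

```python
def naive_window_avg(y: list[float], n: int) -> list[float]:
    """Average over an n-wide window centered at each index,
    with the first/last values repeated past the edges."""
    if n <= 1:
        return y
    r = n // 2
    ext = [y[0]] * r + y + [y[-1]] * r  # e.g. [1, 2, 3] with r=1 -> [1, 1, 2, 3, 3]
    return [sum(ext[i:i + n]) / n for i in range(len(y))]

avg = naive_window_avg([1.0, 2.0, 3.0], 3)
# windows: [1, 1, 2], [1, 2, 3], [2, 3, 3]
```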
def divide_zeros(numerator: list[float], denominator: list[float]) -> list[float]:
"""
Divide two lists of floats, ignoring zeros (anything dividing by zero will produce zero)
Preconditions:
- len(numerator) == len(denominator)
:param numerator: Numerator
:param denominator: Denominator
:return: A list where list[i] = numerator[i] / denominator[i]
"""
output = np.zeros(len(numerator), float)
for i in range(len(numerator)):
if denominator[i] == 0:
output[i] = 0
else:
output[i] = numerator[i] / denominator[i]
    # Some type checkers flag this as the wrong return type, but that is only because
    # numpy does not fully annotate tolist(); the value is a list of floats.
    return output.tolist()
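The same zero-safe division can be written without numpy; a minimal equivalent sketch:

```python
def safe_divide(num: list[float], den: list[float]) -> list[float]:
    """Element-wise num[i] / den[i], mapping division by zero to 0.0."""
    return [a / b if b != 0 else 0.0 for a, b in zip(num, den)]

ratios = safe_divide([6.0, 1.0, 2.0], [3.0, 0.0, 1.0])
```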
class EnhancedJSONEncoder(json.JSONEncoder):
    """JSON encoder with added support for dataclasses, datetimes, and sets."""
    def default(self, o: Any) -> Any:
# Support encoding dataclasses
# https://stackoverflow.com/a/51286749/7346633
if dataclasses.is_dataclass(o):
return dataclasses.asdict(o)
# Support encoding datetime
if isinstance(o, (datetime, date)):
return o.isoformat()
# Support for sets
# https://stackoverflow.com/a/8230505/7346633
if isinstance(o, set):
return list(o)
return super().default(o)
def json_stringify(obj, indent: Union[int, None] = None) -> str:
"""
Serialize json string with support for dataclasses and datetime and sets and with custom
configuration.
:param obj: Objects
:param indent: Indent size or none
:return: Json strings
"""
return json.dumps(obj, indent=indent, cls=EnhancedJSONEncoder, ensure_ascii=False)
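For one-off calls, the same fallbacks can be supplied through `json.dumps`'s `default=` hook instead of a full encoder subclass (a sketch; sets are sorted here only to make the output deterministic):

```python
import dataclasses
import json
from datetime import datetime

def fallback(o: object) -> object:
    # Mirror the encoder above: dataclasses, datetimes, and sets
    if dataclasses.is_dataclass(o):
        return dataclasses.asdict(o)
    if isinstance(o, datetime):
        return o.isoformat()
    if isinstance(o, set):
        return sorted(o)  # deterministic ordering for sets
    raise TypeError(f'not serializable: {o!r}')

s = json.dumps({'t': datetime(2021, 1, 1), 'tags': {2, 1}}, default=fallback)
```

Note that `default=` is only invoked for objects the standard encoder cannot handle.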
\documentclass[fontsize=11pt]{article}
\usepackage{amsmath}
\usepackage[utf8]{inputenc}
\usepackage[margin=0.75in]{geometry}
\title{CSC110 Project Proposal: COVID-19 Discussion Trend Analysis}
\author{Azalea Gui \& Peter Lin}
\date{Friday, November 5, 2021}
\usepackage[
backend=biber,
style=numeric,
citestyle=apa,
sorting=nyt
]{biblatex}
\addbibresource{references.bib}
\DeclareNameAlias{author}{last-first}
\begin{document}
\maketitle
\section*{Problem Description and Research Question}
\indent
We have observed that there have been increasingly more voices talking about COVID-19 since the start of the pandemic. However, with recent policy changes in many countries aiming to limit the effect of COVID-19, it is unclear how people's discussions will react. Some people might be inclined to believe that the pandemic is starting to end, so that discussing it would seem increasingly like an unnecessary effort. In contrast, others might find these policy changes controversial and want to voice their opinions on them even more. Also, even though COVID-related topics are almost always on the news, some news outlets might intentionally cover them more frequently than others. As for the people watching the news, some might find these reports interesting, while others can't help but switch channels. So, how people's interest in listening to or discussing COVID-related topics changes over time is not very clear. \textbf{Our goal is to analyze how people's interest in COVID-related topics changes and how frequently people have discussed COVID-related issues in the two years since the pandemic started.} Also, different social media platforms might induce people to view the pandemic differently. For example, we don't know whether people on open social media platforms such as Twitter, where everyone can view your posts, might be more or less inclined to post COVID-related content than people on closed social media platforms such as Instagram, WeChat, or Telegram. Also, people or news outlets with different numbers of followers or viewers might have different inclinations too. \textbf{So, we also aim to compare people's interest in posting about COVID-related topics across platforms and popularity levels.}
\section*{Dataset Description}
\indent
Our data will come from individuals' discussions on many social media or chatting platforms. Some social media platforms provide complete APIs, such as Twitter, which is why we plan to use Twitter as our primary data source. We will gather and analyze textual data from the tweets of famous or important individuals and compare data between individuals. We will also combine these data into meaningful groups to find whether any grouping traits produce meaningful differences. We will also analyze tweets from random individuals to create a broader picture that can be generalized to the platform's entire social environment. We have attached the processed data from Twitter user \texttt{voxdotcom} as an example. No data containing tweet contents will be included in our final report. Our final report will only be based on whether or not a sample is COVID-related.
Another group of data will come from significant news publishers such as New York Times or Guardian News. We plan only to use the title, publishing date, and publisher of the news reports, and we will use the title only to determine whether or not the news report is COVID-related.
We are also curious about the frequency of discussion in other countries not relying on Twitter, such as in China, where the government blocks Twitter from the internet. So, we will also analyze news and media from China. We will gather data from popular Chinese telegram channels as well.
We also plan to gather countries' confirmed case data from Johns Hopkins CSSE, because plotting discussion frequency against confirmed cases might be more meaningful than plotting it against date.
\section*{Computational Plan}
\subsection*{Data Gathering}
\indent
We plan to transform different platforms' user posting data, each with a unique format, into a platform-independent data model to store and compare. When processing social media data, we will convert platform-dependent keywords such as \texttt{favorites}, \texttt{retweets}, or \texttt{full\_text} on Twitter and \texttt{content}, \texttt{views}, or \texttt{comments} on Telegram into our unified platform-independent model with keywords such as \texttt{popularity} and \texttt{text}. We will store all processed data in \textbf{JSON} before analysis. As for the raw data from different social media platforms, we plan to gather Twitter data using the \textbf{Tweepy} library and Telegram channel data using \textbf{python-telegram-bot}. Unfortunately, there are no known libraries for WeChat Moments. We will try to obtain WeChat data through packet capture using pyshark, but that might not be successful.
For news outlet data, we plan to use \textbf{requests} to obtain raw HTML from different listing sites, extract news articles' titles, publishers, and publishing dates with \textbf{regex}, and store them using JSON. We will convert the different HTML formats from different news publishers' sites into our platform-independent news model.
We also use the \textbf{Json5} library to parse configurations and API keys of our data gathering and analysis programs.
\subsection*{Data Analysis/Visualization}
\indent
We plan to use \textbf{matplotlib} to create data images or \textbf{plotly} to create websites for data visualization. We plan to use \textbf{NumPy} for statistical calculations.
To identify whether or not an article is about COVID, we currently use a keyword search. However, a keyword search might not be accurate when COVID has become such an essential background to our society (i.e., many articles with the word COVID in them are about something else). We might experiment with training a binary classification model with \textbf{Keras} and \textbf{scikit-learn} to better classify COVID articles. We might also experiment with training autoencoders on vectorized word-occurrence data from COVID-related articles to find whether there are significant categories within COVID articles (i.e., some COVID articles might be about new COVID policies, while others might just be general updates relating to COVID, and this might be an important insight because people's interest in these different types of COVID articles might differ).
The primary type of graph we will use is the frequency histogram: an individual's or a group's frequency of mentioning COVID-related topics will be graphed against the date, from January 1, 2020, to November 1, 2021. We will experiment with group sizes and classification methods to find which variables influence the frequency and which don't. (For example, we will group individuals by popularity and compare between groups to find whether popularity impacts the frequency at which they mention COVID-related topics.) We also plan to overlay these charts to better visualize the statistical differences.
Another variant of the frequency histogram will be plotted not against the date but against a country's confirmed cases, since people's anxiety might be influenced by the growth or decline of confirmed cases. We will also graph some data using this variant to find more insights.
% \section*{References}
% Generate references automatically from references.bib.
\nocite{*}
\printbibliography
\end{document}
@misc{matplotlib, title={Overview}, url={https://matplotlib.org/stable/contents.html}, journal={Overview - Matplotlib 3.4.3 documentation}, author={Hunter, John and Droettboom , Michael and Firing, Eric and Dale, Darren}, year={2021}, month={Aug}}
@misc{plotly, title={Plotly python graphing library}, url={https://plotly.com/python/}, journal={Plotly}, publisher={Plotly Technologies Inc.}, author={Plotly}, year={2015}}
@misc{json5, title={JSON5}, url={https://pypi.org/project/json5/}, journal={PyPI}, author={Pranke, Dirke}, year={2021}}
@misc{tweepy, title={Tweepy documentation}, url={https://docs.tweepy.org/en/stable/}, journal={Tweepy Documentation - tweepy 4.3.0 documentation}, author={Roesslein, Joshua}, year={2021}}
@misc{numpy, title={NumPy v1.21 manual}, url={https://numpy.org/doc/stable/}, journal={Overview - NumPy v1.21 Manual}, author={Numpy}, year={2021}}
@misc{telegram, title={Welcome to python telegram bot's documentation!}, url={https://python-telegram-bot.readthedocs.io/en/stable/}, journal={python}, author={Toledo, Leandro}, year={2021}}
@misc{keras,
title = {Keras API Reference},
journal = {Keras API Reference},
url = {https://keras.io/api/},
author = {Keras},
year = {2015}
}
@misc{sklearn, title={scikit-learn 1.0.1 documentation}, url={https://scikit-learn.org/stable/modules/classes.html}, journal={API Reference - scikit-learn 1.0.1 documentation}, author={scikit-learn}, year={2010}}
\documentclass{article}
\usepackage{amsmath}
\usepackage[utf8]{inputenc}
\usepackage[margin=0.75in]{geometry}
\usepackage{hyperref}
\title{CSC110 Project: COVID-19 Discussion Trend Analysis}
\author{Azalea Gui \& Peter Lin}
\date{December 5, 2021}
\usepackage[
backend=biber,
style=numeric,
citestyle=apa,
sorting=nyt
]{biblatex}
\addbibresource{references.bib}
\DeclareNameAlias{author}{last-first}
\newcommand{\C}{\texttt}
\begin{document}
\maketitle
\section{Problem Description and Research Question}
\indent
We have observed that there have been increasingly more voices talking about COVID-19 since the start of the pandemic. However, different groups of people might view the importance of discussing the pandemic differently. For example, we don't know whether the most popular people on Twitter are more or less inclined to post COVID-related content than the average Twitter user. Also, while some audiences find this content interesting, others quickly scroll through it. \textbf{So, we aim to compare people's interest in posting coronavirus content, and the audience's interest in viewing it, between different groups.} Also, with recent developments and policy changes toward COVID-19, it is unclear how people's discussions will react. Some people might believe that the pandemic is starting to end, so that discussing it would seem increasingly like an unnecessary effort, while others might find these policy changes controversial and want to voice their opinions even more. Also, even though COVID-related topics are almost always on the news, some news outlets might intentionally cover them more frequently than others. As for the people watching the news, some might find these reports interesting, while others can't help but switch channels. So, how people's interest in listening to or discussing COVID-related topics changes over time is not very clear. \textbf{Our second goal is to analyze how people's interest in COVID-related topics changes and how frequently people have discussed COVID-related issues in the two years since the pandemic started.}
\section{Dataset Used}
\indent
\begin{itemize}
\item[1.] A wide range of Twitter users: We used Twitter's get-friends-list API \href{https://developer.twitter.com/en/docs/twitter-api/v1/accounts-and-users/follow-search-get-users/api-reference/get-friends-list}{(documentation)} and the follows-chaining technique to obtain a wide range of Twitter users. This technique is explained in the Computational Overview section. Due to rate limiting, we ran the program for one day and obtained 224,619 users (852.3 MB decompressed). However, only the username, popularity, post count, and language data are used, and the processed (filtered) user dataset \C{data/twitter/user/processed/users.json} is only 7.9 MB in total.
\item[2.] All tweets from sampled users: We selected two samples of 500 users each (the sampling method is explained in the Computational Overview section), and we used the user-timeline API \href{https://developer.twitter.com/en/docs/twitter-api/v1/tweets/timelines/api-reference/get-statuses-user_timeline}{(documentation)} to obtain all of their tweets. Due to rate limiting, the program took around 16 hours to finish, and we obtained 6.07 GB of raw data (uncompressed). During processing, we reduced the data for each tweet to only its date, popularity (likes + retweets), whether it is a retweet, and whether it is COVID-related. The text of the tweets is not retained, and the processed data directory \C{data/twitter/user-tweets/processed} is only 107.9 MB in total.
\item[3.] Top 100 news Twitter accounts by Bremmen \href{https://memeburn.com/2010/09/the-100-most-influential-news-media-twitter-accounts/}{(link)}.
\item[4.] COVID-19 daily new cases data by the New York Times \href{https://github.com/nytimes/covid-19-data}{(link)}.
\end{itemize}
\section{Computational Overview}
\subsection*{Data Gathering}
\indent
Since Twitter limits the request rate of the friends-list API endpoint to 1 request ($\le 200$ users) per minute, we ran the program continuously for one day to gather this data.
We plan to transform different platforms' user posting data, each with a unique format, into a platform-independent data model to store and compare. When processing social media data, we will convert platform-dependent keywords such as \texttt{favorites}, \texttt{retweets}, or \texttt{full\_text} on Twitter and \texttt{content}, \texttt{views}, or \texttt{comments} on Telegram into our unified platform-independent model with keywords such as \texttt{popularity} and \texttt{text}. We will store all processed data in \textbf{JSON} before analysis. As for the raw data from different social media platforms, we plan to gather Twitter data using the \textbf{Tweepy} library and Telegram channel data using \textbf{python-telegram-bot}. Unfortunately, there are no known libraries for WeChat Moments. We will try to obtain WeChat data through packet capture using pyshark, but that might not be successful.
For news outlet data, we plan to use \textbf{requests} to obtain raw HTML from different listing sites, extract news articles' titles, publishers, and publishing dates with \textbf{regex}, and store them using JSON. We will convert the different HTML formats from different news publishers' sites into our platform-independent news model.
We also use the \textbf{Json5} library to parse configurations and API keys of our data gathering and analysis programs.
\subsection*{Data Analysis/Visualization}
\indent
We plan to use \textbf{matplotlib} to create data images or \textbf{plotly} to create interactive websites for data visualization, and \textbf{NumPy} for statistical calculations.
To identify whether an article is about COVID, we currently use a keyword search. However, a keyword search might not be accurate now that COVID has become such a pervasive backdrop to our society (i.e. many articles containing the word COVID are actually about something else). We might experiment with training a binary classification model with \textbf{Keras} and \textbf{scikit-learn} to better classify COVID articles. We might also experiment with training autoencoders on vectorized word-occurrence data from COVID-related articles to find whether there are significant categories within COVID articles (i.e. some COVID articles might be about new COVID policies, while others might just be general updates relating to COVID; this could be an important insight because people's interest in these different types of COVID articles might differ).
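The keyword search mentioned above amounts to something like the following; the keyword set here is a trimmed stand-in for the full list used in the program:

```python
# Trimmed stand-in for the full keyword list used in the real program.
COVID_KEYWORDS = {'covid', 'coronavirus', 'pandemic', 'quarantine', 'vaccine'}

def is_covid_related(text: str) -> bool:
    """Return whether any COVID keyword occurs in the text,
    case-insensitively (a simple substring check)."""
    lowered = text.lower()
    return any(kw in lowered for kw in COVID_KEYWORDS)
```

The weakness is exactly the one described above: a substring check counts any passing mention as a COVID article, which is what a learned classifier might improve on.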
The primary type of graph we will use is the frequency histogram: an individual's or a group's frequency of mentioning COVID-related topics will be plotted against the date, from January 1, 2020 to November 1, 2021. We will experiment with group sizes and classification methods to find which variables influence the frequency and which don't. (For example, we will group individuals by popularity and compare between groups to find whether popularity impacts how frequently they mention COVID-related topics.) We also plan to overlay these charts to better visualize the statistical differences.
Another variant of the frequency histogram will be plotted not against the date but against the country's confirmed cases, since people's anxiety might be influenced by the rise or fall of confirmed cases. We will also graph some data using this variant to find more insights.
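The statistic behind these histograms, the fraction of a group's posts per day that are COVID-related, can be computed as sketched below before being handed to matplotlib; the (date, flag) input shape is an assumption matching the reduced per-tweet data described earlier:

```python
from collections import defaultdict

def covid_frequency_by_day(posts):
    """Given posts as (date, is_covid) pairs, return a dict mapping each
    date to the fraction of that day's posts that are COVID-related."""
    total = defaultdict(int)
    covid = defaultdict(int)
    for date, is_covid in posts:
        total[date] += 1
        if is_covid:
            covid[date] += 1
    return {d: covid[d] / total[d] for d in total}
```

The same function works for the second variant: re-keying the result by that day's confirmed-case count instead of by date gives the frequency-versus-cases plot.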
\section{Running Instructions}
\indent
TODO
\section{Changes to Proposal}
\indent
First, we originally planned to also include news reports from separate journal websites in our analysis. However, when gathering the data, we found that there is no way to identify the popularity of a news report published on a journal website. So, we decided to gather the tweets of news accounts on Twitter instead, which also has the benefit of using the same data gathering and analysis process for every news channel.
Second, we originally planned to compare people's interest in posting COVID-related topics across different platforms, because we thought Chinese people don't rely on Twitter as much since Twitter is blocked in China. However, there isn't any publicly available WeChat API that we can use for analysis, and WeChat is also more private, with access to someone's postings limited to their friends (it is as if everyone on Twitter had a locked account). Therefore, it is impractical to gather data from WeChat. As for Telegram channels, posts do not have a like feature and might not have a comment feature unless the channel host specifically sets one up using a third-party bot, so there isn't a reliable way to obtain popularity data on Telegram either. Instead of comparing between platforms, we therefore compared different groups of people on the Twitter platform.
\section{Discussion}
\indent
TODO
\end{document}