From fa0583007a1ddcbc827983ca5d18187eaded6435 Mon Sep 17 00:00:00 2001
From: Hykilpikonna <me@hydev.org>
Date: Mon, 13 Dec 2021 17:08:18 -0500
Subject: [PATCH 01/11] [+] Module docstrings and preconditions

---
 src/collect_others.py |  3 +++
 src/constants.py      | 10 +++++++---
 src/main.py           |  4 ++++
 src/processing.py     |  4 ++--
 src/report.py         |  3 +++
 src/utils.py          | 26 +++++++++++++++++++++++---
 src/visualization.py  |  2 +-
 7 files changed, 43 insertions(+), 9 deletions(-)

diff --git a/src/collect_others.py b/src/collect_others.py
index 14600be..3af7d8d 100644
--- a/src/collect_others.py
+++ b/src/collect_others.py
@@ -1,3 +1,6 @@
+"""
+This module uses web requests to collect and process other data we are using in our analysis.
+"""
 from dataclasses import dataclass
 
 import requests
diff --git a/src/constants.py b/src/constants.py
index 757e82a..cef8dcb 100644
--- a/src/constants.py
+++ b/src/constants.py
@@ -1,6 +1,10 @@
-# Constants (The instructors said that we can use global constants here:
-# https://piazza.com/class/ksovzjrlsye72f?cid=1664
-# They should not end with "/"
+"""
+This module stores the global constants used in our project.
+
+Instructors said that we can use global constants: https://piazza.com/class/ksovzjrlsye72f?cid=1664
+"""
+
+# Paths, should not end with "/"
 DATA_DIR = '../data'
 TWEETS_DIR = f'{DATA_DIR}/twitter/user-tweets'
 USER_DIR = f'{DATA_DIR}/twitter/user'
diff --git a/src/main.py b/src/main.py
index 4bb6b77..db52016 100644
--- a/src/main.py
+++ b/src/main.py
@@ -1,3 +1,7 @@
+"""
+This module is the main module of our program which runs different functions in different modules
+by steps.
+"""
 from visualization import *
 from collect_twitter import *
 from report import serve_report
diff --git a/src/processing.py b/src/processing.py
index 70eaaea..cd16ad1 100644
--- a/src/processing.py
+++ b/src/processing.py
@@ -162,7 +162,7 @@ def get_english_news_channels() -> list[str]:
 
     Run this after download_all_tweets(api, 'TwitterNews')
 
-    Precondition:
+    Preconditions:
       - <tweets_dir>/user/TwitterNews.json exists.
 
     :return: A list of news channel screen names
@@ -193,7 +193,7 @@ def filter_news_channels() -> None:
     """
     Filter out news channels that don't exist anymore or have been banned by Twitter.
 
-    Precondition:
+    Preconditions:
       - Run this after downloading all tweets from the news channels in Step 2.3 in main.
 
     :return: None
diff --git a/src/report.py b/src/report.py
index 072d279..1d37fbc 100644
--- a/src/report.py
+++ b/src/report.py
@@ -1,3 +1,6 @@
+"""
+This module generates report HTML and serves it in an HTTP server.
+"""
 import json
 import os.path
 import shutil
diff --git a/src/utils.py b/src/utils.py
index 4ee4bec..5b21fc3 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -3,7 +3,8 @@
 - file I/O
 - statistics functions, removing outliers and averaging values over a period
 - date-related functions
-- classes for configs, reports, statistics, and JSON"""
+- classes for configs, reports, statistics, and JSON
+"""
 
 import dataclasses
 import inspect
@@ -87,6 +88,9 @@ def write(file: str, text: str) -> None:
     """
     Write text to a file
 
+    Preconditions:
+      - file != ''
+
     :param file: File path (will be converted to lowercase)
     :param text: Text
     :return: None
@@ -104,6 +108,9 @@ def read(file: str) -> str:
     """
     Read file content
 
+    Preconditions:
+      - file != ''
+
     :param file: File path (will be converted to lowercase)
     :return: None
     """
@@ -168,6 +175,9 @@ def remove_outliers(points: list[float], z_threshold: float = 3.5) -> list[float
 
     Credit to: https://stackoverflow.com/a/11886564/7346633
 
+    Preconditions:
+      - len(points) > 0
+
     :param points: Input points list
     :param z_threshold: Z threshold for identifying whether or not a point is an outlier
     :return: List with outliers removed
@@ -212,6 +222,9 @@ def get_statistics(points: list[float]) -> Stats:
     """
     Calculate statistics for a set of points
 
+    Preconditions:
+      - len(points) > 0
+
     :param points: Input points
     :return: Statistics
     """
@@ -262,7 +275,7 @@ def parse_date_only(iso: str) -> datetime:
     Parse date faster.
 
     Preconditions:
-      - iso is in the format of "YYYY-MM-DD" (e.g. "2021-10-20")
+      - iso starts with the format of "YYYY-MM-DD" (e.g. "2021-10-20" or "2021-10-20T10:04:14")
       - iso is a valid date (this function does not check for the validity of the input)
 
     :param iso: Input date
@@ -275,6 +288,10 @@ def daterange(start_date: str, end_date: str) -> Generator[tuple[str, datetime],
     """
     Date range for looping, excluding the end date
 
+    Preconditions:
+      - start_date starts with the "YYYY-MM-DD" format
+      - end_date starts with the "YYYY-MM-DD" format
+
     :param start_date: Start date in "YYYY-MM-DD" format
     :param end_date: End date in "YYYY-MM-DD" format
     :return: Generator for looping through the dates one day at a time.
@@ -307,7 +324,7 @@ def filter_days_avg(y: list[float], n: int) -> list[float]:
     """
     Filter y by taking an average over a n-days window. If n = 0, then return y without processing.
 
-    Precondition:
+    Preconditions:
       - n % 2 == 1
       - len(y) > 0
 
@@ -391,6 +408,9 @@ def json_stringify(obj, indent: Union[int, None] = None) -> str:
     Serialize json string with support for dataclasses and datetime and sets and with custom
     configuration.
 
+    Preconditions:
+      - obj != None
+
     :param obj: Objects
     :param indent: Indent size or none
     :return: Json strings
diff --git a/src/visualization.py b/src/visualization.py
index 426f48c..5e394b9 100644
--- a/src/visualization.py
+++ b/src/visualization.py
@@ -87,7 +87,7 @@ class Sample:
         To prevent divide-by-zero, we ignored everyone who didn't post about covid and who didn't
         post at all.
 
-        Precondition:
+        Preconditions:
           - Downloaded tweets data are sorted by date
         """
         debug(f'Calculating sample tweets data for {self.name}...')

From 8e2550097f0f4c04591582196865e1ee6b2c178d Mon Sep 17 00:00:00 2001
From: Hykilpikonna <me@hydev.org>
Date: Mon, 13 Dec 2021 17:12:06 -0500
Subject: [PATCH 02/11] [O] Fix precondition indent

---
 src/processing.py    |  4 +--
 src/utils.py         | 62 ++++++++++++++++++++++----------------------
 src/visualization.py | 15 +++--------
 3 files changed, 36 insertions(+), 45 deletions(-)

diff --git a/src/processing.py b/src/processing.py
index cd16ad1..288010d 100644
--- a/src/processing.py
+++ b/src/processing.py
@@ -163,7 +163,7 @@ def get_english_news_channels() -> list[str]:
     Run this after download_all_tweets(api, 'TwitterNews')
 
     Preconditions:
-      - <tweets_dir>/user/TwitterNews.json exists.
+        - <tweets_dir>/user/TwitterNews.json exists.
 
     :return: A list of news channel screen names
     """
@@ -194,7 +194,7 @@ def filter_news_channels() -> None:
     Filter out news channels that don't exist anymore or have been banned by Twitter.
 
     Preconditions:
-      - Run this after downloading all tweets from the news channels in Step 2.3 in main.
+        - Run this after downloading all tweets from the news channels in Step 2.3 in main.
 
     :return: None
     """
diff --git a/src/utils.py b/src/utils.py
index 5b21fc3..29f0b05 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -29,16 +29,16 @@ class Config:
     Secrets configuration for this program.
 
     Attributes:
-      - consumer_key: The consumer key from the Twitter application portal
-      - consumer_secret: The consumer secret from the Twitter application portal
-      - access_token: The access token of an app from the Twitter application portal
-      - access_secret: The access secret of an app from the Twitter application portal
+        - consumer_key: The consumer key from the Twitter application portal
+        - consumer_secret: The consumer secret from the Twitter application portal
+        - access_token: The access token of an app from the Twitter application portal
+        - access_secret: The access secret of an app from the Twitter application portal
 
     Representation Invariants:
-      - self.consumer_key != ''
-      - self.consumer_secret != ''
-      - self.access_token != ''
-      - self.access_secret != ''
+        - self.consumer_key != ''
+        - self.consumer_secret != ''
+        - self.access_token != ''
+        - self.access_secret != ''
     """
     # Twitter's official API v1 keys
     consumer_key: str
@@ -89,7 +89,7 @@ def write(file: str, text: str) -> None:
     Write text to a file
 
     Preconditions:
-      - file != ''
+        - file != ''
 
     :param file: File path (will be converted to lowercase)
     :param text: Text
@@ -109,7 +109,7 @@ def read(file: str) -> str:
     Read file content
 
     Preconditions:
-      - file != ''
+        - file != ''
 
     :param file: File path (will be converted to lowercase)
     :return: None
@@ -123,8 +123,8 @@ class Reporter:
     Report file creator
 
     Attributes:
-      - report: The string of the report
-      - file: Where the report is stored
+        - report: The string of the report
+        - file: Where the report is stored
 
     Representation Invariants:
         - self.file != ''
@@ -176,7 +176,7 @@ def remove_outliers(points: list[float], z_threshold: float = 3.5) -> list[float
     Credit to: https://stackoverflow.com/a/11886564/7346633
 
     Preconditions:
-      - len(points) > 0
+        - len(points) > 0
 
     :param points: Input points list
     :param z_threshold: Z threshold for identifying whether or not a point is an outlier
@@ -203,12 +203,12 @@ class Stats:
     Data class storing the statistics of a sample
 
     Attributes:
-      - mean: The average of the sample
-      - stddev: The standard deviation
-      - median: The median value of the sample, or the 50th percentile
-      - iqr: The interquartile-range (75th percentile - 25th percentile)
-      - q25: The first quartile, or the 25th percentile
-      - q75: The third quartile, or the 75th percentile
+        - mean: The average of the sample
+        - stddev: The standard deviation
+        - median: The median value of the sample, or the 50th percentile
+        - iqr: The interquartile-range (75th percentile - 25th percentile)
+        - q25: The first quartile, or the 25th percentile
+        - q75: The third quartile, or the 75th percentile
     """
     mean: float
     stddev: float
@@ -223,7 +223,7 @@ def get_statistics(points: list[float]) -> Stats:
     Calculate statistics for a set of points
 
     Preconditions:
-      - len(points) > 0
+        - len(points) > 0
 
     :param points: Input points
     :return: Statistics
@@ -260,8 +260,8 @@ def parse_date_time(iso: str) -> datetime:
     python's built-in dateutil.parser.isoparse() function.
 
     Preconditions:
-      - iso is the output of datetime.isoformat() (In a format like "2021-10-20T23:50:14")
-      - iso is a valid date (this function does not check for the validity of the input)
+        - iso is the output of datetime.isoformat() (In a format like "2021-10-20T23:50:14")
+        - iso is a valid date (this function does not check for the validity of the input)
 
     :param iso: Input date
     :return: Datetime object
@@ -275,8 +275,8 @@ def parse_date_only(iso: str) -> datetime:
     Parse date faster.
 
     Preconditions:
-      - iso starts with the format of "YYYY-MM-DD" (e.g. "2021-10-20" or "2021-10-20T10:04:14")
-      - iso is a valid date (this function does not check for the validity of the input)
+        - iso starts with the format of "YYYY-MM-DD" (e.g. "2021-10-20" or "2021-10-20T10:04:14")
+        - iso is a valid date (this function does not check for the validity of the input)
 
     :param iso: Input date
     :return: Datetime object
@@ -289,8 +289,8 @@ def daterange(start_date: str, end_date: str) -> Generator[tuple[str, datetime],
     Date range for looping, excluding the end date
 
     Preconditions:
-      - start_date starts with the "YYYY-MM-DD" format
-      - end_date starts with the "YYYY-MM-DD" format
+        - start_date starts with the "YYYY-MM-DD" format
+        - end_date starts with the "YYYY-MM-DD" format
 
     :param start_date: Start date in "YYYY-MM-DD" format
     :param end_date: End date in "YYYY-MM-DD" format
@@ -310,7 +310,7 @@ def map_to_dates(y: dict[str, Union[int, float]], dates: list[str],
     used instead.
 
     Preconditions:
-      - The date in dates must be in the same format as the dates in the keys of y
+        - The date in dates must be in the same format as the dates in the keys of y
 
     :param y: Y axis data (in the format y[date] = value)
     :param dates: Dates
@@ -325,8 +325,8 @@ def filter_days_avg(y: list[float], n: int) -> list[float]:
     Filter y by taking an average over a n-days window. If n = 0, then return y without processing.
 
     Preconditions:
-      - n % 2 == 1
-      - len(y) > 0
+        - n % 2 == 1
+        - len(y) > 0
 
     :param y: Values
     :param n: Number of days, must be odd
@@ -362,7 +362,7 @@ def divide_zeros(numerator: list[float], denominator: list[float]) -> list[float
     Divide two lists of floats, ignoring zeros (anything dividing by zero will produce zero)
 
     Preconditions:
-      - len(numerator) == len(denominator)
+        - len(numerator) == len(denominator)
 
     :param numerator: Numerator
     :param denominator: Denominator
@@ -409,7 +409,7 @@ def json_stringify(obj, indent: Union[int, None] = None) -> str:
     configuration.
 
     Preconditions:
-      - obj != None
+        - obj is not None
 
     :param obj: Objects
     :param indent: Indent size or none
diff --git a/src/visualization.py b/src/visualization.py
index 5e394b9..69419d5 100644
--- a/src/visualization.py
+++ b/src/visualization.py
@@ -510,6 +510,9 @@ def report_change_graphs(sample: Sample) -> None:
 def report_all() -> None:
     """
     Generate all reports
+    
+    Preconditions:
+        - Report has been 
     """
     graph_load_font()
 
@@ -536,15 +539,3 @@ def report_all() -> None:
     graph_line_plot(samples[0].dates, [s.date_freqs for s in samples], 'change/comb/freq.png',
                     'COVID-posting frequency over time for all samples - IIR(10)', True, 10,
                     labels=[s.name for s in samples])
-
-
-if __name__ == '__main__':
-    report_all()
-    # samples = load_user_sample()
-    # combine_tweets_for_sample([u.username for u in samples.most_popular], '500-pop')
-    # combine_tweets_for_sample([u.username for u in samples.random], '500-rand')
-    # combine_tweets_for_sample(samples.english_news, 'eng-news')
-
-    # tweets = load_combined_tweets('500-pop')
-    # print(len(tweets))
-    # view_covid_tweets_date(tweets)

From 87eaa28794a7040fcf68f2b0bcf382f9d65f0c4a Mon Sep 17 00:00:00 2001
From: Hykilpikonna <me@hydev.org>
Date: Mon, 13 Dec 2021 17:20:14 -0500
Subject: [PATCH 03/11] [O] Reformat code, expand imports

---
 src/collect_others.py  |  7 +++++--
 src/collect_twitter.py | 21 ++++++---------------
 src/main.py            | 13 +++++--------
 src/processing.py      |  9 ++++++---
 src/report.py          |  1 +
 src/utils.py           | 10 +++++++---
 src/visualization.py   | 26 +++++++++++++++++---------
 7 files changed, 47 insertions(+), 40 deletions(-)

diff --git a/src/collect_others.py b/src/collect_others.py
index 3af7d8d..e40d439 100644
--- a/src/collect_others.py
+++ b/src/collect_others.py
@@ -1,6 +1,7 @@
 """
 This module uses web requests to collect and process other data we are using in our analysis.
 """
+
 from dataclasses import dataclass
 
 import requests
@@ -12,12 +13,14 @@ class CasesData:
     A dataclass that stores a mapping of date to cases on that day and a mapping of date to deaths
     on that day.
 
+    Attributes:
+        - cases: cases[date in "YYYY-MM-DD"] = 7-day average of cases around that date
+        - deaths: deaths[date in "YYYY-MM-DD"] = 7-day average of deaths around that date
+
     Representation Invariants:
         - all(x >= 0 for x in self.cases.values())
         - all(x >= 0 for x in self.deaths.values())
-
     """
-    # cases[date in "YYYY-MM-DD"] = 7-day average of cases around that date
     cases: dict[str, float]
     deaths: dict[str, float]
 
diff --git a/src/collect_twitter.py b/src/collect_twitter.py
index 1589c3b..c5385b3 100644
--- a/src/collect_twitter.py
+++ b/src/collect_twitter.py
@@ -4,16 +4,18 @@ It contains functions related scraping users/tweets, including:
 - getting the tweets of a user
 - downloading many users by checking their followers and follower's followers, etc.
 """
+import json
 import math
+import os
 import random
 import time
-from typing import List
+from typing import List, Union
 
 import tweepy
 from tweepy import API, TooManyRequests, User, Tweet, Unauthorized, NotFound
 
 from constants import TWEETS_DIR, USER_DIR
-from utils import *
+from utils import Config, debug, calculate_rate_delay, write, json_stringify, read
 
 
 def tweepy_login(conf: Config) -> tweepy.API:
@@ -57,7 +59,8 @@ def download_all_tweets(api: API, screen_name: str,
     Twitter API Reference
     --------
     It will be using the API endpoint api.twitter.com/statuses/user_timeline (Documentation:
-    https://developer.twitter.com/en/docs/twitter-api/v1/tweets/timelines/api-reference/get-statuses-user_timeline)
+    https://developer.twitter.com/en/docs/twitter-api/v1/tweets/timelines/api-reference/get-statuses-user_timeline
+    )
     This endpoint has a rate limit of 900 requests / 15-minutes = 60 rpm for user auth, and it has a
     limit of 100,000 requests / 24 hours = 69.44 rpm independent of authentication method. To be
     safe, this function uses a rate limit of 60 rpm.
@@ -294,15 +297,3 @@ def download_users_execute(api: API, n: float,
 
         # Rate limit
         time.sleep(rate_delay)
-
-
-if __name__ == '__main__':
-    # python_ta.check_all(config={
-    #     'max-line-length': 100,
-    #     'disable': ['R1705', 'C0200', 'E9998', 'E9999']
-    # })
-
-    config = load_config('config.json5')
-    tweepy_api = tweepy_login(config)
-    # download_users_start(tweepy_api, 'sauricat')
-    download_users_resume_progress(tweepy_api)
diff --git a/src/main.py b/src/main.py
index db52016..a71ae6f 100644
--- a/src/main.py
+++ b/src/main.py
@@ -2,11 +2,12 @@
 This module is the main module of our program which runs different functions in different modules
 by steps.
 """
-from visualization import *
-from collect_twitter import *
-from report import serve_report
-from utils import *
 
+from collect_twitter import *
+from processing import *
+from report import *
+from utils import *
+from visualization import *
 
 if __name__ == '__main__':
     # Load config and create API
@@ -41,10 +42,6 @@ if __name__ == '__main__':
     # criteria as our sample, also find news channels
     # select_user_sample()
 
-    # Just curious, who are the 20 most popular individuals on twitter?
-    # print(tabulate(((u.username, u.popularity) for u in load_user_sample().most_popular[:20]),
-    #                headers=['Name', 'Followers']))
-
     #####################
     # Data collection - Step C2.1
     # (After step P2) Load the downloaded twitter users by popularity, and start downloading all
diff --git a/src/processing.py b/src/processing.py
index 288010d..cc237c9 100644
--- a/src/processing.py
+++ b/src/processing.py
@@ -2,17 +2,20 @@
 Processes data downloaded from the Twitter API. Processing consists of calculating popularity of
 users, creating samples of users, filtering news channels, and processing tweets for file storage.
 """
+import json
+import os
 import random
-from typing import NamedTuple
 from dataclasses import dataclass
+from datetime import datetime
+from pathlib import Path
+from typing import NamedTuple
 
-import dateutil.parser
 import requests
 from bs4 import BeautifulSoup
 from py7zr import SevenZipFile
 
 from constants import DATA_DIR, TWEETS_DIR, USER_DIR
-from utils import *
+from utils import read, debug, write, json_stringify
 
 
 class ProcessedUser(NamedTuple):
diff --git a/src/report.py b/src/report.py
index 1d37fbc..242244a 100644
--- a/src/report.py
+++ b/src/report.py
@@ -1,6 +1,7 @@
 """
 This module generates report HTML and serves it in an HTTP server.
 """
+
 import json
 import os.path
 import shutil
diff --git a/src/utils.py b/src/utils.py
index 29f0b05..feaeda4 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -1,4 +1,5 @@
-"""This module contains useful functions and classes, including:
+"""
+This module contains useful functions and classes, including:
 - debug messages
 - file I/O
 - statistics functions, removing outliers and averaging values over a period
@@ -14,7 +15,7 @@ import statistics
 from dataclasses import dataclass
 from datetime import datetime, date, timedelta
 from pathlib import Path
-from typing import Union, NamedTuple, Any, Generator
+from typing import Union, Any, Generator
 
 import json5
 import numpy as np
@@ -49,7 +50,8 @@ class Config:
 
 def load_config(path: str = 'config.json5') -> Config:
     """
-    Load config using JSON5, from either the local file ~/config.json5 or from the environment variable named config.
+    Load config using JSON5, from either the local file ~/config.json5 or from the environment
+    variable named config.
 
     :param path: Path of the config file (Default: config.json5)
     :return: Config object
@@ -242,6 +244,7 @@ def tabulate_stats(stats: list[Stats], percent: bool = False) -> list[list[str]]
     :param percent: Whether the numbers are percentages
     :return: Table for tabulate
     """
+
     def num(n: float) -> str:
         return f'{n:.2f}' if not percent else f'{n * 100:.1f}%'
 
@@ -384,6 +387,7 @@ class EnhancedJSONEncoder(json.JSONEncoder):
     An improvement to the json.JSONEncoder class, which supports:
     encoding for dataclasses, encoding for datetime, and sets
     """
+
     def default(self, o):
 
         # Support encoding dataclasses
diff --git a/src/visualization.py b/src/visualization.py
index 69419d5..00f71f9 100644
--- a/src/visualization.py
+++ b/src/visualization.py
@@ -1,18 +1,25 @@
 """
-This module uses matplotlib to visualize processed data as graphs. The results are stored in report directory.
+This module uses matplotlib to visualize processed data as graphs. The results are stored in
+the report directory.
 The graphs are created after processing the data, for example with filtering and removing outliers.
 """
-import os.path
-from typing import Optional
 
+import os.path
+from dataclasses import dataclass
+from datetime import datetime
+from pathlib import Path
+from typing import Optional, Union
+
+import matplotlib.dates as mdates
 import matplotlib.ticker
 import scipy.signal
 from matplotlib import pyplot as plt, font_manager
-import matplotlib.dates as mdates
 
-from constants import RES_DIR
-from processing import *
 from collect_others import get_covid_cases_us
+from constants import RES_DIR, REPORT_DIR
+from processing import load_tweets, load_user_sample
+from utils import debug, daterange, map_to_dates, filter_days_avg, Reporter, remove_outliers, \
+    tabulate_stats, get_statistics
 
 
 @dataclass()
@@ -163,7 +170,8 @@ class Sample:
             popularity.append(UserFloat(u, covid_pop_avg / all_pop_avg))
 
         # Calculate frequency on date
-        self.date_covid_freq = {d: date_covid_count[d] / date_all_count[d] for d in date_covid_count}
+        self.date_covid_freq = {d: date_covid_count[d] / date_all_count[d] for d in
+                                date_covid_count}
 
         # Sort by relative popularity or frequency
         popularity.sort(key=lambda x: x.data, reverse=True)
@@ -244,7 +252,7 @@ def load_samples() -> list[Sample]:
     keys = ['en', 'zh', 'ja']
     pop_lang = [u.lang for u in users.most_popular]
     rand_lang = [u.lang for u in users.random]
-    Reporter('sample-demographics.md')\
+    Reporter('sample-demographics.md') \
         .table([['`500-pop`'] + [str(len(pop_lang))] + [str(pop_lang.count(k)) for k in keys],
                 ['`500-rand`'] + [str(len(rand_lang))] + [str(rand_lang.count(k)) for k in keys]],
                ['Total', 'English', 'Chinese', 'Japanese'], False)
@@ -512,7 +520,7 @@ def report_all() -> None:
     Generate all reports
     
     Preconditions:
-        - Report has been 
+        - Twitter data have been downloaded and processed.
     """
     graph_load_font()
 

From 1eb28a62bb67aa1eb713aba686f1946362d59b8b Mon Sep 17 00:00:00 2001
From: Hykilpikonna <me@hydev.org>
Date: Mon, 13 Dec 2021 17:21:48 -0500
Subject: [PATCH 04/11] [O] Update dependencies and normalize requirements.txt

---
 requirements.txt | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 9dd91ba..7e0c562 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,31 +4,31 @@
 # Json5 is a human-readable json format that allows for things such as unquoted keys or comments.
 json5~=0.9.6
 # Tweepy is a python SDK for twitter
-tweepy==4.4.0
+tweepy~=4.4.0
 # requests is for getting html from a website URL
-requests==2.26.0
+requests~=2.26.0
 # beautifulsoup is used to extract data from html
-beautifulsoup4==4.10.0
+beautifulsoup4~=4.10.0
 
 #####################
 # Data Visualization
 # Print table data
-tabulate==0.8.9
+tabulate~=0.8.9
 # Draw local graphs
-matplotlib==3.5.0
+matplotlib~=3.5.1
 # Calculate data statistics
-numpy==1.21.4
+numpy~=1.21.4
 # Date utility for manipulating dates
 python-dateutil~=2.8.2
 # Scipy for transforming data. We used it for IIR filtering.
 scipy~=1.7.3
 # For serving the report website
-flask==2.0.2
+flask~=2.0.2
 
 ####################
 # Data Packing
 # 7zip packing utility for packing our processed data
-py7zr==0.16.3
+py7zr~=0.17.0
 
 #####################
 # Testing and code checking

From c1b04a741e567a23a2843dea2806a40c1a5a745f Mon Sep 17 00:00:00 2001
From: Hykilpikonna <me@hydev.org>
Date: Mon, 13 Dec 2021 17:37:00 -0500
Subject: [PATCH 05/11] [+] Class attributes

---
 src/collect_twitter.py |  1 +
 src/processing.py      | 30 +++++++++++++++++++-----------
 src/report.py          |  3 +++
 src/visualization.py   | 27 ++++++++++++++++++++-------
 4 files changed, 43 insertions(+), 18 deletions(-)

diff --git a/src/collect_twitter.py b/src/collect_twitter.py
index c5385b3..1008b3c 100644
--- a/src/collect_twitter.py
+++ b/src/collect_twitter.py
@@ -4,6 +4,7 @@ It contains functions related scraping users/tweets, including:
 - getting the tweets of a user
 - downloading many users by checking their followers and follower's followers, etc.
 """
+
 import json
 import math
 import os
diff --git a/src/processing.py b/src/processing.py
index cc237c9..799afae 100644
--- a/src/processing.py
+++ b/src/processing.py
@@ -27,14 +27,16 @@ class ProcessedUser(NamedTuple):
     example, using dataclass, the json for one UserPopularity object will be:
     {"username": "a", "popularity": 1, "num_postings": 1}, while using NamedTuple, the json will be:
     ["a", 1, 1], which saves an entire 42 bytes for each user.
+
+    Attributes:
+        - username: The Twitter user's screen name
+        - popularity: A measurement of a user's popularity, such as followers count
+        - num_postings: Number of tweets
+        - lang: Language code in Twitter's language code format
     """
-    # Username
     username: str
-    # A measurement of a user's popularity, such as followers count
     popularity: int
-    # Number of tweets
     num_postings: int
-    # Language
     lang: str
 
 
@@ -107,6 +109,11 @@ class UserSample:
     """
     This is a data class storing our different samples.
 
+    Attributes:
+        - most_popular: Our sample of the most popular users on Twitter
+        - random: Our sample of random users on Twitter
+        - english_news: Our sample of news media accounts on Twitter
+
     Representation Invariants:
         - all(news != '' for news in self.english_news)
 
@@ -224,20 +231,21 @@ def load_user_sample() -> UserSample:
 
 class Posting(NamedTuple):
     """
-    Posting data stores the processed tweets data, and it contains info such as whether or not a
-    tweet is covid-related
+    Posting data stores the processed tweets data, and it contains info such as whether a tweet is
+    covid-related
+
+    Attributes:
+        - covid_related: True if the post is determined to be covid-related
+        - popularity: A measure of tweet popularity measured by comments + likes
+        - repost: Whether the post is a repost
+        - date: Posting date and time in ISO format ("YYYY-MM-DDThh:mm:ss")
 
     Representation Invariants:
         - popularity >= 0
-
     """
-    # Full text of the post's content
     covid_related: bool
-    # Popularity of the post
     popularity: int
-    # Is it a repost
     repost: bool
-    # Date in ISO format
     date: str
 
 
diff --git a/src/report.py b/src/report.py
index 242244a..242c6a8 100644
--- a/src/report.py
+++ b/src/report.py
@@ -20,6 +20,9 @@ def generate_report() -> str:
     """
     Compile the report document and generate a markdown report
 
+    Preconditions:
+        - RES_DIR exists, and contains the necessary resources used in this project.
+
     :return: Markdown report
     """
     # Load markdown
diff --git a/src/visualization.py b/src/visualization.py
index 00f71f9..3b2c227 100644
--- a/src/visualization.py
+++ b/src/visualization.py
@@ -30,9 +30,12 @@ class UserFloat:
     This is used for both COVID tweet frequency and popularity ratio data, because both of these
     are floating point data.
 
+    Attributes:
+        - name: Twitter user's screen name
+        - data: The float data that's associated with this user
+
     Representation Invariants:
         - self.name != ''
-
     """
     name: str
     data: float
@@ -42,23 +45,33 @@ class Sample:
     """
     A sample of many users, containing statistical data that will be used in graphs.
 
+    Attributes:
+        - name: Sample name
+        - users: List of user screen names in this sample
+        - user_freqs: Total frequencies of all posts for each user across all dates (sorted)
+        - user_pops: Total popularity ratios of all posts for each user across all dates (sorted)
+        - user_all_pop_avg: user_all_pop_avg[u] = Average popularity of all of user u's posts
+        - user_date_covid_pop_avg: Average popularity of COVID tweets by a specific user on a date
+        (user_date_covid_pop_avg[user][date] = Average popularity of COVID-posts by user on date)
+        - date_covid_freq: Total COVID-tweets frequency on a specific date for all users.
+        - dates: dates[i] = The i-th day since the first tweet
+        - date_freqs: date_freqs[i] = COVID frequency of all posts from sampled users on dates[i]
+        - date_pops: date_pops[i] = Average pop-ratio of all posts from sampled users on dates[i]
+
     Representation Invariants:
         - self.name != ''
         - all(name != '' for name in self.users)
-
     """
     name: str
     users: list[str]
-    # Total frequencies of all posts for each user across all dates (sorted)
+
     user_freqs: list[UserFloat]
-    # Total popularity ratios of all posts for each user across all dates (sorted)
     user_pops: list[UserFloat]
-    # Average popularity of all u's posts
     user_all_pop_avg: dict[str, float]
-    # Average popularity of COVID tweets by a specific user on a specific date
+
     # user_covid_tweets_pop[user][date] = Average popularity of COVID-posts by {user} on {date}
     user_date_covid_pop_avg: dict[str, dict[str, float]]
-    # Total COVID-tweets frequency on a specific date for all users.
+
     date_covid_freq: dict[str, float]
     # dates[i] = The i-th day since the first tweet
     dates: list[datetime]

From c50636bf7c01625cb5771417fba3a6e1a3c366d9 Mon Sep 17 00:00:00 2001
From: Hykilpikonna <me@hydev.org>
Date: Mon, 13 Dec 2021 17:46:33 -0500
Subject: [PATCH 06/11] [F] Fix warnings

---
 src/collect_twitter.py | 10 +++++-----
 src/main.py            |  8 ++++----
 src/processing.py      | 22 +++++++++++-----------
 src/report.py          |  6 +++---
 src/utils.py           | 21 +++++++++++----------
 src/visualization.py   |  1 -
 6 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/src/collect_twitter.py b/src/collect_twitter.py
index 1008b3c..37744a0 100644
--- a/src/collect_twitter.py
+++ b/src/collect_twitter.py
@@ -68,7 +68,7 @@ def download_all_tweets(api: API, screen_name: str,
 
     :param api: Tweepy API object
     :param screen_name: Screen name of that individual
-    :param download_if_exists: Whether or not to download if it already exists (Default: False)
+    :param download_if_exists: Whether to download if it already exists (Default: False)
     :return: None
     """
     # Ensure directories exist
@@ -126,10 +126,10 @@ def download_all_tweets(api: API, screen_name: str,
 
 def download_users_start(api: API, start_point: str, n: float = math.inf) -> None:
     """
-    This function downloads n twitter users by using a friends-chain.
+    This function downloads n Twitter users by using a friends-chain.
 
-    Since there isn't an API or a database with all twitter users, we can't obtain a strict list
-    of all twitter users, nor can we obtain a list of strictly random or most popular twitter
+    Since there isn't an API or a database with all Twitter users, we can't obtain a strict list
+    of all Twitter users, nor can we obtain a list of strictly random or most popular Twitter
     users. Therefore, we use the method of follows chaining: we start from a specific individual,
     obtain their followers, and pick 6 random individuals from the friends list. Then, we repeat
     the process for the selected friends: we pick 6 random friends of the 6 random friends
@@ -149,7 +149,7 @@ def download_users_start(api: API, start_point: str, n: float = math.inf) -> Non
     https://developer.twitter.com/en/docs/twitter-api/v1/accounts-and-users/follow-search-get-users/api-reference/get-friends-list)
     This will limit the rate of requests to 15 requests in a 15-minute window, which is one request
     per minute. But it is actually the fastest method of downloading a wide range of users on
-    twitter because it can download a maximum of 200 users at a time while the API for downloading
+    Twitter because it can download a maximum of 200 users at a time while the API for downloading
     a single user is limited to only 900 queries per 15, which is only 60 users per minute.
 
     There is another API endpoint that might do the job, which is api.twitter.com/friends/ids (Doc:
diff --git a/src/main.py b/src/main.py
index a71ae6f..31a22a2 100644
--- a/src/main.py
+++ b/src/main.py
@@ -21,7 +21,7 @@ if __name__ == '__main__':
     # manually stop it when there are enough users)
     # download_users_start(api, 'voxdotcom')
 
-    # This task will run for a very very long time to obtain a large dataset of twitter users. If
+    # This task will run for a very, very long time to obtain a large dataset of Twitter users. If
     # you want to stop the process, you can resume it later using the following line:
     # download_users_resume_progress(api)
 
@@ -32,7 +32,7 @@ if __name__ == '__main__':
 
     #####################
     # Data processing - Step P1
-    # (After step C1) Process the downloaded twitter users, extract screen name, popularity, and
+    # (After step C1) Process the downloaded Twitter users, extract screen name, popularity, and
     # number of tweets data.
     # process_users()
 
@@ -44,7 +44,7 @@ if __name__ == '__main__':
 
     #####################
     # Data collection - Step C2.1
-    # (After step P2) Load the downloaded twitter users by popularity, and start downloading all
+    # (After step P2) Load the downloaded Twitter users by popularity, and start downloading all
     # tweets from 500 of the most popular users. Takes around 2 hours.
     # for u in load_user_sample().most_popular:
     #     download_all_tweets(api, u.username)
@@ -60,7 +60,7 @@ if __name__ == '__main__':
     # (After step P2) Download all tweets from the news channels we selected.
     # for u in load_user_sample().english_news:
     #     download_all_tweets(api, u)
-    # Filter out news channels that have been blocked by twitter or don't exist anymore
+    # Filter out news channels that have been blocked by Twitter or don't exist
     # filter_news_channels()
 
     #####################
diff --git a/src/processing.py b/src/processing.py
index 799afae..bc34aef 100644
--- a/src/processing.py
+++ b/src/processing.py
@@ -22,7 +22,7 @@ class ProcessedUser(NamedTuple):
     """
     User and popularity.
 
-    We use NamedTuple instead of dataclass because named tuples are easier to serialize in JSON and
+    We use NamedTuple instead of dataclass because named tuples are easier to serialize in JSON, and
     they require much less space in the stored json format because no key info is stored. For
     example, using dataclass, the json for one UserPopularity object will be:
     {"username": "a", "popularity": 1, "num_postings": 1}, while using NamedTuple, the json will be:
@@ -54,7 +54,7 @@ def process_users() -> None:
 
     # Loop through all the files
     for filename in os.listdir(f'{USER_DIR}/users'):
-        # Only check json files and ignore macos dot files
+        # Only check json files and ignore macOS dot files
         if filename.endswith('.json') and not filename.startswith('.'):
             # Read
             user = json.loads(read(f'{USER_DIR}/users/{filename}'))
@@ -190,7 +190,7 @@ def get_english_news_channels() -> list[str]:
     soup = BeautifulSoup(requests.get(url).text, 'html.parser')
     users = {h.text[1:] for h in soup.select('table tr td:nth-child(2) > a')}
 
-    # Combine two sets, ignoring case (since the ids in the 100 list are all lowercased)
+    # Combine two sets, ignoring case (since the ids in the 100 list are all lowercase)
     news_channels_lower = {n.lower() for n in news_channels}
     for u in users:
         if u not in news_channels_lower:
@@ -231,7 +231,7 @@ def load_user_sample() -> UserSample:
 
 class Posting(NamedTuple):
     """
-    Posting data stores the processed tweets data, and it contains info such as whether a tweet is
+    Posting data stores the processed tweets' data, and it contains info such as whether a tweet is
     covid-related
 
     Attributes:
@@ -251,19 +251,19 @@ class Posting(NamedTuple):
 
 def process_tweets() -> None:
     """
-    Process tweets, reduce the tweets data to only a few fields defined in the Posting class. These
-    include whether or not the tweet is covid-related, how popular is the tweet, if it is a repost,
-    and its date. The processed tweet does not contain its content.
+    Process tweets, reduce the tweets' data to only a few fields defined in the Posting class. These
+    include whether the tweet is covid-related, how popular the tweet is, if it is a repost, and its
+    date. The processed tweet does not contain its content.
 
     If a user's tweets is already processed, this function will skip over that user's data.
 
-    This function will save the processed tweets data to <tweets_dir>/processed/<username>.json
+    This function will save the processed tweets' data to <tweets_dir>/processed/<username>.json
 
     :return: None
     """
     # Loop through all the files
     for filename in os.listdir(f'{TWEETS_DIR}/user'):
-        # Only check json files and ignore macos dot files
+        # Only check json files and ignore macOS dot files
         if filename.endswith('.json') and not filename.startswith('.'):
             # Check if already processed
             if os.path.isfile(f'{TWEETS_DIR}/processed/{filename}'):
@@ -297,8 +297,8 @@ def load_tweets(username: str) -> list[Posting]:
 def is_covid_related(text: str) -> bool:
     """
     Is a tweet / article covid-related. Currently, this is done through keyword matching. Even
-    though we know that not all posts with covid-related words are covid-related posts, this is our
-    current best method of classification.
+    though we know that not all posts with covid-related words are covid-related posts, this is
+    currently our best method of classification.
 
     :param text: Text content
     :return: Whether the text is covid related
diff --git a/src/report.py b/src/report.py
index 242c6a8..1ab6d96 100644
--- a/src/report.py
+++ b/src/report.py
@@ -60,7 +60,7 @@ def generate_report() -> str:
 
         # Handle errors. (It prompts "too broad an exception clause" but I actually need to catch
         # every possible exception.)
-        except Exception as e:
+        except Exception:
             md[i] = f"<pre class=\"error\">" \
                     f"\nInvalid @include statement. \n{traceback.format_exc()}</pre>"
 
@@ -73,7 +73,7 @@ def generate_html() -> str:
 
     :return: HTML string
     """
-    # Generate markdown report and JSON encode it (which works as JS code! amazing
+    # Generate markdown report and JSON encode it (which works as JS code! amazing)
     md_json = json.dumps({'content': generate_report()})
     # Inject into HTML
     html = read(os.path.join(RES_DIR, 'report_page.html')) \
@@ -122,7 +122,7 @@ def serve_report() -> None:
     @app.route('/<path:path>')
     def res(path: str) -> Response:
         """
-        Resources endpoint. This maps report queries to the report directory
+        Resources endpoint. This function maps report queries to the report directory
 
         :param path: Path of the resource
         :return: File resource or 404
diff --git a/src/utils.py b/src/utils.py
index feaeda4..32cd931 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -181,7 +181,7 @@ def remove_outliers(points: list[float], z_threshold: float = 3.5) -> list[float
         - len(points) > 0
 
     :param points: Input points list
-    :param z_threshold: Z threshold for identifying whether or not a point is an outlier
+    :param z_threshold: Z threshold for identifying whether a point is an outlier
     :return: List with outliers removed
     """
     x = np.array(points)
@@ -296,7 +296,7 @@ def daterange(start_date: str, end_date: str) -> Generator[tuple[str, datetime],
         - end_date starts with the "YYYY-MM-DD" format
 
     :param start_date: Start date in "YYYY-MM-DD" format
-    :param end_date: End date in "YYYY-MM-DD" format
+    :param end_date: Ending date in "YYYY-MM-DD" format
     :return: Generator for looping through the dates one day at a time.
     """
     start = parse_date_only(start_date)
@@ -308,7 +308,7 @@ def daterange(start_date: str, end_date: str) -> Generator[tuple[str, datetime],
 def map_to_dates(y: dict[str, Union[int, float]], dates: list[str],
                  default: float = 0) -> list[float]:
     """
-    Takes y-axis data in the form of a mapping of date to values, and returns a list of all the
+    Takes y-axis data in the form of a mapping of dates to values, and returns a list of all the
     values mapped to the date in dates. If a date in dates isn't in y, then the default values is
     used instead.
 
@@ -325,7 +325,7 @@ def map_to_dates(y: dict[str, Union[int, float]], dates: list[str],
 
 def filter_days_avg(y: list[float], n: int) -> list[float]:
     """
-    Filter y by taking an average over a n-days window. If n = 0, then return y without processing.
+    Filter y by taking an average over an n-days window. If n = 0, then return y without processing.
 
     Preconditions:
         - n % 2 == 1
@@ -351,12 +351,12 @@ def filter_days_avg(y: list[float], n: int) -> list[float]:
 
     ret = []
     for i in range(len(y)):
-        l, r = i - radius, i + radius
-        l = max(0, l)  # avoid index out of bounds by "extending" first/last element
-        r = min(r, len(y) - 1)
-        current_sum += y[r]  # extend sliding window
+        left, right = i - radius, i + radius
+        left = max(0, left)  # avoid index out of bounds by "extending" first/last element
+        right = min(right, len(y) - 1)
+        current_sum += y[right]  # extend sliding window
         ret.append(current_sum / n)
-        current_sum -= y[l]  # remove old values
+        current_sum -= y[left]  # remove old values
     return ret
 
 
@@ -377,8 +377,9 @@ def divide_zeros(numerator: list[float], denominator: list[float]) -> list[float
             output[i] = 0
         else:
             output[i] = numerator[i] / denominator[i]
-    # This marks it as incorrect type but it's actually not incorrect type, just because numpy
+    # This marks it as incorrect type, but it's actually not incorrect type, just because numpy
     # doesn't specify its return types
+    # noinspection PyTypeChecker
     return output.tolist()
 
 
diff --git a/src/visualization.py b/src/visualization.py
index 3b2c227..9c4fb18 100644
--- a/src/visualization.py
+++ b/src/visualization.py
@@ -438,7 +438,6 @@ def graph_line_plot(x: list[datetime], y: Union[list[float], list[list[float]]],
         if freq:
             cases = get_covid_cases_us()
             c = map_to_dates(cases.cases, [d.isoformat()[:10] for d in x])
-            # c = scipy.signal.savgol_filter(c, 45, 2)
             c = filter_days_avg(c, 7)
             c = scipy.signal.lfilter([1.0 / n] * n, 1, c)
 

From 70df2ce43a95f4c6b4f3ac3d217d9896fdb4671c Mon Sep 17 00:00:00 2001
From: Hykilpikonna <me@hydev.org>
Date: Mon, 13 Dec 2021 17:58:08 -0500
Subject: [PATCH 07/11] [F] Fix typos in report document

---
 .gitignore                       | 1 +
 src/resources/report_document.md | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index be8cda8..6e383b0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -128,3 +128,4 @@ config.json5
 data/
 /report/
 /src/report
+.DS_Store
diff --git a/src/resources/report_document.md b/src/resources/report_document.md
index 4a5e761..ac6377f 100644
--- a/src/resources/report_document.md
+++ b/src/resources/report_document.md
@@ -25,7 +25,7 @@ We also counted the number of people speaking each language:
 
 2. We also downloaded all tweets from our sampled users through the user-timeline API [(documentation)](https://developer.twitter.com/en/docs/twitter-api/v1/tweets/timelines/api-reference/get-statuses-user_timeline). Due to rate limiting, the program took around 16 hours to finish, and we obtained 7.7 GB of raw data (uncompressed). During processing, for each tweet, we extracted only its date, popularity (likes + retweets), whether it is a retweet, and whether it is COVID-related. The text of the tweets are not retained, and the processed data directory `data/twitter/user-tweets/processed/` is 141.6 MB in total.
 
-3. We also used the COVID-19 daily cases data published by New York Times [[3]](#ref3) to compare with peaks and throughs in our frequency over date graph.
+3. We also used the COVID-19 daily cases data published by New York Times [[3]](#ref3) to compare with peaks and troughs in our frequency over date graph.
 
 ## Computation & Filtering
 
@@ -196,7 +196,7 @@ These findings might not be surprising, but they might have again demonstrated p
 
 <a id="ref1"></a>
 
-[1] Bremmen, N. (2010, September 3). The 100 most influential news media twitter accounts. _Memeburn_. Retrieved November 27, 2021, from https://memeburn.com/2010/09/the-100-most-influential-news-media-twitter-accounts/.
+[1] Bremmen, N. (2010, September 3). The 100 most influential news media Twitter accounts. _Memeburn_. Retrieved November 27, 2021, from https://memeburn.com/2010/09/the-100-most-influential-news-media-twitter-accounts/.
 
 <a id="ref2"></a>
 

From b7e11cb45bcf5eb747a655603ba8658cf5219ea9 Mon Sep 17 00:00:00 2001
From: Hykilpikonna <me@hydev.org>
Date: Mon, 13 Dec 2021 18:01:48 -0500
Subject: [PATCH 08/11] [+] Pack resources

---
 src/processing.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/processing.py b/src/processing.py
index bc34aef..67b0e0d 100644
--- a/src/processing.py
+++ b/src/processing.py
@@ -14,7 +14,7 @@ import requests
 from bs4 import BeautifulSoup
 from py7zr import SevenZipFile
 
-from constants import DATA_DIR, TWEETS_DIR, USER_DIR
+from constants import DATA_DIR, TWEETS_DIR, USER_DIR, RES_DIR
 from utils import read, debug, write, json_stringify
 
 
@@ -338,3 +338,9 @@ def pack_data() -> None:
         for p in processed_dirs:
             debug(f'- Packing {p}')
             z.writeall(DATA_DIR + p)
+
+    # Pack resources
+    debug('Packing resources...')
+    with SevenZipFile(f'{packed_dir}/resources.7z', 'w') as z:
+        z: SevenZipFile = z
+        z.writeall(RES_DIR)

From 84bfca5e62a8edfcd5abc01b9a4f3a67d3d14709 Mon Sep 17 00:00:00 2001
From: Hykilpikonna <me@hydev.org>
Date: Mon, 13 Dec 2021 18:36:32 -0500
Subject: [PATCH 09/11] [+] Create script to pack for markus submission

---
 src/processing.py | 57 ++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 46 insertions(+), 11 deletions(-)

diff --git a/src/processing.py b/src/processing.py
index 67b0e0d..3515874 100644
--- a/src/processing.py
+++ b/src/processing.py
@@ -5,6 +5,8 @@ users, creating samples of users, filtering news channels, and processing tweets
 import json
 import os
 import random
+import sys
+import zipfile
 from dataclasses import dataclass
 from datetime import datetime
 from pathlib import Path
@@ -322,25 +324,58 @@ def is_covid_related(text: str) -> bool:
 
 def pack_data() -> None:
     """
-    This function packs processed data and raw data separately.
+    This function packs the processed data and the resources separately, and it also packs the
+    source code, the packed resources, and the report into a zip file for submission on MarkUs.
 
     :return: None
     """
     packed_dir = f'{DATA_DIR}/packed'
     Path(packed_dir).mkdir(parents=True, exist_ok=True)
+    packed_data = f'{packed_dir}/processed.7z'
+    packed_res = f'{packed_dir}/resources.7z'
 
-    # Pack data for processed.
-    debug('Packing data...')
-    processed_dirs = ['/twitter/user/meta', '/twitter/user/processed',
-                      '/twitter/user-tweets/processed']
-    with SevenZipFile(f'{packed_dir}/processed.7z', 'w') as z:
-        z: SevenZipFile = z
-        for p in processed_dirs:
-            debug(f'- Packing {p}')
-            z.writeall(DATA_DIR + p)
+    # Pack processed data (Since packing this takes a long time, we decided to not overwrite it for
+    # every run. This is also because the processed data hasn't changed since Nov 28 when the
+    # project is mostly finished, and there is no need to re-pack data every time, whereas the
+    # resources and sources might change after every update) So, delete the packed 7z file if you
+    # want to repack.
+    if not os.path.isfile(packed_data):
+        debug('Packing data...')
+        processed_dirs = ['/twitter/user/meta', '/twitter/user/processed',
+                          '/twitter/user-tweets/processed']
+        with SevenZipFile(packed_data, 'w') as z:
+            z: SevenZipFile = z
+            for p in processed_dirs:
+                debug(f'- Packing {p}')
+                z.writeall(DATA_DIR + p)
 
     # Pack resources
     debug('Packing resources...')
-    with SevenZipFile(f'{packed_dir}/resources.7z', 'w') as z:
+    with SevenZipFile(packed_res, 'w') as z:
         z: SevenZipFile = z
         z.writeall(RES_DIR)
+
+    # Pack MarkUs submission
+    # Even though 7zip has much better compression rate than zip, MarkUs only supports zip.
+    debug('Packing source code...')
+    with zipfile.ZipFile(f'{packed_dir}/markus.zip', 'w') as zf:
+        z: zipfile.ZipFile = zf
+
+        # Add sources
+        src_path = Path(os.path.realpath(__file__)).parent
+        for f in os.listdir(src_path):
+            if not os.path.isdir(f):
+                z.write(f)
+
+        # Add packed resource
+        z.write(packed_res, 'resources.7z')
+
+        # Add report tex and pdf
+        z.write(os.path.join(src_path, '../writing/report/project_report.tex'), 'project_report.tex')
+        z.write(os.path.join(src_path, '../writing/report/project_report.pdf'), 'project_report.pdf')
+
+    # Open packed location (Since there isn't a platform-independent way of doing this, we currently
+    # only support macOS)
+    if sys.platform == 'darwin':
+        os.system(f'open {Path(packed_dir).absolute()}')
+

From a65116d2549a69d570d60c0cbe4f8e1ade3ce3d1 Mon Sep 17 00:00:00 2001
From: Hykilpikonna <me@hydev.org>
Date: Mon, 13 Dec 2021 19:32:01 -0500
Subject: [PATCH 10/11] [O] Ignore .DS_Store in packing

---
 src/processing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/processing.py b/src/processing.py
index 3515874..63bd363 100644
--- a/src/processing.py
+++ b/src/processing.py
@@ -364,7 +364,7 @@ def pack_data() -> None:
         # Add sources
         src_path = Path(os.path.realpath(__file__)).parent
         for f in os.listdir(src_path):
-            if not os.path.isdir(f):
+            if not os.path.isdir(f) and f != '.DS_Store':
                 z.write(f)
 
         # Add packed resource

From 1dd0ebe56c07f3a62efe24315f51e68898a8f1f5 Mon Sep 17 00:00:00 2001
From: Hykilpikonna <me@hydev.org>
Date: Mon, 13 Dec 2021 19:32:24 -0500
Subject: [PATCH 11/11] [O] Ignore dot files

---
 src/processing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/processing.py b/src/processing.py
index 63bd363..116da65 100644
--- a/src/processing.py
+++ b/src/processing.py
@@ -364,7 +364,7 @@ def pack_data() -> None:
         # Add sources
         src_path = Path(os.path.realpath(__file__)).parent
         for f in os.listdir(src_path):
-            if not os.path.isdir(f) and f != '.DS_Store':
+            if not os.path.isdir(f) and f != '.DS_Store' and not f.startswith('._'):
                 z.write(f)
 
         # Add packed resource