diff --git a/src/collect_others.py b/src/collect_others.py index 3af7d8d..e40d439 100644 --- a/src/collect_others.py +++ b/src/collect_others.py @@ -1,6 +1,7 @@ """ This module uses web requests to collect and process other data we are using in our analysis. """ + from dataclasses import dataclass import requests @@ -12,12 +13,14 @@ class CasesData: A dataclass that stores a mapping of date to cases on that day and a mapping of date to deaths on that day. + Attributes: + - cases: cases[date in "YYYY-MM-DD"] = 7-day average of cases around that date + - deaths: deaths[date in "YYYY-MM-DD"] = 7-day average of deaths around that date + Representation Invariants: - all(x >= 0 for x in self.cases.values()) - all(x >= 0 for x in self.deaths.values()) - """ - # cases[date in "YYYY-MM-DD"] = 7-day average of cases around that date cases: dict[str, float] deaths: dict[str, float] diff --git a/src/collect_twitter.py b/src/collect_twitter.py index 1589c3b..c5385b3 100644 --- a/src/collect_twitter.py +++ b/src/collect_twitter.py @@ -4,16 +4,18 @@ It contains functions related scraping users/tweets, including: - getting the tweets of a user - downloading many users by checking their followers and follower's followers, etc. """ +import json import math +import os import random import time -from typing import List +from typing import List, Union import tweepy from tweepy import API, TooManyRequests, User, Tweet, Unauthorized, NotFound from constants import TWEETS_DIR, USER_DIR -from utils import * +from utils import Config, debug, calculate_rate_delay, write, json_stringify, read def tweepy_login(conf: Config) -> tweepy.API: @@ -57,7 +59,8 @@ def download_all_tweets(api: API, screen_name: str, Twitter API Reference -------- It will be using the API endpoint api.twitter.com/statuses/user_timeline (Documentation: - https://developer.twitter.com/en/docs/twitter-api/v1/tweets/timelines/api-reference/get-statuses-user_timeline) + https://developer.twitter.com/en/docs/twitter-api/v1/tweets/timelines/api-reference/get + -statuses-user_timeline) This endpoint has a rate limit of 900 requests / 15-minutes = 60 rpm for user auth, and it has a limit of 100,000 requests / 24 hours = 69.44 rpm independent of authentication method. To be safe, this function uses a rate limit of 60 rpm. @@ -294,15 +297,3 @@ def download_users_execute(api: API, n: float, # Rate limit time.sleep(rate_delay) - - -if __name__ == '__main__': - # python_ta.check_all(config={ - # 'max-line-length': 100, - # 'disable': ['R1705', 'C0200', 'E9998', 'E9999'] - # }) - - config = load_config('config.json5') - tweepy_api = tweepy_login(config) - # download_users_start(tweepy_api, 'sauricat') - download_users_resume_progress(tweepy_api) diff --git a/src/main.py b/src/main.py index db52016..a71ae6f 100644 --- a/src/main.py +++ b/src/main.py @@ -2,11 +2,12 @@ This module is the main module of our program which runs different functions in different modules by steps. """ -from visualization import * -from collect_twitter import * -from report import serve_report -from utils import * +from collect_twitter import * +from processing import * +from report import * +from utils import * +from visualization import * if __name__ == '__main__': # Load config and create API @@ -41,10 +42,6 @@ if __name__ == '__main__': # criteria as our sample, also find news channels # select_user_sample() - # Just curious, who are the 20 most popular individuals on twitter? - # print(tabulate(((u.username, u.popularity) for u in load_user_sample().most_popular[:20]), - # headers=['Name', 'Followers'])) - ##################### # Data collection - Step C2.1 # (After step P2) Load the downloaded twitter users by popularity, and start downloading all diff --git a/src/processing.py b/src/processing.py index 288010d..cc237c9 100644 --- a/src/processing.py +++ b/src/processing.py @@ -2,17 +2,20 @@ Processes data downloaded from the Twitter API. Processing consists of calculating popularity of users, creating samples of users, filtering news channels, and processing tweets for file storage. """ +import json +import os import random -from typing import NamedTuple from dataclasses import dataclass +from datetime import datetime +from pathlib import Path +from typing import NamedTuple -import dateutil.parser import requests from bs4 import BeautifulSoup from py7zr import SevenZipFile from constants import DATA_DIR, TWEETS_DIR, USER_DIR -from utils import * +from utils import read, debug, write, json_stringify class ProcessedUser(NamedTuple): diff --git a/src/report.py b/src/report.py index 1d37fbc..242244a 100644 --- a/src/report.py +++ b/src/report.py @@ -1,6 +1,7 @@ """ This module generates report HTML and serves it in an HTTP server. """ + import json import os.path import shutil diff --git a/src/utils.py b/src/utils.py index 29f0b05..feaeda4 100644 --- a/src/utils.py +++ b/src/utils.py @@ -1,4 +1,5 @@ -"""This module contains useful functions and classes, including: +""" +This module contains useful functions and classes, including: - debug messages - file I/O - statistics functions, removing outliers and averaging values over a period @@ -14,7 +15,7 @@ import statistics from dataclasses import dataclass from datetime import datetime, date, timedelta from pathlib import Path -from typing import Union, NamedTuple, Any, Generator +from typing import Union, Any, Generator import json5 import numpy as np @@ -49,7 +50,8 @@ class Config: def load_config(path: str = 'config.json5') -> Config: """ - Load config using JSON5, from either the local file ~/config.json5 or from the environment variable named config. + Load config using JSON5, from either the local file ~/config.json5 or from the environment + variable named config. :param path: Path of the config file (Default: config.json5) :return: Config object @@ -242,6 +244,7 @@ def tabulate_stats(stats: list[Stats], percent: bool = False) -> list[list[str]] :param percent: Whether the numbers are percentages :return: Table for tabulate """ + def num(n: float) -> str: return f'{n:.2f}' if not percent else f'{n * 100:.1f}%' @@ -384,6 +387,7 @@ class EnhancedJSONEncoder(json.JSONEncoder): An improvement to the json.JSONEncoder class, which supports: encoding for dataclasses, encoding for datetime, and sets """ + def default(self, o): # Support encoding dataclasses diff --git a/src/visualization.py b/src/visualization.py index 69419d5..00f71f9 100644 --- a/src/visualization.py +++ b/src/visualization.py @@ -1,18 +1,25 @@ """ -This module uses matplotlib to visualize processed data as graphs. The results are stored in report directory. +This module uses matplotlib to visualize processed data as graphs. The results are stored in +report directory. The graphs are created after processing the data, for example with filtering and removing outliers. """ -import os.path -from typing import Optional +import os.path +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path +from typing import Optional, Union + +import matplotlib.dates as mdates import matplotlib.ticker import scipy.signal from matplotlib import pyplot as plt, font_manager -import matplotlib.dates as mdates -from constants import RES_DIR -from processing import * from collect_others import get_covid_cases_us +from constants import RES_DIR, REPORT_DIR +from processing import load_tweets, load_user_sample +from utils import debug, daterange, map_to_dates, filter_days_avg, Reporter, remove_outliers, \ + tabulate_stats, get_statistics @dataclass() @@ -163,7 +170,8 @@ class Sample: popularity.append(UserFloat(u, covid_pop_avg / all_pop_avg)) # Calculate frequency on date - self.date_covid_freq = {d: date_covid_count[d] / date_all_count[d] for d in date_covid_count} + self.date_covid_freq = {d: date_covid_count[d] / date_all_count[d] for d in + date_covid_count} # Sort by relative popularity or frequency popularity.sort(key=lambda x: x.data, reverse=True) @@ -244,7 +252,7 @@ def load_samples() -> list[Sample]: keys = ['en', 'zh', 'ja'] pop_lang = [u.lang for u in users.most_popular] rand_lang = [u.lang for u in users.random] - Reporter('sample-demographics.md')\ + Reporter('sample-demographics.md') \ .table([['`500-pop`'] + [str(len(pop_lang))] + [str(pop_lang.count(k)) for k in keys], ['`500-rand`'] + [str(len(rand_lang))] + [str(rand_lang.count(k)) for k in keys]], ['Total', 'English', 'Chinese', 'Japanese'], False) @@ -512,7 +520,7 @@ def report_all() -> None: Generate all reports Preconditions: - - Report has been + - Twitter data have been downloaded and processed. """ graph_load_font()