[O] Reformat code, expand imports

This commit is contained in:
Hykilpikonna
2021-12-13 17:20:14 -05:00
parent 8e2550097f
commit 87eaa28794
7 changed files with 47 additions and 40 deletions
+5 -2
View File
@@ -1,6 +1,7 @@
"""
This module uses web requests to collect and process other data we are using in our analysis.
"""
from dataclasses import dataclass
import requests
@@ -12,12 +13,14 @@ class CasesData:
A dataclass that stores a mapping of date to cases on that day and a mapping of date to deaths
on that day.
Attributes:
- cases: cases[date in "YYYY-MM-DD"] = 7-day average of cases around that date
- deaths: deaths[date in "YYYY-MM-DD"] = 7-day average of deaths around that date
Representation Invariants:
- all(x >= 0 for x in self.cases.values())
- all(x >= 0 for x in self.deaths.values())
"""
# cases[date in "YYYY-MM-DD"] = 7-day average of cases around that date
cases: dict[str, float]
deaths: dict[str, float]
+6 -15
View File
@@ -4,16 +4,18 @@ It contains functions related to scraping users/tweets, including:
- getting the tweets of a user
- downloading many users by checking their followers and follower's followers, etc.
"""
import json
import math
import os
import random
import time
from typing import List
from typing import List, Union
import tweepy
from tweepy import API, TooManyRequests, User, Tweet, Unauthorized, NotFound
from constants import TWEETS_DIR, USER_DIR
from utils import *
from utils import Config, debug, calculate_rate_delay, write, json_stringify, read
def tweepy_login(conf: Config) -> tweepy.API:
@@ -57,7 +59,8 @@ def download_all_tweets(api: API, screen_name: str,
Twitter API Reference
--------
It will be using the API endpoint api.twitter.com/statuses/user_timeline (Documentation:
https://developer.twitter.com/en/docs/twitter-api/v1/tweets/timelines/api-reference/get-statuses-user_timeline)
https://developer.twitter.com/en/docs/twitter-api/v1/tweets/timelines/api-reference/get-statuses-user_timeline)
This endpoint has a rate limit of 900 requests / 15-minutes = 60 rpm for user auth, and it has a
limit of 100,000 requests / 24 hours = 69.44 rpm independent of authentication method. To be
safe, this function uses a rate limit of 60 rpm.
@@ -294,15 +297,3 @@ def download_users_execute(api: API, n: float,
# Rate limit
time.sleep(rate_delay)
if __name__ == '__main__':
# python_ta.check_all(config={
# 'max-line-length': 100,
# 'disable': ['R1705', 'C0200', 'E9998', 'E9999']
# })
config = load_config('config.json5')
tweepy_api = tweepy_login(config)
# download_users_start(tweepy_api, 'sauricat')
download_users_resume_progress(tweepy_api)
+5 -8
View File
@@ -2,11 +2,12 @@
This module is the main module of our program, which runs functions from the other modules
step by step.
"""
from visualization import *
from collect_twitter import *
from report import serve_report
from utils import *
from collect_twitter import *
from processing import *
from report import *
from utils import *
from visualization import *
if __name__ == '__main__':
# Load config and create API
@@ -41,10 +42,6 @@ if __name__ == '__main__':
# criteria as our sample, also find news channels
# select_user_sample()
# Just curious, who are the 20 most popular individuals on twitter?
# print(tabulate(((u.username, u.popularity) for u in load_user_sample().most_popular[:20]),
# headers=['Name', 'Followers']))
#####################
# Data collection - Step C2.1
# (After step P2) Load the downloaded twitter users by popularity, and start downloading all
+6 -3
View File
@@ -2,17 +2,20 @@
Processes data downloaded from the Twitter API. Processing consists of calculating popularity of
users, creating samples of users, filtering news channels, and processing tweets for file storage.
"""
import json
import os
import random
from typing import NamedTuple
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import NamedTuple
import dateutil.parser
import requests
from bs4 import BeautifulSoup
from py7zr import SevenZipFile
from constants import DATA_DIR, TWEETS_DIR, USER_DIR
from utils import *
from utils import read, debug, write, json_stringify
class ProcessedUser(NamedTuple):
+1
View File
@@ -1,6 +1,7 @@
"""
This module generates report HTML and serves it in an HTTP server.
"""
import json
import os.path
import shutil
+7 -3
View File
@@ -1,4 +1,5 @@
"""This module contains useful functions and classes, including:
"""
This module contains useful functions and classes, including:
- debug messages
- file I/O
- statistics functions, removing outliers and averaging values over a period
@@ -14,7 +15,7 @@ import statistics
from dataclasses import dataclass
from datetime import datetime, date, timedelta
from pathlib import Path
from typing import Union, NamedTuple, Any, Generator
from typing import Union, Any, Generator
import json5
import numpy as np
@@ -49,7 +50,8 @@ class Config:
def load_config(path: str = 'config.json5') -> Config:
"""
Load config using JSON5, from either the local file ~/config.json5 or from the environment variable named config.
Load config using JSON5, from either the local file ~/config.json5 or from the environment
variable named config.
:param path: Path of the config file (Default: config.json5)
:return: Config object
@@ -242,6 +244,7 @@ def tabulate_stats(stats: list[Stats], percent: bool = False) -> list[list[str]]
:param percent: Whether the numbers are percentages
:return: Table for tabulate
"""
# Format one number for the stats table: when `percent` (the flag taken by tabulate_stats) is
# set, scale by 100 and show one decimal place with a '%' suffix; otherwise show two decimals.
def num(n: float) -> str:
return f'{n:.2f}' if not percent else f'{n * 100:.1f}%'
@@ -384,6 +387,7 @@ class EnhancedJSONEncoder(json.JSONEncoder):
An improvement to the json.JSONEncoder class, which supports:
encoding for dataclasses, encoding for datetime, and sets
"""
def default(self, o):
# Support encoding dataclasses
+17 -9
View File
@@ -1,18 +1,25 @@
"""
This module uses matplotlib to visualize processed data as graphs. The results are stored in report directory.
This module uses matplotlib to visualize processed data as graphs. The results are stored in
the report directory.
The graphs are created after processing the data, for example with filtering and removing outliers.
"""
import os.path
from typing import Optional
import os.path
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Optional, Union
import matplotlib.dates as mdates
import matplotlib.ticker
import scipy.signal
from matplotlib import pyplot as plt, font_manager
import matplotlib.dates as mdates
from constants import RES_DIR
from processing import *
from collect_others import get_covid_cases_us
from constants import RES_DIR, REPORT_DIR
from processing import load_tweets, load_user_sample
from utils import debug, daterange, map_to_dates, filter_days_avg, Reporter, remove_outliers, \
tabulate_stats, get_statistics
@dataclass()
@@ -163,7 +170,8 @@ class Sample:
popularity.append(UserFloat(u, covid_pop_avg / all_pop_avg))
# Calculate frequency on date
self.date_covid_freq = {d: date_covid_count[d] / date_all_count[d] for d in date_covid_count}
self.date_covid_freq = {d: date_covid_count[d] / date_all_count[d] for d in
date_covid_count}
# Sort by relative popularity or frequency
popularity.sort(key=lambda x: x.data, reverse=True)
@@ -244,7 +252,7 @@ def load_samples() -> list[Sample]:
keys = ['en', 'zh', 'ja']
pop_lang = [u.lang for u in users.most_popular]
rand_lang = [u.lang for u in users.random]
Reporter('sample-demographics.md')\
Reporter('sample-demographics.md') \
.table([['`500-pop`'] + [str(len(pop_lang))] + [str(pop_lang.count(k)) for k in keys],
['`500-rand`'] + [str(len(rand_lang))] + [str(rand_lang.count(k)) for k in keys]],
['Total', 'English', 'Chinese', 'Japanese'], False)
@@ -512,7 +520,7 @@ def report_all() -> None:
Generate all reports
Preconditions:
- Report has been
- Twitter data have been downloaded and processed.
"""
graph_load_font()