[O] Reformat code, expand imports
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
"""
|
||||
This module uses web requests to collect and process other data we are using in our analysis.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
import requests
|
||||
@@ -12,12 +13,14 @@ class CasesData:
|
||||
A dataclass that stores a mapping of date to cases on that day and a mapping of date to deaths
|
||||
on that day.
|
||||
|
||||
Attributes:
|
||||
- cases: cases[date in "YYYY-MM-DD"] = 7-day average of cases around that date
|
||||
- deaths: deaths[date in "YYYY-MM-DD"] = 7-day average of deaths around that date
|
||||
|
||||
Representation Invariants:
|
||||
- all(x >= 0 for x in self.cases.values())
|
||||
- all(x >= 0 for x in self.deaths.values())
|
||||
|
||||
"""
|
||||
# cases[date in "YYYY-MM-DD"] = 7-day average of cases around that date
|
||||
cases: dict[str, float]
|
||||
deaths: dict[str, float]
|
||||
|
||||
|
||||
+6
-15
@@ -4,16 +4,18 @@ It contains functions related to scraping users/tweets, including:
|
||||
- getting the tweets of a user
|
||||
- downloading many users by checking their followers and followers' followers, etc.
|
||||
"""
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
import random
|
||||
import time
|
||||
from typing import List
|
||||
from typing import List, Union
|
||||
|
||||
import tweepy
|
||||
from tweepy import API, TooManyRequests, User, Tweet, Unauthorized, NotFound
|
||||
|
||||
from constants import TWEETS_DIR, USER_DIR
|
||||
from utils import *
|
||||
from utils import Config, debug, calculate_rate_delay, write, json_stringify, read
|
||||
|
||||
|
||||
def tweepy_login(conf: Config) -> tweepy.API:
|
||||
@@ -57,7 +59,8 @@ def download_all_tweets(api: API, screen_name: str,
|
||||
Twitter API Reference
|
||||
--------
|
||||
It will be using the API endpoint api.twitter.com/statuses/user_timeline (Documentation:
|
||||
https://developer.twitter.com/en/docs/twitter-api/v1/tweets/timelines/api-reference/get-statuses-user_timeline)
|
||||
https://developer.twitter.com/en/docs/twitter-api/v1/tweets/timelines/api-reference/get
|
||||
-statuses-user_timeline)
|
||||
This endpoint has a rate limit of 900 requests / 15-minutes = 60 rpm for user auth, and it has a
|
||||
limit of 100,000 requests / 24 hours = 69.44 rpm independent of authentication method. To be
|
||||
safe, this function uses a rate limit of 60 rpm.
|
||||
@@ -294,15 +297,3 @@ def download_users_execute(api: API, n: float,
|
||||
|
||||
# Rate limit
|
||||
time.sleep(rate_delay)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# python_ta.check_all(config={
|
||||
# 'max-line-length': 100,
|
||||
# 'disable': ['R1705', 'C0200', 'E9998', 'E9999']
|
||||
# })
|
||||
|
||||
config = load_config('config.json5')
|
||||
tweepy_api = tweepy_login(config)
|
||||
# download_users_start(tweepy_api, 'sauricat')
|
||||
download_users_resume_progress(tweepy_api)
|
||||
|
||||
+5
-8
@@ -2,11 +2,12 @@
|
||||
This module is the main module of our program which runs different functions in different modules
|
||||
by steps.
|
||||
"""
|
||||
from visualization import *
|
||||
from collect_twitter import *
|
||||
from report import serve_report
|
||||
from utils import *
|
||||
|
||||
from collect_twitter import *
|
||||
from processing import *
|
||||
from report import *
|
||||
from utils import *
|
||||
from visualization import *
|
||||
|
||||
if __name__ == '__main__':
|
||||
# Load config and create API
|
||||
@@ -41,10 +42,6 @@ if __name__ == '__main__':
|
||||
# criteria as our sample, also find news channels
|
||||
# select_user_sample()
|
||||
|
||||
# Just curious, who are the 20 most popular individuals on twitter?
|
||||
# print(tabulate(((u.username, u.popularity) for u in load_user_sample().most_popular[:20]),
|
||||
# headers=['Name', 'Followers']))
|
||||
|
||||
#####################
|
||||
# Data collection - Step C2.1
|
||||
# (After step P2) Load the downloaded twitter users by popularity, and start downloading all
|
||||
|
||||
+6
-3
@@ -2,17 +2,20 @@
|
||||
Processes data downloaded from the Twitter API. Processing consists of calculating popularity of
|
||||
users, creating samples of users, filtering news channels, and processing tweets for file storage.
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
from typing import NamedTuple
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import NamedTuple
|
||||
|
||||
import dateutil.parser
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from py7zr import SevenZipFile
|
||||
|
||||
from constants import DATA_DIR, TWEETS_DIR, USER_DIR
|
||||
from utils import *
|
||||
from utils import read, debug, write, json_stringify
|
||||
|
||||
|
||||
class ProcessedUser(NamedTuple):
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
"""
|
||||
This module generates report HTML and serves it in an HTTP server.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os.path
|
||||
import shutil
|
||||
|
||||
+7
-3
@@ -1,4 +1,5 @@
|
||||
"""This module contains useful functions and classes, including:
|
||||
"""
|
||||
This module contains useful functions and classes, including:
|
||||
- debug messages
|
||||
- file I/O
|
||||
- statistics functions, removing outliers and averaging values over a period
|
||||
@@ -14,7 +15,7 @@ import statistics
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, date, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Union, NamedTuple, Any, Generator
|
||||
from typing import Union, Any, Generator
|
||||
|
||||
import json5
|
||||
import numpy as np
|
||||
@@ -49,7 +50,8 @@ class Config:
|
||||
|
||||
def load_config(path: str = 'config.json5') -> Config:
|
||||
"""
|
||||
Load config using JSON5, from either the local file ~/config.json5 or from the environment variable named config.
|
||||
Load config using JSON5, from either the local file ~/config.json5 or from the environment
|
||||
variable named config.
|
||||
|
||||
:param path: Path of the config file (Default: config.json5)
|
||||
:return: Config object
|
||||
@@ -242,6 +244,7 @@ def tabulate_stats(stats: list[Stats], percent: bool = False) -> list[list[str]]
|
||||
:param percent: Whether the numbers are percentages
|
||||
:return: Table for tabulate
|
||||
"""
|
||||
|
||||
def num(n: float) -> str:
|
||||
return f'{n:.2f}' if not percent else f'{n * 100:.1f}%'
|
||||
|
||||
@@ -384,6 +387,7 @@ class EnhancedJSONEncoder(json.JSONEncoder):
|
||||
An improvement to the json.JSONEncoder class, which supports:
|
||||
encoding for dataclasses, encoding for datetime, and sets
|
||||
"""
|
||||
|
||||
def default(self, o):
|
||||
|
||||
# Support encoding dataclasses
|
||||
|
||||
+17
-9
@@ -1,18 +1,25 @@
|
||||
"""
|
||||
This module uses matplotlib to visualize processed data as graphs. The results are stored in report directory.
|
||||
This module uses matplotlib to visualize processed data as graphs. The results are stored in
|
||||
the report directory.
|
||||
The graphs are created after processing the data, for example with filtering and removing outliers.
|
||||
"""
|
||||
import os.path
|
||||
from typing import Optional
|
||||
|
||||
import os.path
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Optional, Union
|
||||
|
||||
import matplotlib.dates as mdates
|
||||
import matplotlib.ticker
|
||||
import scipy.signal
|
||||
from matplotlib import pyplot as plt, font_manager
|
||||
import matplotlib.dates as mdates
|
||||
|
||||
from constants import RES_DIR
|
||||
from processing import *
|
||||
from collect_others import get_covid_cases_us
|
||||
from constants import RES_DIR, REPORT_DIR
|
||||
from processing import load_tweets, load_user_sample
|
||||
from utils import debug, daterange, map_to_dates, filter_days_avg, Reporter, remove_outliers, \
|
||||
tabulate_stats, get_statistics
|
||||
|
||||
|
||||
@dataclass()
|
||||
@@ -163,7 +170,8 @@ class Sample:
|
||||
popularity.append(UserFloat(u, covid_pop_avg / all_pop_avg))
|
||||
|
||||
# Calculate frequency on date
|
||||
self.date_covid_freq = {d: date_covid_count[d] / date_all_count[d] for d in date_covid_count}
|
||||
self.date_covid_freq = {d: date_covid_count[d] / date_all_count[d] for d in
|
||||
date_covid_count}
|
||||
|
||||
# Sort by relative popularity or frequency
|
||||
popularity.sort(key=lambda x: x.data, reverse=True)
|
||||
@@ -244,7 +252,7 @@ def load_samples() -> list[Sample]:
|
||||
keys = ['en', 'zh', 'ja']
|
||||
pop_lang = [u.lang for u in users.most_popular]
|
||||
rand_lang = [u.lang for u in users.random]
|
||||
Reporter('sample-demographics.md')\
|
||||
Reporter('sample-demographics.md') \
|
||||
.table([['`500-pop`'] + [str(len(pop_lang))] + [str(pop_lang.count(k)) for k in keys],
|
||||
['`500-rand`'] + [str(len(rand_lang))] + [str(rand_lang.count(k)) for k in keys]],
|
||||
['Total', 'English', 'Chinese', 'Japanese'], False)
|
||||
@@ -512,7 +520,7 @@ def report_all() -> None:
|
||||
Generate all reports
|
||||
|
||||
Preconditions:
|
||||
- Report has been
|
||||
- Twitter data have been downloaded and processed.
|
||||
"""
|
||||
graph_load_font()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user