[O] Reformat code, expand imports

2021-12-13 17:20:14 -05:00
parent 8e2550097f
commit 87eaa28794
7 changed files with 47 additions and 40 deletions
@@ -1,6 +1,7 @@
 """
 This module uses web requests to collect and process other data we are using in our analysis.
 """
+
 from dataclasses import dataclass

 import requests
@@ -12,12 +13,14 @@ class CasesData:
    A dataclass that stores a mapping of date to cases on that day and a mapping of date to deaths
    on that day.

+    Attributes:
+        - cases: cases[date in "YYYY-MM-DD"] = 7-day average of cases around that date
+        - deaths: deaths[date in "YYYY-MM-DD"] = 7-day average of deaths around that date
+
    Representation Invariants:
        - all(x >= 0 for x in self.cases.values())
        - all(x >= 0 for x in self.deaths.values())
-
    """
-    # cases[date in "YYYY-MM-DD"] = 7-day average of cases around that date
    cases: dict[str, float]
    deaths: dict[str, float]

@@ -4,16 +4,18 @@ It contains functions related scraping users/tweets, including:
 - getting the tweets of a user
 - downloading many users by checking their followers and follower's followers, etc.
 """
+import json
 import math
+import os
 import random
 import time
-from typing import List
+from typing import List, Union

 import tweepy
 from tweepy import API, TooManyRequests, User, Tweet, Unauthorized, NotFound

 from constants import TWEETS_DIR, USER_DIR
-from utils import *
+from utils import Config, debug, calculate_rate_delay, write, json_stringify, read


 def tweepy_login(conf: Config) -> tweepy.API:
@@ -57,7 +59,8 @@ def download_all_tweets(api: API, screen_name: str,
    Twitter API Reference
    --------
    It will be using the API endpoint api.twitter.com/statuses/user_timeline (Documentation:
-    https://developer.twitter.com/en/docs/twitter-api/v1/tweets/timelines/api-reference/get-statuses-user_timeline)
+    https://developer.twitter.com/en/docs/twitter-api/v1/tweets/timelines/api-reference/get-statuses-user_timeline
+    )
    This endpoint has a rate limit of 900 requests / 15-minutes = 60 rpm for user auth, and it has a
    limit of 100,000 requests / 24 hours = 69.44 rpm independent of authentication method. To be
    safe, this function uses a rate limit of 60 rpm.
@@ -294,15 +297,3 @@ def download_users_execute(api: API, n: float,

        # Rate limit
        time.sleep(rate_delay)
-
-
-if __name__ == '__main__':
-    # python_ta.check_all(config={
-    #     'max-line-length': 100,
-    #     'disable': ['R1705', 'C0200', 'E9998', 'E9999']
-    # })
-
-    config = load_config('config.json5')
-    tweepy_api = tweepy_login(config)
-    # download_users_start(tweepy_api, 'sauricat')
-    download_users_resume_progress(tweepy_api)
@@ -2,11 +2,12 @@
 This module is the main module of our program which runs different functions in different modules
 by steps.
 """
-from visualization import *
-from collect_twitter import *
-from report import serve_report
-from utils import *

+from collect_twitter import *
+from processing import *
+from report import *
+from utils import *
+from visualization import *

 if __name__ == '__main__':
    # Load config and create API
@@ -41,10 +42,6 @@ if __name__ == '__main__':
    # criteria as our sample, also find news channels
    # select_user_sample()

-    # Just curious, who are the 20 most popular individuals on twitter?
-    # print(tabulate(((u.username, u.popularity) for u in load_user_sample().most_popular[:20]),
-    #                headers=['Name', 'Followers']))
-
    #####################
    # Data collection - Step C2.1
    # (After step P2) Load the downloaded twitter users by popularity, and start downloading all
@@ -2,17 +2,20 @@
 Processes data downloaded from the Twitter API. Processing consists of calculating popularity of
 users, creating samples of users, filtering news channels, and processing tweets for file storage.
 """
+import json
+import os
 import random
-from typing import NamedTuple
 from dataclasses import dataclass
+from datetime import datetime
+from pathlib import Path
+from typing import NamedTuple

-import dateutil.parser
 import requests
 from bs4 import BeautifulSoup
 from py7zr import SevenZipFile

 from constants import DATA_DIR, TWEETS_DIR, USER_DIR
-from utils import *
+from utils import read, debug, write, json_stringify


 class ProcessedUser(NamedTuple):
@@ -1,6 +1,7 @@
 """
 This module generates report HTML and serves it in an HTTP server.
 """
+
 import json
 import os.path
 import shutil
@@ -1,4 +1,5 @@
-"""This module contains useful functions and classes, including:
+"""
+This module contains useful functions and classes, including:
 - debug messages
 - file I/O
 - statistics functions, removing outliers and averaging values over a period
@@ -14,7 +15,7 @@ import statistics
 from dataclasses import dataclass
 from datetime import datetime, date, timedelta
 from pathlib import Path
-from typing import Union, NamedTuple, Any, Generator
+from typing import Union, Any, Generator

 import json5
 import numpy as np
@@ -49,7 +50,8 @@ class Config:

 def load_config(path: str = 'config.json5') -> Config:
    """
-    Load config using JSON5, from either the local file ~/config.json5 or from the environment variable named config.
+    Load config using JSON5, from either the local file ~/config.json5 or from the environment
+    variable named config.

    :param path: Path of the config file (Default: config.json5)
    :return: Config object
@@ -242,6 +244,7 @@ def tabulate_stats(stats: list[Stats], percent: bool = False) -> list[list[str]]
    :param percent: Whether the numbers are percentages
    :return: Table for tabulate
    """
+
    def num(n: float) -> str:
        return f'{n:.2f}' if not percent else f'{n * 100:.1f}%'

@@ -384,6 +387,7 @@ class EnhancedJSONEncoder(json.JSONEncoder):
    An improvement to the json.JSONEncoder class, which supports:
    encoding for dataclasses, encoding for datetime, and sets
    """
+
    def default(self, o):

        # Support encoding dataclasses
@@ -1,18 +1,25 @@
 """
-This module uses matplotlib to visualize processed data as graphs. The results are stored in report directory.
+This module uses matplotlib to visualize processed data as graphs. The results are stored in
+the report directory.
 The graphs are created after processing the data, for example with filtering and removing outliers.
 """
-import os.path
-from typing import Optional

+import os.path
+from dataclasses import dataclass
+from datetime import datetime
+from pathlib import Path
+from typing import Optional, Union
+
+import matplotlib.dates as mdates
 import matplotlib.ticker
 import scipy.signal
 from matplotlib import pyplot as plt, font_manager
-import matplotlib.dates as mdates

-from constants import RES_DIR
-from processing import *
 from collect_others import get_covid_cases_us
+from constants import RES_DIR, REPORT_DIR
+from processing import load_tweets, load_user_sample
+from utils import debug, daterange, map_to_dates, filter_days_avg, Reporter, remove_outliers, \
+    tabulate_stats, get_statistics


@dataclass()
@@ -163,7 +170,8 @@ class Sample:
            popularity.append(UserFloat(u, covid_pop_avg / all_pop_avg))

        # Calculate frequency on date
-        self.date_covid_freq = {d: date_covid_count[d] / date_all_count[d] for d in date_covid_count}
+        self.date_covid_freq = {d: date_covid_count[d] / date_all_count[d] for d in
+                                date_covid_count}

        # Sort by relative popularity or frequency
        popularity.sort(key=lambda x: x.data, reverse=True)
@@ -244,7 +252,7 @@ def load_samples() -> list[Sample]:
    keys = ['en', 'zh', 'ja']
    pop_lang = [u.lang for u in users.most_popular]
    rand_lang = [u.lang for u in users.random]
-    Reporter('sample-demographics.md')\
+    Reporter('sample-demographics.md') \
        .table([['`500-pop`'] + [str(len(pop_lang))] + [str(pop_lang.count(k)) for k in keys],
                ['`500-rand`'] + [str(len(rand_lang))] + [str(rand_lang.count(k)) for k in keys]],
               ['Total', 'English', 'Chinese', 'Japanese'], False)
@@ -512,7 +520,7 @@ def report_all() -> None:
    Generate all reports
    
    Preconditions:
-        - Report has been 
+        - Twitter data have been downloaded and processed.
    """
    graph_load_font()