From 4099b2d4fd0aa307ea717a1024a91f5a9dff9697 Mon Sep 17 00:00:00 2001 From: Hykilpikonna Date: Mon, 22 Nov 2021 11:51:28 -0500 Subject: [PATCH] [O] Make rate limit static --- src/main.py | 11 +++++++++-- src/raw_collect/twitter.py | 32 +++++++++++++++++--------------- src/utils.py | 10 ++++++++++ 3 files changed, 36 insertions(+), 17 deletions(-) diff --git a/src/main.py b/src/main.py index e42aff8..feec6e8 100644 --- a/src/main.py +++ b/src/main.py @@ -21,13 +21,20 @@ if __name__ == '__main__': ##################### # Data processing - Step P1 # (After step C1) Process the downloaded twitter users by popularity - users = process_users_popularity() - + # users = process_users_popularity() + ##################### + # Data collection - Step C2 + # (After step P1) Load the downloaded twitter users by popularity, and start downloading as many + # tweets from these users as possible. + users = load_users_popularity() # Just curious, who are the 20 most popular individuals on twitter? print(tabulate(((u.username, u.popularity) for u in users[:20]), headers=['Name', 'Followers'])) + # Start download + + ##################### # Data collection - Step C2 # Download as many posts of the most popular individuals as possible. diff --git a/src/raw_collect/twitter.py b/src/raw_collect/twitter.py index 2c1b3d6..e2f5019 100644 --- a/src/raw_collect/twitter.py +++ b/src/raw_collect/twitter.py @@ -15,7 +15,8 @@ import tweepy from tweepy import API, TooManyRequests, User from process.twitter_process import Posting -from utils import Config, debug, json_stringify, load_config, normalize_directory +from utils import Config, debug, json_stringify, load_config, normalize_directory, \ + calculate_rate_delay @dataclass @@ -59,14 +60,18 @@ def tweepy_login(conf: Config) -> tweepy.API: return api -def download_user_tweets(api: API, screen_name: str) -> None: +def download_user_tweets(api: API, screen_name: str, + base_dir: str = './data/twitter/user-tweets/') -> None: """ Download all tweets from a specific individual to a local folder :param api: Tweepy API object :param screen_name: Screen name of that individual + :param base_dir: The downloads folder (Default: "./data/twitter/user-tweets/") :return: None """ + base_dir = normalize_directory(base_dir) + debug(f'Getting user tweets for {screen_name}') # Get initial 200 tweets @@ -101,8 +106,7 @@ def download_user_tweets(api: API, screen_name: str) -> None: def download_users_start(api: API, start_point: str, n: float = math.inf, - base_dir: str = './data/twitter/user/', - rate_limit: int = 1) -> None: + base_dir: str = './data/twitter/user/') -> None: """ This function downloads n twitter users by using a friends-chain. @@ -141,7 +145,6 @@ def download_users_start(api: API, start_point: str, n: float = math.inf, :param start_point: Starting user's screen name. :param n: How many users do you want to download? (Default: math.inf) :param base_dir: The downloads folder (Default: "./data/twitter/user/") - :param rate_limit: The maximum number of requests per minute. (Default: 1) :return: None """ @@ -158,7 +161,7 @@ def download_users_start(api: API, start_point: str, n: float = math.inf, next_set = set() # Start download - download_users_execute(api, n, base_dir, rate_limit, downloaded, + download_users_execute(api, n, base_dir, downloaded, done_set, current_set, next_set) @@ -175,12 +178,12 @@ def download_users_resume_progress(api: API, base_dir: str = './data/twitter/use meta = json.load(f) # Resume - download_users_execute(api, meta['n'], base_dir, meta['rate_limit'], + download_users_execute(api, meta['n'], base_dir, set(meta['downloaded']), set(meta['done_set']), set(meta['current_set']), set(meta['next_set'])) -def download_users_execute(api: API, n: float, base_dir: str, rate_limit: int, +def download_users_execute(api: API, n: float, base_dir: str, downloaded: set[str], done_set: set[str], current_set: set[str], next_set: set[str]) -> None: """ @@ -195,7 +198,6 @@ def download_users_execute(api: API, n: float, base_dir: str, rate_limit: int, :param api: Tweepy's API object :param n: How many users do you want to download? :param base_dir: The downloads folder - :param rate_limit: The maximum number of requests per minute :param downloaded: Set of all the downloaded users' screen names :param done_set: The set of starting users that are queried :param current_set: The set of starting users currently looping through @@ -208,15 +210,16 @@ def download_users_execute(api: API, n: float, base_dir: str, rate_limit: int, Path(f'{base_dir}/users').mkdir(parents=True, exist_ok=True) Path(f'{base_dir}/meta').mkdir(parents=True, exist_ok=True) - # Rate limit delay - rate_delay = 1 / rate_limit * 60 + 1 + # Rate limit for this API endpoint is 1 request per minute, and rate delay defines how many + # seconds to sleep for each request. + rate_delay = calculate_rate_delay(1) print("Executing friends-chain download:") print(f"- n: {n}") - print(f"- Requests per minute: {rate_limit}") + print(f"- Requests per minute: 1") print(f"- Directory: {base_dir}") print(f"- Downloaded: {len(downloaded)}") - print(f"- Current search set: {current_set}") + print(f"- Current search set: {len(current_set)}") print(f"- Next search set: {len(next_set)}") print() @@ -281,8 +284,7 @@ def download_users_execute(api: API, n: float, base_dir: str, rate_limit: int, # Update meta info so that downloading can be continued with open(f'{base_dir}/meta/meta.json', 'w', encoding='utf-8') as f: meta = {'downloaded': downloaded, 'done_set': done_set, - 'current_set': current_set, 'next_set': next_set, - 'n': n, 'rate_limit': rate_limit} + 'current_set': current_set, 'next_set': next_set, 'n': n} f.write(json_stringify(meta, indent=None)) debug(f'Finished saving friends of {screen_name}') diff --git a/src/utils.py b/src/utils.py index 0f27fd3..fee03b0 100644 --- a/src/utils.py +++ b/src/utils.py @@ -80,6 +80,16 @@ def normalize_directory(directory: str) -> str: return directory +def calculate_rate_delay(rate_limit: float) -> float: + """ + Calculate the rate delay for each request given rate limit in request per minute + + :param rate_limit: Rate limit in requests per minute + :return: Rate delay in seconds per request (added one second just to be safe) + """ + return 1 / rate_limit * 60 + 1 + + class EnhancedJSONEncoder(json.JSONEncoder): def default(self, o):