From 51366074bf20e7ab6fc4d7ea2c82a302a4b6e1bf Mon Sep 17 00:00:00 2001 From: Hykilpikonna Date: Sun, 21 Nov 2021 23:44:22 -0500 Subject: [PATCH] [O] Separate resume function --- src/raw_collect/twitter.py | 51 +++++++++++++++++++++++++++++--------- 1 file changed, 39 insertions(+), 12 deletions(-) diff --git a/src/raw_collect/twitter.py b/src/raw_collect/twitter.py index 9a23483..447eb5b 100644 --- a/src/raw_collect/twitter.py +++ b/src/raw_collect/twitter.py @@ -153,6 +153,45 @@ def download_users(api: API, start_point: str, n: float = math.inf, :return: None """ + # Set of all the downloaded users' screen names + downloaded = set() + + # The set of starting users that are queried. + done_set = set() + + # The set of starting users currently looping through + current_set = {start_point} + + # The next set of starting users + next_set = set() + + # Start download + download_users_resume(api, n, base_dir, rate_limit, downloaded, done_set, current_set, next_set) + + +def download_users_resume(api: API, n: float, base_dir: str, rate_limit: int, + downloaded: set[str], done_set: set[str], + current_set: set[str], next_set: set[str]) -> None: + """ + Resume download from the given parameters. The download method is defined in the document for + the download_users function. + + Resume functionality is necessary because twitter limits the rate of get friends list to 15 + requests in a 15-minute window, which is 1 request per minute, so it will take a long time to + gather enough data, so we don't want to have to start over from the beginning once something + goes wrong. + + :param api: Tweepy's API object + :param n: How many users do you want to download? + :param base_dir: The downloads folder + :param rate_limit: The maximum number of requests per minute + :param downloaded: Set of all the downloaded users' screen names + :param done_set: The set of starting users that are queried + :param current_set: The set of starting users currently looping through + :param next_set: The next set of starting users + :return: None + """ + # Ensure that basedir doesn't ends with / if base_dir == '': base_dir = '.' @@ -166,18 +205,6 @@ def download_users(api: API, start_point: str, n: float = math.inf, # Rate limit delay rate_delay = 1 / rate_limit * 60 + 0.1 - # Set of all the downloaded users' screen names - downloaded = set() - - # The set of starting users that are queried. - done_set = set() - - # The set of starting users currently looping through - current_set = {start_point} - - # The next set of starting users - next_set = set() - # Loop until there are enough users while len(downloaded) < n: # Take a screen name from the current list