[O] Make rate limit static

This commit is contained in:
Hykilpikonna
2021-11-22 11:51:28 -05:00
parent a95374302a
commit 4099b2d4fd
3 changed files with 36 additions and 17 deletions
+9 -2
View File
@@ -21,13 +21,20 @@ if __name__ == '__main__':
#####################
# Data processing - Step P1
# (After step C1) Process the downloaded twitter users by popularity
users = process_users_popularity()
# users = process_users_popularity()
#####################
# Data collection - Step C2
# (After step P1) Load the downloaded twitter users by popularity, and start downloading as many
# tweets from these users as possible.
users = load_users_popularity()
# Just curious, who are the 20 most popular individuals on twitter?
print(tabulate(((u.username, u.popularity) for u in users[:20]), headers=['Name', 'Followers']))
# Start download
#####################
# Data collection - Step C2
# Download as many posts of the most popular individuals as possible.
+17 -15
View File
@@ -15,7 +15,8 @@ import tweepy
from tweepy import API, TooManyRequests, User
from process.twitter_process import Posting
from utils import Config, debug, json_stringify, load_config, normalize_directory
from utils import Config, debug, json_stringify, load_config, normalize_directory, \
calculate_rate_delay
@dataclass
@@ -59,14 +60,18 @@ def tweepy_login(conf: Config) -> tweepy.API:
return api
def download_user_tweets(api: API, screen_name: str) -> None:
def download_user_tweets(api: API, screen_name: str,
base_dir: str = './data/twitter/user-tweets/') -> None:
"""
Download all tweets from a specific individual to a local folder
:param api: Tweepy API object
:param screen_name: Screen name of that individual
:param base_dir: The downloads folder (Default: "./data/twitter/user-tweets/")
:return: None
"""
base_dir = normalize_directory(base_dir)
debug(f'Getting user tweets for {screen_name}')
# Get initial 200 tweets
@@ -101,8 +106,7 @@ def download_user_tweets(api: API, screen_name: str) -> None:
def download_users_start(api: API, start_point: str, n: float = math.inf,
base_dir: str = './data/twitter/user/',
rate_limit: int = 1) -> None:
base_dir: str = './data/twitter/user/') -> None:
"""
This function downloads n twitter users by using a friends-chain.
@@ -141,7 +145,6 @@ def download_users_start(api: API, start_point: str, n: float = math.inf,
:param start_point: Starting user's screen name.
:param n: How many users do you want to download? (Default: math.inf)
:param base_dir: The downloads folder (Default: "./data/twitter/user/")
:param rate_limit: The maximum number of requests per minute. (Default: 1)
:return: None
"""
@@ -158,7 +161,7 @@ def download_users_start(api: API, start_point: str, n: float = math.inf,
next_set = set()
# Start download
download_users_execute(api, n, base_dir, rate_limit, downloaded,
download_users_execute(api, n, base_dir, downloaded,
done_set, current_set, next_set)
@@ -175,12 +178,12 @@ def download_users_resume_progress(api: API, base_dir: str = './data/twitter/use
meta = json.load(f)
# Resume
download_users_execute(api, meta['n'], base_dir, meta['rate_limit'],
download_users_execute(api, meta['n'], base_dir,
set(meta['downloaded']), set(meta['done_set']),
set(meta['current_set']), set(meta['next_set']))
def download_users_execute(api: API, n: float, base_dir: str, rate_limit: int,
def download_users_execute(api: API, n: float, base_dir: str,
downloaded: set[str], done_set: set[str],
current_set: set[str], next_set: set[str]) -> None:
"""
@@ -195,7 +198,6 @@ def download_users_execute(api: API, n: float, base_dir: str, rate_limit: int,
:param api: Tweepy's API object
:param n: How many users do you want to download?
:param base_dir: The downloads folder
:param rate_limit: The maximum number of requests per minute
:param downloaded: Set of all the downloaded users' screen names
:param done_set: The set of starting users that are queried
:param current_set: The set of starting users currently looping through
@@ -208,15 +210,16 @@ def download_users_execute(api: API, n: float, base_dir: str, rate_limit: int,
Path(f'{base_dir}/users').mkdir(parents=True, exist_ok=True)
Path(f'{base_dir}/meta').mkdir(parents=True, exist_ok=True)
# Rate limit delay
rate_delay = 1 / rate_limit * 60 + 1
# Rate limit for this API endpoint is 1 request per minute, and rate delay defines how many
# seconds to sleep for each request.
rate_delay = calculate_rate_delay(1)
print("Executing friends-chain download:")
print(f"- n: {n}")
print(f"- Requests per minute: {rate_limit}")
print(f"- Requests per minute: 1")
print(f"- Directory: {base_dir}")
print(f"- Downloaded: {len(downloaded)}")
print(f"- Current search set: {current_set}")
print(f"- Current search set: {len(current_set)}")
print(f"- Next search set: {len(next_set)}")
print()
@@ -281,8 +284,7 @@ def download_users_execute(api: API, n: float, base_dir: str, rate_limit: int,
# Update meta info so that downloading can be continued
with open(f'{base_dir}/meta/meta.json', 'w', encoding='utf-8') as f:
meta = {'downloaded': downloaded, 'done_set': done_set,
'current_set': current_set, 'next_set': next_set,
'n': n, 'rate_limit': rate_limit}
'current_set': current_set, 'next_set': next_set, 'n': n}
f.write(json_stringify(meta, indent=None))
debug(f'Finished saving friends of {screen_name}')
+10
View File
@@ -80,6 +80,16 @@ def normalize_directory(directory: str) -> str:
return directory
def calculate_rate_delay(rate_limit: float) -> float:
"""
Calculate the rate delay for each request given rate limit in request per minute
:param rate_limit: Rate limit in requests per minute
:return: Rate delay in seconds per request (added one second just to be safe)
"""
return 1 / rate_limit * 60 + 1
class EnhancedJSONEncoder(json.JSONEncoder):
def default(self, o):