From f3760f6c6bc44c856b9c820af539e33197a633e5 Mon Sep 17 00:00:00 2001 From: Hykilpikonna Date: Sun, 21 Nov 2021 22:13:16 -0500 Subject: [PATCH] [+] Create download_users function signature --- src/raw_collect/twitter.py | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/src/raw_collect/twitter.py b/src/raw_collect/twitter.py index 1da532e..0598d37 100644 --- a/src/raw_collect/twitter.py +++ b/src/raw_collect/twitter.py @@ -1,4 +1,5 @@ import json +import math from dataclasses import dataclass from datetime import datetime from pathlib import Path @@ -6,10 +7,8 @@ from typing import Union import pytz import tweepy -from tweepy import API -from tweepy.models import Status - from collect.utils import Config, debug, Posting, json_stringify, load_config +from tweepy import API @dataclass @@ -85,20 +84,23 @@ def download_user_tweets(api: API, screen_name: str) -> None: start_date = pytz.UTC.localize(datetime(2020, 1, 1)) # Get initial 200 tweets - tweets = api.user_timeline(screen_name=screen_name, count=200, tweet_mode='extended', trim_user=True) + tweets = api.user_timeline(screen_name=screen_name, count=200, tweet_mode='extended', + trim_user=True) postings = [convert_to_generic(screen_name, t) for t in tweets] # Get additional tweets while True: debug(f'- Got {len(tweets)} tweets, getting additional tweets...') - additional_tweets = api.user_timeline(screen_name=screen_name, count=200, tweet_mode='extended', trim_user=True, - max_id=int(tweets[-1].id_str) - 1) + additional_tweets = api.user_timeline(screen_name=screen_name, count=200, + tweet_mode='extended', trim_user=True, + max_id=int(tweets[-1].id_str) - 1) if len(additional_tweets) == 0: debug(f'- Got {len(tweets)} tweets, finished because no more tweets are available.') break if additional_tweets[-1].created_at < start_date: - debug(f'- Got {len(tweets)} tweets, finished because the earliest tweet in the dataset goes before 2020-01-01.') + debug( + f'- Got {len(tweets)} tweets, finished because the earliest tweet in the dataset goes before 2020-01-01.') break tweets.extend(additional_tweets) @@ -117,15 +119,29 @@ def download_user_tweets(api: API, screen_name: str) -> None: f.write(json_stringify(postings)) -def download_followings_chain(start_point: str, n: int): +def download_users(start_point: str, n: float = math.inf, rate_limit: int = 10) -> None: """ This function downloads n twitter users by using a followings-chain. + Since there isn't an API or a database with all twitter users, we can't obtain a strict list + of all twitter users, nor can we obtain a list of strictly random or most popular twitter + users. Therefore, we use the method of follows chaining: we start from a specific individual, + obtain their followers, and pick 6 random individuals from the followings list. Then, we repeat + the process for the selected followings: we pick 6 random followings of the 6 random followings + that we picked. + In reality, this method will be biased toward individuals that are worthy of following since we + are picking random followings. - :param start_point: - :param n: How many users do you want to download? - :return: + We will download all user data to /data/twitter/user/.json + + Then, we can obtain a list of all users we have downloaded just by obtaining a list of all + files under this directory. + + :param start_point: Starting user's screen name. + :param n: How many users do you want to download? (Set to infinity if you want all the data) + :param rate_limit: The maximum number of requests per minute. + :return: None """