diff --git a/collect/twitter.py b/collect/twitter.py index aa0b4c1..b6ff099 100644 --- a/collect/twitter.py +++ b/collect/twitter.py @@ -1,9 +1,13 @@ +import json from dataclasses import dataclass from datetime import datetime +from pathlib import Path from typing import Union +import pytz import tweepy from tweepy import API +from tweepy.models import Status from collect.utils import Config, debug @@ -68,23 +72,43 @@ def tweepy_login(conf: Config) -> tweepy.API: return api -def get_user_tweets(api: API, screen_name: str) -> list[Tweet]: +def download_user_tweets(api: API, screen_name: str) -> None: """ - Get all tweets from a specific individual + Download all tweets from a specific individual to a local folder :param api: Tweepy API object :param screen_name: Screen name of that individual - :return: All tweets + :return: None """ debug(f'Getting user tweets for {screen_name}') + start_date = pytz.UTC.localize(datetime(2020, 1, 1)) - tweets: list[Tweet] = api.user_timeline(screen_name=screen_name, count=200, tweet_mode='extended') + # Get initial 200 tweets + tweets: list[Status] = api.user_timeline(screen_name=screen_name, count=200, tweet_mode='extended', trim_user=True) + + # Get additional tweets while True: debug(f'- Got {len(tweets)} tweets, getting additional tweets...') - additional_tweets = api.user_timeline(screen_name=screen_name, count=200, tweet_mode='extended', max_id=int(tweets[-1].id_str) - 1) + additional_tweets: list[Status] = api.user_timeline(screen_name=screen_name, count=200, tweet_mode='extended', trim_user=True, + max_id=int(tweets[-1].id_str) - 1) if len(additional_tweets) == 0: + debug(f'- Got {len(tweets)} tweets, finished because no more tweets are available.') break + + if additional_tweets[-1].created_at < start_date: + debug(f'- Got {len(tweets)} tweets, finished because the earliest tweet in the dataset goes before 2020-01-01.') + break + tweets.extend(additional_tweets) - debug(f'- Got {len(tweets)} tweets, finished.') - return tweets + + # Make directory + dir = './data/raw_twitter_users/' + Path(dir).mkdir(parents=True, exist_ok=True) + + # Encode json + json_output = json.dumps([t._json for t in tweets], indent=4) + + # Store in file + with open(dir + screen_name + '.json', 'w') as f: + f.write(json_output)