[+] Download tweets function
This commit is contained in:
+31
-7
@@ -1,9 +1,13 @@
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Union
|
||||
|
||||
import pytz
|
||||
import tweepy
|
||||
from tweepy import API
|
||||
from tweepy.models import Status
|
||||
|
||||
from collect.utils import Config, debug
|
||||
|
||||
@@ -68,23 +72,43 @@ def tweepy_login(conf: Config) -> tweepy.API:
|
||||
return api
|
||||
|
||||
|
||||
def get_user_tweets(api: API, screen_name: str) -> list[Tweet]:
|
||||
def download_user_tweets(api: API, screen_name: str) -> None:
|
||||
"""
|
||||
Get all tweets from a specific individual
|
||||
Download all tweets from a specific individual to a local folder
|
||||
|
||||
:param api: Tweepy API object
|
||||
:param screen_name: Screen name of that individual
|
||||
:return: All tweets
|
||||
:return: None
|
||||
"""
|
||||
debug(f'Getting user tweets for {screen_name}')
|
||||
start_date = pytz.UTC.localize(datetime(2020, 1, 1))
|
||||
|
||||
tweets: list[Tweet] = api.user_timeline(screen_name=screen_name, count=200, tweet_mode='extended')
|
||||
# Get initial 200 tweets
|
||||
tweets: list[Status] = api.user_timeline(screen_name=screen_name, count=200, tweet_mode='extended', trim_user=True)
|
||||
|
||||
# Get additional tweets
|
||||
while True:
|
||||
debug(f'- Got {len(tweets)} tweets, getting additional tweets...')
|
||||
additional_tweets = api.user_timeline(screen_name=screen_name, count=200, tweet_mode='extended', max_id=int(tweets[-1].id_str) - 1)
|
||||
additional_tweets: list[Status] = api.user_timeline(screen_name=screen_name, count=200, tweet_mode='extended', trim_user=True,
|
||||
max_id=int(tweets[-1].id_str) - 1)
|
||||
if len(additional_tweets) == 0:
|
||||
debug(f'- Got {len(tweets)} tweets, finished because no more tweets are available.')
|
||||
break
|
||||
|
||||
if additional_tweets[-1].created_at < start_date:
|
||||
debug(f'- Got {len(tweets)} tweets, finished because the earliest tweet in the dataset goes before 2020-01-01.')
|
||||
break
|
||||
|
||||
tweets.extend(additional_tweets)
|
||||
|
||||
debug(f'- Got {len(tweets)} tweets, finished.')
|
||||
return tweets
|
||||
|
||||
# Make directory
|
||||
dir = './data/raw_twitter_users/'
|
||||
Path(dir).mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Encode json
|
||||
json_output = json.dumps([t._json for t in tweets], indent=4)
|
||||
|
||||
# Store in file
|
||||
with open(dir + screen_name + '.json', 'w') as f:
|
||||
f.write(json_output)
|
||||
|
||||
Reference in New Issue
Block a user