From 181a89688b2ca4ec4f712a24adff55d38d744217 Mon Sep 17 00:00:00 2001 From: Hykilpikonna Date: Sun, 21 Nov 2021 22:52:33 -0500 Subject: [PATCH] [+] Write meta info --- src/raw_collect/twitter.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/src/raw_collect/twitter.py b/src/raw_collect/twitter.py index 3f064c4..635bc29 100644 --- a/src/raw_collect/twitter.py +++ b/src/raw_collect/twitter.py @@ -1,3 +1,4 @@ +import io import json import math import random @@ -121,7 +122,7 @@ def download_user_tweets(api: API, screen_name: str) -> None: def download_users(start_point: str, n: float = math.inf, - base_dir: str = '/data/twitter/user/', + base_dir: str = './data/twitter/user/', rate_limit: int = 10) -> None: """ This function downloads n twitter users by using a friends-chain. @@ -136,23 +137,25 @@ def download_users(start_point: str, n: float = math.inf, In reality, this method will be biased toward individuals that are worthy of following since "friends" are the list of users that someone followed. - We will download all user data to /data/twitter/user/.json + We will download all user data to ./data/twitter/user/users/.json + + We will save meta info to ./data/twitter/user/meta/ Then, we can obtain a list of all users we have downloaded just by obtaining a list of all files under this directory. :param start_point: Starting user's screen name. :param n: How many users do you want to download? (Default: math.inf) - :param base_dir: The downloads folder (Default: "/data/twitter/user/") + :param base_dir: The downloads folder (Default: "./data/twitter/user/") :param rate_limit: The maximum number of requests per minute. (Default: 10) :return: None """ - # Ensure that basedir ends with / + # Ensure that basedir doesn't end with / if base_dir == '': base_dir = '.' 
- if not base_dir.endswith('/'): - base_dir += '/' + if base_dir.endswith('/'): + base_dir = base_dir[:-1] # Set of all the downloaded users' screen names downloaded = set() @@ -179,7 +182,7 @@ def download_users(start_point: str, n: float = math.inf, # This user was not saved, save the user. if user not in downloaded: # Save user json - with open(base_dir + user.screen_name + '.json', 'w') as f: + with open(f'{base_dir}/users/{user.screen_name}.json', 'w', encoding='utf-8') as f: f.write(json.dumps(user._json)) # Add to set @@ -214,6 +217,12 @@ def download_users(start_point: str, n: float = math.inf, current_set = next_set next_set = set() + # Update meta info so that downloading can be continued + with open(f'{base_dir}/meta/meta.json', 'w', encoding='utf-8') as f: + meta = {downloaded: downloaded, done_set: done_set, + current_set: current_set, next_set: next_set} + f.write(json.dumps(meta)) + def convert_to_generic(username: str, tweet: Tweet) -> Posting: """