From 1d5b38d45f4b6c9cf7449274afe616bd89ef33bb Mon Sep 17 00:00:00 2001 From: Hykilpikonna Date: Tue, 23 Nov 2021 19:33:26 -0500 Subject: [PATCH] [O] Check file exists when generating sample --- src/main.py | 28 ++++++++++++++-------------- src/process/twitter_process.py | 10 ++++++++-- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/src/main.py b/src/main.py index e9f9b04..1ffed15 100644 --- a/src/main.py +++ b/src/main.py @@ -30,30 +30,30 @@ if __name__ == '__main__': # criteria as our sample. select_user_sample() + # Just curious, who are the 20 most popular individuals on twitter? + # print(tabulate(((u.username, u.popularity) for u in load_user_sample().most_popular[:20]), + # headers=['Name', 'Followers'])) + ##################### # Data collection - Step C2.1 # (After step P2) Load the downloaded twitter users by popularity, and start downloading all - # tweets from 500 of the most popular users. - # sample = load_user_sample() - - # Just curious, who are the 20 most popular individuals on twitter? - # print(tabulate(((u.username, u.popularity) for u in sample.most_popular[:20]), - # headers=['Name', 'Followers'])) - - # Start download - # for u in sample.most_popular: + # tweets from 500 of the most popular users. Takes around 2 hours. 
+ # for u in load_user_sample().most_popular: # download_all_tweets(api, u.username) ##################### # Data collection - Step C2.2 - # (After step P2) Download all tweets from the 500 randomly selected users - for u in load_user_sample().random: - download_all_tweets(api, u.username) + # (After step P2) Download all tweets from the 500 randomly selected users, takes around 2 hours + # for u in load_user_sample().random: + # download_all_tweets(api, u.username) + + for u in os.listdir('./data/twitter/user-tweets/user'): + + ##################### - # Data processing - Step P2 + # Data processing - Step P3 # (After step C2) Process the downloaded tweets, determine whether they are covid-related - process_tweets() + # process_tweets() # Who posted the most covid tweets? (covid vs non-covid ratio) # - Graph histogram of this ratio diff --git a/src/process/twitter_process.py b/src/process/twitter_process.py index 14acf6a..6777c73 100644 --- a/src/process/twitter_process.py +++ b/src/process/twitter_process.py @@ -116,6 +116,13 @@ def select_user_sample(user_dir: str = './data/twitter/user/') -> None: :return: None """ user_dir = normalize_directory(user_dir) + file = f'{user_dir}/processed/sample.json' + + # Exists + if os.path.isfile(file): + debug(f'There is already a sample generated at {file}. If you want to reselect the ' + f'sample, please delete the existing sample file.') + return # Load users users = load_users(user_dir) @@ -136,8 +143,7 @@ def select_user_sample(user_dir: str = './data/twitter/user/') -> None: sample = random.sample(filtered, 500) # Save - write(f'{user_dir}/processed/sample.json', - json_stringify(Sample(most_popular, sample))) + write(file, json_stringify(Sample(most_popular, sample))) def load_user_sample(user_dir: str = './data/twitter/user/') -> Sample: