[O] Check file exists when generating sample

This commit is contained in:
Hykilpikonna
2021-11-23 19:33:26 -05:00
parent e6dd8a17a5
commit 1d5b38d45f
2 changed files with 22 additions and 16 deletions
+14 -14
View File
@@ -30,30 +30,30 @@ if __name__ == '__main__':
# criteria as our sample.
select_user_sample()
# Just curious, who are the 20 most popular individuals on twitter?
# print(tabulate(((u.username, u.popularity) for u in load_user_sample().most_popular[:20]),
# headers=['Name', 'Followers']))
#####################
# Data collection - Step C2.1
# (After step P2) Load the downloaded twitter users by popularity, and start downloading all
# tweets from 500 of the most popular users.
# sample = load_user_sample()
# Just curious, who are the 20 most popular individuals on twitter?
# print(tabulate(((u.username, u.popularity) for u in sample.most_popular[:20]),
# headers=['Name', 'Followers']))
# Start download
# for u in sample.most_popular:
# tweets from 500 of the most popular users. Takes around 2 hours.
# for u in load_user_sample().most_popular:
# download_all_tweets(api, u.username)
#####################
# Data collection - Step C2.2
# (After step P2) Download all tweets from the 500 randomly selected users
for u in load_user_sample().random:
download_all_tweets(api, u.username)
# (After step P2) Download all tweets from the 500 randomly selected users. Takes around 2 hours.
# for u in load_user_sample().random:
# download_all_tweets(api, u.username)
for u in os.listdir('./data/twitter/user-tweets/user'):
#####################
# Data processing - Step P2
# Data processing - Step P3
# (After step C2) Process the downloaded tweets and determine whether they are covid-related
process_tweets()
# process_tweets()
# Who posted the most covid tweets? (covid vs non-covid ratio)
# - Graph histogram of this ratio
+8 -2
View File
@@ -116,6 +116,13 @@ def select_user_sample(user_dir: str = './data/twitter/user/') -> None:
:return: None
"""
user_dir = normalize_directory(user_dir)
file = f'{user_dir}/processed/sample.json'
# Exists
if os.path.isfile(file):
debug(f'There is already a sample generated at {file}. If you want to reselect the '
f'sample, please delete the existing sample file.')
return
# Load users
users = load_users(user_dir)
@@ -136,8 +143,7 @@ def select_user_sample(user_dir: str = './data/twitter/user/') -> None:
sample = random.sample(filtered, 500)
# Save
write(f'{user_dir}/processed/sample.json',
json_stringify(Sample(most_popular, sample)))
write(file, json_stringify(Sample(most_popular, sample)))
def load_user_sample(user_dir: str = './data/twitter/user/') -> Sample: