[O] Check file exists when generating sample
This commit is contained in:
+14
-14
@@ -30,30 +30,30 @@ if __name__ == '__main__':
|
||||
# criteria as our sample.
|
||||
select_user_sample()
|
||||
|
||||
# Just curious, who are the 20 most popular individuals on twitter?
|
||||
# print(tabulate(((u.username, u.popularity) for u in load_user_sample().most_popular[:20]),
|
||||
# headers=['Name', 'Followers']))
|
||||
|
||||
#####################
|
||||
# Data collection - Step C2.1
|
||||
# (After step P2) Load the downloaded twitter users by popularity, and start downloading all
|
||||
# tweets from 500 of the most popular users.
|
||||
# sample = load_user_sample()
|
||||
|
||||
# Just curious, who are the 20 most popular individuals on twitter?
|
||||
# print(tabulate(((u.username, u.popularity) for u in sample.most_popular[:20]),
|
||||
# headers=['Name', 'Followers']))
|
||||
|
||||
# Start download
|
||||
# for u in sample.most_popular:
|
||||
# tweets from 500 of the most popular users. Takes around 2 hours.
|
||||
# for u in load_user_sample().most_popular:
|
||||
# download_all_tweets(api, u.username)
|
||||
|
||||
#####################
|
||||
# Data collection - Step C2.2
|
||||
# (After step P2) Download all tweets from the 500 randomly selected users
|
||||
for u in load_user_sample().random:
|
||||
download_all_tweets(api, u.username)
|
||||
# (After step P2) Download all tweets from the 500 randomly selected users. Takes around 2 hours.
|
||||
# for u in load_user_sample().random:
|
||||
# download_all_tweets(api, u.username)
|
||||
|
||||
for u in os.listdir('./data/twitter/user-tweets/user'):
|
||||
|
||||
|
||||
#####################
|
||||
# Data processing - Step P2
|
||||
# Data processing - Step P3
|
||||
# (After step C2) Process the downloaded tweets and determine whether they are covid-related
|
||||
process_tweets()
|
||||
# process_tweets()
|
||||
|
||||
# Who posted the most covid tweets? (covid vs non-covid ratio)
|
||||
# - Graph histogram of this ratio
|
||||
|
||||
@@ -116,6 +116,13 @@ def select_user_sample(user_dir: str = './data/twitter/user/') -> None:
|
||||
:return: None
|
||||
"""
|
||||
user_dir = normalize_directory(user_dir)
|
||||
file = f'{user_dir}/processed/sample.json'
|
||||
|
||||
# Skip generation if a sample file already exists
|
||||
if os.path.isfile(file):
|
||||
debug(f'There is already a sample generated at {file}. If you want to reselect the'
|
||||
f'sample, please delete the existing sample file.')
|
||||
return
|
||||
|
||||
# Load users
|
||||
users = load_users(user_dir)
|
||||
@@ -136,8 +143,7 @@ def select_user_sample(user_dir: str = './data/twitter/user/') -> None:
|
||||
sample = random.sample(filtered, 500)
|
||||
|
||||
# Save
|
||||
write(f'{user_dir}/processed/sample.json',
|
||||
json_stringify(Sample(most_popular, sample)))
|
||||
write(file, json_stringify(Sample(most_popular, sample)))
|
||||
|
||||
|
||||
def load_user_sample(user_dir: str = './data/twitter/user/') -> Sample:
|
||||
|
||||
Reference in New Issue
Block a user