[O] Check file exists when generating sample

2021-11-23 19:33:26 -05:00
parent e6dd8a17a5
commit 1d5b38d45f
2 changed files with 22 additions and 16 deletions
@@ -30,30 +30,30 @@ if __name__ == '__main__':
    # criteria as our sample.
    select_user_sample()

+    # Just curious, who are the 20 most popular individuals on twitter?
+    # print(tabulate(((u.username, u.popularity) for u in load_user_sample().most_popular[:20]),
+    #                headers=['Name', 'Followers']))
+
    #####################
    # Data collection - Step C2.1
    # (After step P2) Load the downloaded twitter users by popularity, and start downloading all
-    # tweets from 500 of the most popular users.
-    # sample = load_user_sample()
-
-    # Just curious, who are the 20 most popular individuals on twitter?
-    # print(tabulate(((u.username, u.popularity) for u in sample.most_popular[:20]),
-    #                headers=['Name', 'Followers']))
-
-    # Start download
-    # for u in sample.most_popular:
+    # tweets from 500 of the most popular users. Takes around 2 hours.
+    # for u in load_user_sample().most_popular:
    #     download_all_tweets(api, u.username)

    #####################
    # Data collection - Step C2.2
-    # (After step P2) Download all tweets from the 500 randomly selected users
-    for u in load_user_sample().random:
-        download_all_tweets(api, u.username)
+    # (After step P2) Download all tweets from the 500 randomly selected users. Takes around 2 hours.
+    # for u in load_user_sample().random:
+    #     download_all_tweets(api, u.username)
+
+    for u in os.listdir('./data/twitter/user-tweets/user'):
+

    #####################
-    # Data processing - Step P2
+    # Data processing - Step P3
    # (After step C2) Process the downloaded tweets, determine whether they are covid-related
-    process_tweets()
+    # process_tweets()

    # Who posted the most covid tweets? (covid vs non-covid ratio)
    # - Graph histogram of this ratio
@@ -116,6 +116,13 @@ def select_user_sample(user_dir: str = './data/twitter/user/') -> None:
    :return: None
    """
    user_dir = normalize_directory(user_dir)
+    file = f'{user_dir}/processed/sample.json'
+
+    # Skip generation if a sample file already exists
+    if os.path.isfile(file):
+        debug(f'There is already a sample generated at {file}. If you want to reselect the'
+              f'sample, please delete the existing sample file.')
+        return

    # Load users
    users = load_users(user_dir)
@@ -136,8 +143,7 @@ def select_user_sample(user_dir: str = './data/twitter/user/') -> None:
    sample = random.sample(filtered, 500)

    # Save
-    write(f'{user_dir}/processed/sample.json',
-          json_stringify(Sample(most_popular, sample)))
+    write(file, json_stringify(Sample(most_popular, sample)))


 def load_user_sample(user_dir: str = './data/twitter/user/') -> Sample: