From 1d5b38d45f4b6c9cf7449274afe616bd89ef33bb Mon Sep 17 00:00:00 2001
From: Hykilpikonna <me@hydev.org>
Date: Tue, 23 Nov 2021 19:33:26 -0500
Subject: [PATCH] [O] Check file exists when generating sample

---
 src/main.py                    | 28 ++++++++++++++--------------
 src/process/twitter_process.py | 10 ++++++++--
 2 files changed, 22 insertions(+), 16 deletions(-)

diff --git a/src/main.py b/src/main.py
index e9f9b04..1ffed15 100644
--- a/src/main.py
+++ b/src/main.py
@@ -30,30 +30,30 @@ if __name__ == '__main__':
     # criteria as our sample.
     select_user_sample()
 
+    # Just curious, who are the 20 most popular individuals on twitter?
+    # print(tabulate(((u.username, u.popularity) for u in load_user_sample().most_popular[:20]),
+    #                headers=['Name', 'Followers']))
+
     #####################
     # Data collection - Step C2.1
     # (After step P2) Load the downloaded twitter users by popularity, and start downloading all
-    # tweets from 500 of the most popular users.
-    # sample = load_user_sample()
-
-    # Just curious, who are the 20 most popular individuals on twitter?
-    # print(tabulate(((u.username, u.popularity) for u in sample.most_popular[:20]),
-    #                headers=['Name', 'Followers']))
-
-    # Start download
-    # for u in sample.most_popular:
+    # tweets from 500 of the most popular users. Takes around 2 hours.
+    # for u in load_user_sample().most_popular:
     #     download_all_tweets(api, u.username)
 
     #####################
     # Data collection - Step C2.2
-    # (After step P2) Download all tweets from the 500 randomly selected users
-    for u in load_user_sample().random:
-        download_all_tweets(api, u.username)
+    # (After step P2) Download all tweets from the 500 randomly selected users. Takes around 2 hours.
+    # for u in load_user_sample().random:
+    #     download_all_tweets(api, u.username)
+
+    for u in os.listdir('./data/twitter/user-tweets/user'):
+
 
     #####################
-    # Data processing - Step P2
+    # Data processing - Step P3
     # (After step C2) Process the downloaded tweets, determine whether they are covid-related
-    process_tweets()
+    # process_tweets()
 
     # Who posted the most covid tweets? (covid vs non-covid ratio)
     # - Graph histogram of this ratio
diff --git a/src/process/twitter_process.py b/src/process/twitter_process.py
index 14acf6a..6777c73 100644
--- a/src/process/twitter_process.py
+++ b/src/process/twitter_process.py
@@ -116,6 +116,13 @@ def select_user_sample(user_dir: str = './data/twitter/user/') -> None:
     :return: None
     """
     user_dir = normalize_directory(user_dir)
+    file = f'{user_dir}/processed/sample.json'
+
+    # Skip regeneration if a sample file already exists
+    if os.path.isfile(file):
+        debug(f'There is already a sample generated at {file}. If you want to reselect the'
+              f'sample, please delete the existing sample file.')
+        return
 
     # Load users
     users = load_users(user_dir)
@@ -136,8 +143,7 @@ def select_user_sample(user_dir: str = './data/twitter/user/') -> None:
     sample = random.sample(filtered, 500)
 
     # Save
-    write(f'{user_dir}/processed/sample.json',
-          json_stringify(Sample(most_popular, sample)))
+    write(file, json_stringify(Sample(most_popular, sample)))
 
 
 def load_user_sample(user_dir: str = './data/twitter/user/') -> Sample: