[+] Script to remove tweets not in sample

2021-11-23 19:51:36 -05:00
parent 1d5b38d45f
commit f64dd2d95f
1 changed file with 19 additions and 2 deletions
@@ -28,7 +28,7 @@ if __name__ == '__main__':
    # Data processing - Step P2
    # (After step P1) Select 500 most popular users and 500 random users who meet a particular
    # criteria as our sample.
-    select_user_sample()
+    # select_user_sample()

    # Just curious, who are the 20 most popular individuals on twitter?
    # print(tabulate(((u.username, u.popularity) for u in load_user_sample().most_popular[:20]),
@@ -47,8 +47,25 @@ if __name__ == '__main__':
    # for u in load_user_sample().random:
    #     download_all_tweets(api, u.username)

-    for u in os.listdir('./data/twitter/user-tweets/user'):
+    sample = load_user_sample()
+    names = {v.username for v in sample.random}
+    names = names.union({v.username for v in sample.most_popular})

+    remove = set()
+    for file in os.listdir('./data/twitter/user-tweets/user'):
+        u = file.replace('.json', '')
+        if all(p.username != u for p in sample.most_popular) and all(p.username != u for p in sample.random):
+            remove.add(u)
+
+    print(len(remove))
+    print(len(os.listdir('./data/twitter/user-tweets/user')))
+
+    for file in remove:
+        os.remove(f'./data/twitter/user-tweets/user/{file}.json')
+        os.remove(f'./data/twitter/user-tweets/processed/{file}.json')
+
+    # print(len(sample.pop))
+    # print(json_stringify(sample.random))

    #####################
    # Data processing - Step P3