[+] Script to remove tweets not in sample

This commit is contained in:
Hykilpikonna
2021-11-23 19:51:36 -05:00
parent 1d5b38d45f
commit f64dd2d95f
+19 -2
View File
@@ -28,7 +28,7 @@ if __name__ == '__main__':
# Data processing - Step P2
# (After step P1) Select 500 most popular users and 500 random users who meet particular
# criteria as our sample.
select_user_sample()
# select_user_sample()
# Just curious, who are the 20 most popular individuals on twitter?
# print(tabulate(((u.username, u.popularity) for u in load_user_sample().most_popular[:20]),
@@ -47,8 +47,25 @@ if __name__ == '__main__':
# for u in load_user_sample().random:
# download_all_tweets(api, u.username)
for u in os.listdir('./data/twitter/user-tweets/user'):
# Prune downloaded tweet files belonging to users that are not in the selected sample.
sample = load_user_sample()
# Usernames to keep: everyone in either the random or the most-popular sample.
names = {v.username for v in sample.random}
names = names.union({v.username for v in sample.most_popular})

user_dir = './data/twitter/user-tweets/user'
processed_dir = './data/twitter/user-tweets/processed'
suffix = '.json'

# List the directory once; the same listing is used for the candidates and the total count.
files = os.listdir(user_dir)
# Only "<username>.json" entries are candidates. Strip just the trailing extension
# (not every occurrence of the substring) and use set difference for O(1) membership
# checks instead of scanning both sample lists for every file.
remove = {f[:-len(suffix)] for f in files if f.endswith(suffix)} - names

# Report how many users will be dropped versus how many files are currently present.
print(len(remove))
print(len(files))

for u in remove:
    os.remove(f'{user_dir}/{u}{suffix}')
    # A user may have been downloaded but never processed; a missing processed file
    # must not abort the clean-up of the remaining users.
    try:
        os.remove(f'{processed_dir}/{u}{suffix}')
    except FileNotFoundError:
        pass
# print(len(sample.pop))
# print(json_stringify(sample.random))
#####################
# Data processing - Step P3