diff --git a/src/main.py b/src/main.py
index 1ffed15..ba308b5 100644
--- a/src/main.py
+++ b/src/main.py
@@ -28,7 +28,7 @@ if __name__ == '__main__':
     # Data processing - Step P2
     # (After step P1) Select 500 most popular users and 500 random users who meet a particular
     # criteria as our sample.
-    select_user_sample()
+    # select_user_sample()
 
     # Just curious, who are the 20 most popular individuals on twitter?
     # print(tabulate(((u.username, u.popularity) for u in load_user_sample().most_popular[:20]),
@@ -47,8 +47,34 @@ if __name__ == '__main__':
     # for u in load_user_sample().random:
     #     download_all_tweets(api, u.username)
 
-    for u in os.listdir('./data/twitter/user-tweets/user'):
+    # Clean-up: delete the tweet files of every user that is not in the current sample.
+    sample = load_user_sample()
+    # Single set of all sampled usernames, so the check below is one set difference.
+    names = {v.username for v in sample.random}
+    names = names.union({v.username for v in sample.most_popular})
+
+    user_dir = './data/twitter/user-tweets/user'
+    processed_dir = './data/twitter/user-tweets/processed'
+    files = os.listdir(user_dir)
+
+    # Cut off only the trailing '.json' (str.replace would also drop one inside a name)
+    # and skip entries that are not .json files.
+    remove = {f[:-len('.json')] for f in files if f.endswith('.json')} - names
+
+    print(len(remove))
+    print(len(files))
+
+    for username in remove:
+        os.remove(f'{user_dir}/{username}.json')
+        # A missing processed file must not abort the clean-up half-way through.
+        try:
+            os.remove(f'{processed_dir}/{username}.json')
+        except FileNotFoundError:
+            pass
+
+    # print(len(sample.pop))
+    # print(json_stringify(sample.random))
 
 
     #####################
     # Data processing - Step P3