Files
CSC110-Project/src/main.py
T
Hykilpikonna 4e265bf30d [O] Clean code
2021-11-23 19:51:58 -05:00

60 lines
2.3 KiB
Python

from tabulate import tabulate
from process.twitter_process import *
from raw_collect.twitter import *
from utils import *
if __name__ == '__main__':
    # Script entry point. Only the setup directly below runs as-is; every pipeline step is kept
    # commented out and is enabled by uncommenting its call. The "(After step ...)" notes give
    # the order the steps depend on: C1 -> P1 -> P2 -> C2.1 / C2.2 -> P3.

    # Load config and create API
    conf = load_config('config.json5')
    api = tweepy_login(conf)

    #####################
    # Data collection - Step C1
    # Download a wide range of users from Twitter using follow-chaining starting from a single
    # user.
    # download_users_start(api, 'voxdotcom')

    # This task will run for a very long time to obtain a large dataset of Twitter users. If
    # you stop the process, you can resume it later using the following line:
    # download_users_resume_progress(api)

    #####################
    # Data processing - Step P1
    # (After step C1) Process the downloaded Twitter users, extract screen name, popularity, and
    # number of tweets data.
    # process_users()

    #####################
    # Data processing - Step P2
    # (After step P1) Select the 500 most popular users and 500 random users who meet particular
    # criteria as our sample.
    # select_user_sample()

    # Just curious, who are the 20 most popular individuals on Twitter?
    # print(tabulate(((u.username, u.popularity) for u in load_user_sample().most_popular[:20]),
    #                headers=['Name', 'Followers']))

    #####################
    # Data collection - Step C2.1
    # (After step P2) Load the downloaded Twitter users by popularity, and start downloading all
    # tweets from 500 of the most popular users. Takes around 2 hours.
    # for u in load_user_sample().most_popular:
    #     download_all_tweets(api, u.username)

    #####################
    # Data collection - Step C2.2
    # (After step P2) Download all tweets from the 500 randomly selected users. Takes around
    # 2 hours.
    # for u in load_user_sample().random:
    #     download_all_tweets(api, u.username)

    #####################
    # Data processing - Step P3
    # (After step C2) Process the downloaded tweets, determine whether they are covid-related.
    # process_tweets()

    # Who posted the most covid tweets? (covid vs non-covid ratio)
    # - Graph histogram of this ratio

    # Who has the most covid tweet popularity? (popularity of covid vs non-covid tweets ratio)
    # - Graph histogram of this ratio