60 lines
2.3 KiB
Python
60 lines
2.3 KiB
Python
from tabulate import tabulate
|
|
|
|
from process.twitter_process import *
|
|
from raw_collect.twitter import *
|
|
from utils import *
|
|
|
|
if __name__ == '__main__':
    # Step-by-step driver for the data pipeline. Only the configuration loading
    # and API login below are active; every pipeline step is left commented out.
    # The step descriptions state their prerequisites ("After step ..."), so
    # uncomment one step at a time and run them in that order.

    # Load config and create API
    conf = load_config('config.json5')
    api = tweepy_login(conf)

    #####################
    # Data collection - Step C1
    # Download a wide range of users from Twitter using follow-chaining,
    # starting from a single user.
    # download_users_start(api, 'voxdotcom')

    # This task will run for a very long time to obtain a large dataset of
    # Twitter users. If you stop the process, you can resume it later using the
    # following line:
    # download_users_resume_progress(api)

    #####################
    # Data processing - Step P1
    # (After step C1) Process the downloaded Twitter users; extract screen name,
    # popularity, and number-of-tweets data.
    # process_users()

    #####################
    # Data processing - Step P2
    # (After step P1) Select the 500 most popular users and 500 random users who
    # meet particular criteria as our sample.
    # select_user_sample()

    # Just curious, who are the 20 most popular individuals on Twitter?
    # print(tabulate(((u.username, u.popularity) for u in load_user_sample().most_popular[:20]),
    #                headers=['Name', 'Followers']))

    #####################
    # Data collection - Step C2.1
    # (After step P2) Load the downloaded Twitter users by popularity, and start
    # downloading all tweets from 500 of the most popular users. Takes around
    # 2 hours.
    # for u in load_user_sample().most_popular:
    #     download_all_tweets(api, u.username)

    #####################
    # Data collection - Step C2.2
    # (After step P2) Download all tweets from the 500 randomly selected users.
    # Takes around 2 hours.
    # for u in load_user_sample().random:
    #     download_all_tweets(api, u.username)

    #####################
    # Data processing - Step P3
    # (After step C2) Process the downloaded tweets and determine whether they
    # are covid-related.
    # process_tweets()

    # Who posted the most covid tweets? (covid vs non-covid ratio)
    #   - Graph histogram of this ratio
    # Who has the most covid tweet popularity? (popularity of covid vs non-covid
    # tweets ratio)
    #   - Graph histogram of this ratio