diff --git a/src/main.py b/src/main.py index 5edc1f7..d8c8a0e 100644 --- a/src/main.py +++ b/src/main.py @@ -7,9 +7,10 @@ from utils import * # Constants (The instructors said that we can use global constants here: # https://piazza.com/class/ksovzjrlsye72f?cid=1664 # They should not end with "/" -data_dir = './data' -tweets_dir = f'{data_dir}/twitter/user-tweets' -user_dir = f'{data_dir}/twitter/user' +DATA_DIR = './data' +TWEETS_DIR = f'{DATA_DIR}/twitter/user-tweets' +USER_DIR = f'{DATA_DIR}/twitter/user' + if __name__ == '__main__': # Load config and create API diff --git a/src/process/twitter_process.py b/src/process/twitter_process.py index ab3cb5d..c9064de 100644 --- a/src/process/twitter_process.py +++ b/src/process/twitter_process.py @@ -7,7 +7,7 @@ from dataclasses import dataclass from py7zr import SevenZipFile -from main import data_dir, tweets_dir, user_dir +from main import DATA_DIR, TWEETS_DIR, USER_DIR from utils import * @@ -44,11 +44,11 @@ def process_users() -> None: users = [] # Loop through all the files - for filename in os.listdir(f'{user_dir}/users'): + for filename in os.listdir(f'{USER_DIR}/users'): # Only check json files and ignore macos dot files if filename.endswith('.json') and not filename.startswith('.'): # Read - user = json.loads(read(f'{user_dir}/users/{filename}')) + user = json.loads(read(f'{USER_DIR}/users/{filename}')) # Get user language (The problem is, most people's lang field are null, so we have to # look at the language of their latest status as well, while they might not have a @@ -69,7 +69,7 @@ def process_users() -> None: users.sort(key=lambda x: x.popularity, reverse=True) # Save data - write(f'{user_dir}/processed/users.json', json_stringify(users)) + write(f'{USER_DIR}/processed/users.json', json_stringify(users)) def load_users() -> list[ProcessedUser]: @@ -78,7 +78,7 @@ def load_users() -> list[ProcessedUser]: :return: List of processed users, sorted descending by popularity. """ - return [ProcessedUser(*u) for u in json.loads(read(f'{user_dir}/processed/users.json'))] + return [ProcessedUser(*u) for u in json.loads(read(f'{USER_DIR}/processed/users.json'))] def get_user_popularity_ranking(user: str) -> int: @@ -116,7 +116,7 @@ def select_user_sample() -> None: :return: None """ - file = f'{user_dir}/processed/sample.json' + file = f'{USER_DIR}/processed/sample.json' # Exists if os.path.isfile(file): @@ -152,7 +152,7 @@ def load_user_sample() -> Sample: :return: None """ - j = json.loads(read(f'{user_dir}/processed/sample.json')) + j = json.loads(read(f'{USER_DIR}/processed/sample.json')) return Sample([ProcessedUser(*u) for u in j['most_popular']], [ProcessedUser(*u) for u in j['random']]) @@ -185,15 +185,15 @@ def process_tweets() -> None: :return: None """ # Loop through all the files - for filename in os.listdir(f'{tweets_dir}/user'): + for filename in os.listdir(f'{TWEETS_DIR}/user'): # Only check json files and ignore macos dot files if filename.endswith('.json') and not filename.startswith('.'): # Check if already processed - if os.path.isfile(f'{tweets_dir}/processed/{filename}'): + if os.path.isfile(f'{TWEETS_DIR}/processed/{filename}'): continue # Read - tweets = json.loads(read(f'{tweets_dir}/user/{filename}')) + tweets = json.loads(read(f'{TWEETS_DIR}/user/{filename}')) p = [Posting(is_covid_related(t['full_text']), t['favorite_count'] + t['retweet_count'], 'retweeted_status' in t, @@ -201,7 +201,7 @@ def process_tweets() -> None: for t in tweets] # Save data - write(f'{tweets_dir}/processed/{filename}', json_stringify(p)) + write(f'{TWEETS_DIR}/processed/{filename}', json_stringify(p)) debug(f'Processed: {filename}') @@ -213,7 +213,7 @@ def load_tweets(username: str) -> list[Posting]: :return: User's processed tweets """ return [Posting(*p) for p in json.loads(read( - os.path.join(tweets_dir, f'processed/{username}.json')))] + os.path.join(TWEETS_DIR, f'processed/{username}.json')))] def is_covid_related(text: str) -> bool: @@ -248,7 +248,7 @@ def pack_data() -> None: :return: None """ - packed_dir = f'{data_dir}/packed' + packed_dir = f'{DATA_DIR}/packed' Path(packed_dir).mkdir(parents=True, exist_ok=True) # Pack data for processed. @@ -259,4 +259,4 @@ def pack_data() -> None: z: SevenZipFile = z for p in processed_dirs: debug(f'- Packing {p}') - z.writeall(data_dir + p) + z.writeall(DATA_DIR + p) diff --git a/src/raw_collect/twitter.py b/src/raw_collect/twitter.py index c5b95d9..40aeda5 100644 --- a/src/raw_collect/twitter.py +++ b/src/raw_collect/twitter.py @@ -9,7 +9,7 @@ from typing import List import tweepy from tweepy import API, TooManyRequests, User, Tweet, Unauthorized -from main import tweets_dir, user_dir +from main import TWEETS_DIR, USER_DIR from utils import * @@ -65,7 +65,7 @@ def download_all_tweets(api: API, screen_name: str, :return: None """ # Ensure directories exist - file = f'{tweets_dir}/user/{screen_name}.json' + file = f'{TWEETS_DIR}/user/{screen_name}.json' # Check if user already exists if os.path.isfile(file): @@ -180,7 +180,7 @@ def download_users_resume_progress(api: API) -> None: :return: None """ # Open file and read - meta = json.loads(read(f'{user_dir}/meta/meta.json')) + meta = json.loads(read(f'{USER_DIR}/meta/meta.json')) # Resume download_users_execute(api, meta['n'], @@ -215,7 +215,7 @@ def download_users_execute(api: API, n: float, print("Executing friends-chain download:") print(f"- n: {n}") print(f"- Requests per minute: 1") - print(f"- Directory: {user_dir}") + print(f"- Directory: {USER_DIR}") print(f"- Downloaded: {len(downloaded)}") print(f"- Current search set: {len(current_set)}") print(f"- Next search set: {len(next_set)}") @@ -241,7 +241,7 @@ def download_users_execute(api: API, n: float, # This user was not saved, save the user. if user not in downloaded: # Save user json - write(f'{user_dir}/users/{user.screen_name}.json', json_stringify(user._json)) + write(f'{USER_DIR}/users/{user.screen_name}.json', json_stringify(user._json)) # Add to set downloaded.add(user.screen_name) @@ -281,7 +281,7 @@ def download_users_execute(api: API, n: float, # Update meta info so that downloading can be continued meta = {'downloaded': downloaded, 'done_set': done_set, 'current_set': current_set, 'next_set': next_set, 'n': n} - write(f'{user_dir}/meta/meta.json', json_stringify(meta)) + write(f'{USER_DIR}/meta/meta.json', json_stringify(meta)) debug(f'Finished saving friends of {screen_name}') debug(f'============= Total {len(downloaded)} saved =============')