diff --git a/src/process/twitter_process.py b/src/process/twitter_process.py index bda9a64..b8281ad 100644 --- a/src/process/twitter_process.py +++ b/src/process/twitter_process.py @@ -1,11 +1,8 @@ -import json -import os import random -from datetime import datetime, time +from dataclasses import dataclass from typing import NamedTuple from utils import * -from dataclasses import dataclass class UserPopularity(NamedTuple): @@ -24,6 +21,8 @@ class UserPopularity(NamedTuple): popularity: int # Number of tweets num_postings: int + # Language + lang: str def process_users_popularity(user_dir: str = './data/twitter/user/') -> None: @@ -47,8 +46,17 @@ def process_users_popularity(user_dir: str = './data/twitter/user/') -> None: if filename.endswith('.json') and not filename.startswith('.'): # Read user = json.loads(read(f'{user_dir}/users/{filename}')) + + # Get user language. Most users' lang field is null, so fall back to the language of + # their latest status. The status field may be missing too, in which case lang stays + # None. + lang = user['lang'] + status_lang = user['status']['lang'] if 'status' in user else None + if lang is None: + lang = status_lang + users.append(UserPopularity(user['screen_name'], user['followers_count'], - user['statuses_count'])) + user['statuses_count'], lang)) # Log progress if len(users) % 2000 == 0: @@ -99,7 +107,8 @@ def select_user_sample(user_dir: str = './data/twitter/user/') -> None: Select our sample of 500 most popular users and 500 random users who meet the criteria. The criteria we use is that the user must have at least 150 followers, and must have a number of postings in between 1000 and 3250. Analyzing someone who don't post or someone who doesn't have - enough followers for interaction might not reveal useful information. + enough followers for interaction might not reveal useful information. We also filter based on + language, because we only know how to identify COVID-related posts in a few languages. 
The result will be stored in /processed/sample.json @@ -141,8 +150,6 @@ def load_user_sample(user_dir: str = './data/twitter/user/') -> Sample: [UserPopularity(*u) for u in j['random']]) - - class Posting(NamedTuple): """ Posting data (whether or not a posting is covid-related)