[+] Add language field to processed users

This commit is contained in:
Hykilpikonna
2021-11-23 12:02:29 -05:00
parent b2137e3bf1
commit b17df5dfa2
+15 -8
View File
@@ -1,11 +1,8 @@
import json
import os
import random
from datetime import datetime, time
from dataclasses import dataclass
from typing import NamedTuple
from utils import *
from dataclasses import dataclass
class UserPopularity(NamedTuple):
@@ -24,6 +21,8 @@ class UserPopularity(NamedTuple):
popularity: int
# Number of tweets
num_postings: int
# Language
lang: str
def process_users_popularity(user_dir: str = './data/twitter/user/') -> None:
@@ -47,8 +46,17 @@ def process_users_popularity(user_dir: str = './data/twitter/user/') -> None:
if filename.endswith('.json') and not filename.startswith('.'):
# Read
user = json.loads(read(f'{user_dir}/users/{filename}'))
# Get user language. Most users' lang field is null, so we fall back to the language of
# their latest status. The status field may be missing too, in which case the language
# stays None.
lang = user['lang']
status_lang = user['status']['lang'] if 'status' in user else None
if lang is None:
lang = status_lang
users.append(UserPopularity(user['screen_name'], user['followers_count'],
user['statuses_count']))
user['statuses_count'], lang))
# Log progress
if len(users) % 2000 == 0:
@@ -99,7 +107,8 @@ def select_user_sample(user_dir: str = './data/twitter/user/') -> None:
Select our sample of the 500 most popular users and 500 random users who meet the criteria. The
criteria are that a user must have at least 150 followers and a number of postings between
1000 and 3250. Analyzing someone who doesn't post or someone who doesn't have
enough followers for interaction might not reveal useful information.
enough followers for interaction might not reveal useful information. We also filter based on
language, because we only know how to identify COVID-related posts in a few languages.
The result will be stored in <user_dir>/processed/sample.json
@@ -141,8 +150,6 @@ def load_user_sample(user_dir: str = './data/twitter/user/') -> Sample:
[UserPopularity(*u) for u in j['random']])
class Posting(NamedTuple):
"""
Posting data (whether or not a posting is covid-related)