# Recovered from patch 04a2c0a ("[+] Create function that creates a sample",
# Hykilpikonna, 2021-11-23) against src/process/twitter_process.py.
def select_user_sample(user_dir: str = './data/twitter/user/') -> None:
    """
    Select our sample of the 500 most popular users and up to 500 random users who meet the
    criteria. The criteria we use are that the user must have a popularity (follower count) of
    more than 150, and a number of postings strictly between 1000 and 3250. Analyzing someone
    who doesn't post, or someone who doesn't have enough followers for interaction, might not
    reveal useful information.

    The 500 most popular users are taken from the front of the loaded user list and are excluded
    from the random sample. This assumes load_users_popularity returns users ordered from most
    to least popular - TODO confirm against that helper.

    If fewer than 500 of the remaining users meet the criteria, all of them are used as the
    random sample instead of raising an error.

    The result will be stored in <user_dir>/processed/sample.json as a JSON object with the
    keys 'most_popular' and 'random'.

    :param user_dir: Download directory for users
    :return: None
    """
    # Imported locally so this function does not depend on a module-level import.
    import random

    user_dir = normalize_directory(user_dir)

    # Load users
    users = load_users_popularity(user_dir)

    # Find most popular, and exclude them from the random sample
    most_popular = users[:500]
    users = users[500:]

    # Filter by criteria. This must be a sequence rather than a set: random.sample() stopped
    # accepting sets (deprecated in Python 3.9, TypeError since 3.11), and a list also keeps the
    # candidate order stable between runs.
    filtered = [u for u in users if 150 < u.popularity and 1000 < u.num_postings < 3250]
    debug(f'There are {len(filtered)} users who meets the criteria.')

    # Sample. Cap the size at the number of candidates, because random.sample() raises
    # ValueError when asked for more items than the population contains.
    sample = random.sample(filtered, min(500, len(filtered)))

    # Save
    write(f'{user_dir}/processed/sample.json',
          json_stringify({'most_popular': most_popular, 'random': sample}))