From d16032aa7112e265337d1ed66198c337a914fdee Mon Sep 17 00:00:00 2001
From: Hykilpikonna
Date: Mon, 22 Nov 2021 14:28:36 -0500
Subject: [PATCH] [+] Process tweets

---
Review notes (text between the "---" marker and the diff is not part of the
commit message):

* Adds process_tweets(), which lists <tweets_dir>/user, parses every *.json
  file whose name does not start with ".", builds one Posting per tweet and
  writes the resulting list to <tweets_dir>/processed/<same filename>.
* The first hunk adds `time` to the datetime import, but no line added by
  this patch uses it. TODO confirm it is needed; `from utils import *` runs
  afterwards and may rebind the name.
* The strptime format contains the literal "+0000", so the parsed datetime
  is naive, and a created_at value with any other offset raises ValueError.
* normalize_directory, read, write, json_stringify and debug are not defined
  in this patch; presumably they come from `from utils import *` (verify).
* Leading whitespace in this patch was restored from a whitespace-collapsed
  copy; check the context lines against the target file before applying.

 src/process/twitter_process.py | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/src/process/twitter_process.py b/src/process/twitter_process.py
index 0103285..6d9b4a8 100644
--- a/src/process/twitter_process.py
+++ b/src/process/twitter_process.py
@@ -1,6 +1,6 @@
 import json
 import os
-from datetime import datetime
+from datetime import datetime, time
 from typing import NamedTuple
 
 from utils import *
@@ -77,6 +77,27 @@ class Posting(NamedTuple):
     # Date
     date: datetime
 
+
+def process_tweets(tweets_dir: str = './data/twitter/user-tweets/') -> None:
+    tweets_dir = normalize_directory(tweets_dir)
+
+    # Loop through all the files
+    for filename in os.listdir(f'{tweets_dir}/user'):
+        # Only check json files and ignore macos dot files
+        if filename.endswith('.json') and not filename.startswith('.'):
+            # Read
+            tweets = json.loads(read(f'{tweets_dir}/user/{filename}'))
+            p = [Posting(is_covid_related(t['full_text']),
+                         t['favorite_count'] + t['retweet_count'],
+                         'retweeted_status' in t,
+                         datetime.strptime(t['created_at'], '%a %b %d %H:%M:%S +0000 %Y'))
+                 for t in tweets]
+
+            # Save data
+            write(f'{tweets_dir}/processed/{filename}', json_stringify(p, indent=None))
+            debug(f'Processed: {filename}')
+
+
 def is_covid_related(text: str) -> bool:
     """
     Is a tweet / article covid-related. Currently, this is done through keyword matching. Even
@@ -91,4 +112,3 @@ def is_covid_related(text: str) -> bool:
     # posts refer to covid only as "the pandemic".
     keywords = 'covid; the pandemic; lockdown; spikevax; comirnaty; vaxzevria; 疫情'.split('; ')
     return any(k in text for k in keywords)
-