From acfb397c9c06ddca7393b1cc5044d501f6158525 Mon Sep 17 00:00:00 2001
From: Hykilpikonna
Date: Mon, 22 Nov 2021 14:11:02 -0500
Subject: [PATCH] [+] Create function to check if a post is covid-related

---
 src/process/twitter_process.py | 33 +++++++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

diff --git a/src/process/twitter_process.py b/src/process/twitter_process.py
index 2f92e52..0103285 100644
--- a/src/process/twitter_process.py
+++ b/src/process/twitter_process.py
@@ -61,5 +61,34 @@ def load_users_popularity(user_dir: str = './data/twitter/user/') -> list[UserPo
     :return: List of users' screen names and popularity, sorted descending by popularity.
     """
     user_dir = normalize_directory(user_dir)
-    with open(f'{user_dir}/processed_popularity.json', 'r', encoding='utf-8') as f:
-        return [UserPopularity(*u) for u in json.load(f)]
+    return [UserPopularity(*u) for u in json.loads(read(f'{user_dir}/processed/popularity.json'))]
+
+
+class Posting(NamedTuple):
+    """
+    Posting data (whether or not a posting is covid-related)
+    """
+    # Whether the post's content is covid-related
+    covid_related: bool
+    # Popularity of the post
+    popularity: int
+    # Is it a repost
+    repost: bool
+    # Date
+    date: datetime
+
+def is_covid_related(text: str) -> bool:
+    """
+    Is a tweet / article covid-related. Currently, this is done through case-insensitive keyword
+    matching. Even though we know that not all posts with covid-related words are covid-related
+    posts, this is our current best method of classification.
+
+    :param text: Text content
+    :return: Whether the text is covid related
+    """
+    # Bare "pandemic" or "vaccine" are left out because they might refer to other pandemics or
+    # vaccines; "the pandemic" is included because many posts refer to covid only by that phrase.
+    keywords = 'covid; the pandemic; lockdown; spikevax; comirnaty; vaxzevria; 疫情'.split('; ')
+    lowered = text.lower()  # keywords are lowercase; this makes "COVID" / "Covid" match too
+    return any(k in lowered for k in keywords)
+