From acfb397c9c06ddca7393b1cc5044d501f6158525 Mon Sep 17 00:00:00 2001
From: Hykilpikonna <me@hydev.org>
Date: Mon, 22 Nov 2021 14:11:02 -0500
Subject: [PATCH] [+] Create function to check if a post is covid-related

---
 src/process/twitter_process.py | 33 +++++++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

diff --git a/src/process/twitter_process.py b/src/process/twitter_process.py
index 2f92e52..0103285 100644
--- a/src/process/twitter_process.py
+++ b/src/process/twitter_process.py
@@ -61,5 +61,34 @@ def load_users_popularity(user_dir: str = './data/twitter/user/') -> list[UserPo
     :return: List of users' screen names and popularity, sorted descending by popularity.
     """
     user_dir = normalize_directory(user_dir)
-    with open(f'{user_dir}/processed_popularity.json', 'r', encoding='utf-8') as f:
-        return [UserPopularity(*u) for u in json.load(f)]
+    return [UserPopularity(*u) for u in json.loads(read(f'{user_dir}/processed/popularity.json'))]
+
+
+class Posting(NamedTuple):
+    """
+    Posting data (whether or not a posting is covid-related)
+    """
+    # Whether the post is covid-related (a boolean flag, not the post's full text)
+    covid_related: bool
+    # Popularity of the post
+    popularity: int
+    # Whether the post is a repost
+    repost: bool
+    # Date of the post (presumably when it was published - confirm against the data loader)
+    date: datetime
+
+def is_covid_related(text: str) -> bool:
+    """
+    Check whether a tweet / article is covid-related using case-insensitive keyword matching.
+    Not every post containing a covid-related word is actually about covid, but this is our
+    current best method of classification.
+
+    :param text: Text content
+    :return: Whether the text contains at least one covid-related keyword
+    """
+    # Bare "pandemic" / "vaccine" are excluded since they might refer to other pandemics or
+    # vaccines; "the pandemic" is included because many posts refer to covid only by that name.
+    keywords = 'covid; the pandemic; lockdown; spikevax; comirnaty; vaxzevria; 疫情'.split('; ')
+    text = text.lower()  # Keywords are lowercase; lower the text so "COVID-19" matches too
+    return any(k in text for k in keywords)
+