[+] Create function to check if a post is covid-related

2021-11-22 14:11:02 -05:00
parent 9e27a3c725
commit acfb397c9c
1 changed files with 31 additions and 2 deletions
@@ -61,5 +61,34 @@ def load_users_popularity(user_dir: str = './data/twitter/user/') -> list[UserPo
    :return: List of users' screen names and popularity, sorted descending by popularity.
    """
    user_dir = normalize_directory(user_dir)
-    with open(f'{user_dir}/processed_popularity.json', 'r', encoding='utf-8') as f:
-        return [UserPopularity(*u) for u in json.load(f)]
+    return [UserPopularity(*u) for u in json.loads(read(f'{user_dir}/processed/popularity.json'))]
+
+
+class Posting(NamedTuple):
+    """
+    Posting data (whether or not a posting is covid-related)
+    """
+    # Whether the post is covid-related
+    covid_related: bool
+    # Popularity of the post
+    popularity: int
+    # Is it a repost
+    repost: bool
+    # Date of the post
+    date: datetime
+
+def is_covid_related(text: str) -> bool:
+    """
+    Check whether a tweet / article is covid-related by case-insensitive keyword matching. Not
+    every post containing a covid-related word is actually about covid, but this is our current
+    best method of classification.
+
+    :param text: Text content
+    :return: Whether the text contains at least one covid-related keyword
+    """
+    # Bare "pandemic" / "vaccine" are excluded because they may refer to other pandemics or
+    # vaccines; "the pandemic" is kept because many posts refer to covid only by that phrase.
+    keywords = 'covid; the pandemic; lockdown; spikevax; comirnaty; vaxzevria; 疫情'.split('; ')
+    lowered = text.lower()  # Keywords are lowercase, so fold the text to match "COVID", "Covid"
+    return any(k in lowered for k in keywords)
+