[+] Create function to check if a post is covid-related

This commit is contained in:
Hykilpikonna
2021-11-22 14:11:02 -05:00
parent 9e27a3c725
commit acfb397c9c
+31 -2
View File
@@ -61,5 +61,34 @@ def load_users_popularity(user_dir: str = './data/twitter/user/') -> list[UserPo
:return: List of users' screen names and popularity, sorted descending by popularity.
"""
user_dir = normalize_directory(user_dir)
with open(f'{user_dir}/processed_popularity.json', 'r', encoding='utf-8') as f:
return [UserPopularity(*u) for u in json.load(f)]
return [UserPopularity(*u) for u in json.loads(read(f'{user_dir}/processed/popularity.json'))]
class Posting(NamedTuple):
    """
    Summary record for a single posting (tweet / article).

    The record does not carry the post's text; it only keeps a flag saying whether the posting
    was classified as covid-related, along with a few metadata fields.
    """
    # Whether the post was classified as covid-related
    covid_related: bool
    # Popularity of the post
    popularity: int
    # Whether the post is a repost
    repost: bool
    # Date of the post
    date: datetime
def is_covid_related(text: str) -> bool:
    """
    Check whether a tweet / article is covid-related.

    Currently, this is done through case-insensitive keyword matching. Even though we know that
    not all posts with covid-related words are covid-related posts, this is our current best
    method of classification.

    :param text: Text content
    :return: Whether the text is covid related
    """
    # We cannot include words like "pandemic" or "vaccine" because they might refer to other
    # pandemics or other vaccines. However, I think we need to include "the pandemic" because many
    # posts refer to covid only as "the pandemic".
    # All keywords are written in lowercase so they can be compared against the case-folded text.
    keywords = (
        'covid', 'the pandemic', 'lockdown', 'spikevax', 'comirnaty', 'vaxzevria', '疫情',
    )
    # Fold case once so that spellings such as "COVID-19", "Covid" or "Lockdown" are matched too;
    # a case-sensitive comparison would silently miss most real-world posts.
    folded = text.casefold()
    return any(k in folded for k in keywords)