[+] Create function to check if a post is covid-related
This commit is contained in:
@@ -61,5 +61,34 @@ def load_users_popularity(user_dir: str = './data/twitter/user/') -> list[UserPo
|
||||
:return: List of users' screen names and popularity, sorted descending by popularity.
|
||||
"""
|
||||
user_dir = normalize_directory(user_dir)
|
||||
with open(f'{user_dir}/processed_popularity.json', 'r', encoding='utf-8') as f:
|
||||
return [UserPopularity(*u) for u in json.load(f)]
|
||||
return [UserPopularity(*u) for u in json.loads(read(f'{user_dir}/processed/popularity.json'))]
|
||||
|
||||
|
||||
class Posting(NamedTuple):
    """
    Summary record for one posting: it stores whether the posting is covid-related
    instead of the posting's text, together with its popularity, repost flag and date.
    """
    # Whether the posting is covid-related
    covid_related: bool
    # Popularity of the posting
    popularity: int
    # Whether the posting is a repost
    repost: bool
    # Date of the posting
    date: datetime
|
||||
|
||||
def is_covid_related(text: str) -> bool:
    """
    Is a tweet / article covid-related. Currently, this is done through keyword matching. Even
    though we know that not all posts with covid-related words are covid-related posts, this is
    our current best method of classification.

    Matching is case-insensitive, so "COVID-19", "Covid" and "covid" are all recognized.

    :param text: Text content
    :return: Whether the text is covid related (always False for an empty string)
    """
    # We cannot include words like "pandemic" or "vaccine" because they might refer to other
    # pandemics or other vaccines. However, "the pandemic" is included because many posts refer
    # to covid only as "the pandemic".
    keywords = (
        'covid',
        'the pandemic',
        'lockdown',
        'spikevax',
        'comirnaty',
        'vaxzevria',
        '疫情',
    )
    # The keywords are all lower-case, so fold the text once before the substring checks;
    # otherwise the very common upper-case spelling "COVID" would never match.
    folded = text.casefold()
    return any(keyword in folded for keyword in keywords)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user