diff --git a/src/process/twitter_process.py b/src/process/twitter_process.py index bee5806..a2193ee 100644 --- a/src/process/twitter_process.py +++ b/src/process/twitter_process.py @@ -110,7 +110,7 @@ class UserSample: def select_user_sample() -> None: """ - Select our sample of 500 most popular users and 500 random users who meet the criteria. The + Select our sample of the 500 most popular users and 500 random users who meet the criteria. The criteria we use is that the user must have at least 150 followers, and must have a number of postings in between 1000 and 3250. Analyzing someone who don't post or someone who doesn't have enough followers for interaction might not reveal useful information. We also filter based on @@ -284,7 +284,7 @@ def is_covid_related(text: str) -> bool: :return: Whether the text is covid related """ # English - # We're hesitate to include words like "pandemic" or "vaccine" because they might refer to other + # We're hesitant to include words like "pandemic" or "vaccine" because they might refer to other # pandemics or other vaccines. However, I think we need to include "the pandemic" because many # posts refer to covid only as "the pandemic." keywords = ['covid', 'the pandemic', 'lockdown', 'spikevax', 'comirnaty', 'vaxzevria', diff --git a/src/utils.py b/src/utils.py index ff5b1a6..2c0a498 100644 --- a/src/utils.py +++ b/src/utils.py @@ -1,3 +1,10 @@ +"""This module contains useful functions and classes, including: +- debug messages +- file I/O +- statistics functions, removing outliers and averaging values over a period +- date-related functions +- classes for configs, reports, statistics, and JSON""" + import dataclasses import inspect import json @@ -57,7 +64,7 @@ def load_config(path: str = 'config.json5') -> Config: def debug(msg: object) -> None: """ - Output a debug message + Output a debug message, usually from another function :param msg: Message """ @@ -71,7 +78,7 @@ def calculate_rate_delay(rate_limit: float) -> float: Calculate the rate delay for each request given rate limit in request per minute :param rate_limit: Rate limit in requests per minute - :return: Rate delay in seconds per request (added one second just to be safe) + :return: Rate delay in seconds per request """ return 1 / rate_limit * 60 @@ -263,7 +270,7 @@ def parse_date_only(iso: str) -> datetime: def daterange(start_date: str, end_date: str) -> Generator[tuple[str, datetime], None, None]: """ - Date range for looping + Date range for looping, excluding the end date :param start_date: Start date in "YYYY-MM-DD" format :param end_date: End date in "YYYY-MM-DD" format @@ -278,7 +285,9 @@ def daterange(start_date: str, end_date: str) -> Generator[tuple[str, datetime], def map_to_dates(y: dict[str, Union[int, float]], dates: list[str], default: float = 0) -> list[float]: """ - Map y axis to date + Takes y-axis data in the form of a mapping of date to values, and returns a list of all the + values mapped to the date in dates. If a date in dates isn't in y, then the default values is + used instead. Preconditions: - The date in dates must be in the same format as the dates in the keys of y @@ -308,18 +317,24 @@ def filter_days_avg(y: list[float], n: int) -> list[float]: if n % 2 != 1: ValueError(f'n must be odd (you entered {n})') - # Calculate - results = [] - buffer = [y[0] * n // 2] + # Sliding window; maintain a sum of an interval centered around i + # if the interval exceeds the beginning/end, pretend that the first/last elements are "extended" + radius = n // 2 + current_sum = (radius + 1) * y[0] # current sum is sum(y[-r:0] + y[0:1] + y[1:r + 1]) + for i in range(radius): + i = min(i, len(y) - 1) + current_sum += y[i] # adding the values in y[1:r + 1] + + ret = [] for i in range(len(y)): - buffer.append(y[i]) - results.append(sum(buffer) / len(buffer)) - - # Queue longer than n days, remove first - if len(buffer) > n: - buffer.pop(0) - return results + l, r = i - radius, i + radius + l = max(0, l) # avoid index out of bounds by "extending" first/last element + r = min(r, len(y) - 1) + current_sum += y[r] # extend sliding window + ret.append(current_sum / n) + current_sum -= y[l] # remove old values + return ret def divide_zeros(numerator: list[float], denominator: list[float]) -> list[float]: