def remove_outliers(points: list[float], z_threshold: float = 3.5) -> list[float]:
    """
    Create list with outliers removed for graphing

    A point is treated as an outlier when its modified z-score, computed from
    the median absolute deviation (MAD) of the data, is greater than
    z_threshold. The order of the remaining points is preserved and the input
    is not modified.

    Credit to: https://stackoverflow.com/a/11886564/7346633

    :param points: Input points list
    :param z_threshold: Z threshold for identifying whether or not a point is an outlier
    :return: List with outliers removed. An empty input gives an empty list. If the MAD is
        zero (at least half of the points equal the median) the modified z-score is
        undefined, so every point is returned unchanged.
    """
    # Keep the caller's own elements so the result holds the same objects that
    # were passed in rather than numpy scalars.
    original = list(points)
    values = np.asarray(original, dtype=float)

    # np.median of an empty array is nan (with a warning); nothing to filter.
    if values.size == 0:
        return []

    # Treat a flat list as n observations of a single dimension.
    if values.ndim == 1:
        values = values[:, None]

    # Euclidean distance of every observation from the per-dimension median.
    median = np.median(values, axis=0)
    distance = np.sqrt(np.sum((values - median) ** 2, axis=-1))
    med_abs_deviation = np.median(distance)

    # Dividing by a zero MAD would yield inf/nan scores, so skip the filtering.
    if med_abs_deviation == 0:
        return original

    # 0.6745 is the scaling constant used in the referenced modified z-score formula.
    modified_z_score = 0.6745 * distance / med_abs_deviation
    is_outlier = modified_z_score > z_threshold

    return [point for point, outlier in zip(original, is_outlier) if not outlier]