[+] Create function to remove outliers

This commit is contained in:
Hykilpikonna
2021-11-24 15:57:28 -05:00
parent ea27d0fec2
commit 91e028eba0
2 changed files with 34 additions and 7 deletions
+10 -7
View File
@@ -1,25 +1,28 @@
####################
# Data Collection
# Json5 is a human-readable json format that allows for things such as unquoted keys or comments.
json5~=0.9.6
# Tweepy is a Python SDK for Twitter
tweepy==4.4.0
# requests is for getting html from a website URL
requests==2.26.0
# beautifulsoup is used to extract data from html
beautifulsoup4==4.10.0
# 7zip packing utility for packing our processed data
py7zr==0.16.3
#####################
# Data visualization
# Data Visualization
# Print table data
tabulate==0.8.9
# Draw local graphs
matplotlib==3.5.0
# Calculate data statistics
numpy==1.21.4
####################
# Data Packing
# 7zip packing utility for packing our processed data
py7zr==0.16.3
#####################
# Testing and code checking
+24
View File
@@ -8,6 +8,7 @@ from pathlib import Path
from typing import Union
import json5
import numpy as np
@dataclass
@@ -98,6 +99,29 @@ def read(file: str) -> str:
return f.read()
def remove_outliers(points: list[float], z_threshold: float = 3.5) -> list[float]:
    """
    Create list with outliers removed for graphing

    A point is treated as an outlier when its modified z-score (based on the
    median and the median absolute deviation instead of the mean and the
    standard deviation) is strictly greater than z_threshold. The input list
    is not modified; the kept points are returned in their original order.

    Credit to: https://stackoverflow.com/a/11886564/7346633

    :param points: Input points list
    :param z_threshold: Z threshold for identifying whether or not a point is an outlier
    :return: List with outliers removed
    """
    # Nothing to filter; also avoids taking the median of an empty array.
    if len(points) == 0:
        return []

    values = np.asarray(points, dtype=float)
    # Treat a flat list of numbers as a column of one-dimensional observations
    # so the same distance computation also works for multi-dimensional points.
    if values.ndim == 1:
        values = values[:, None]

    median = np.median(values, axis=0)
    # Euclidean distance of every observation from the median.
    diff = np.sqrt(np.sum((values - median) ** 2, axis=-1))
    med_abs_deviation = np.median(diff)

    # When at least half of the points sit exactly on the median the MAD is 0
    # and the modified z-score is undefined (division by zero). No spread can
    # be estimated, so no point is classified as an outlier.
    if med_abs_deviation == 0:
        return list(points)

    # 0.6745 is the 0.75 quantile of the standard normal distribution; it
    # rescales the MAD so the score is comparable to a regular z-score.
    modified_z_score = 0.6745 * diff / med_abs_deviation
    is_outlier = (modified_z_score > z_threshold).tolist()

    # Return the original elements (not the numpy copies) that were kept.
    return [point for point, outlier in zip(points, is_outlier) if not outlier]
class EnhancedJSONEncoder(json.JSONEncoder):
def default(self, o):