diff --git a/src/process/twitter_visualization.py b/src/process/twitter_visualization.py index 211be09..a52a3f9 100644 --- a/src/process/twitter_visualization.py +++ b/src/process/twitter_visualization.py @@ -279,6 +279,8 @@ def graph_histogram(x: list[float], path: str, title: str, clear_outliers: bool # Save fig.savefig(os.path.join(REPORT_DIR, path)) + fig.clf() + plt.close(fig) def graph_line_plot(x: list[datetime], y: list[float], path: str, title: str, freq: bool, @@ -345,6 +347,8 @@ def graph_line_plot(x: list[datetime], y: list[float], path: str, title: str, fr path = Path(os.path.join(REPORT_DIR, path)) path.parent.mkdir(parents=True, exist_ok=True) fig.savefig(str(path)) + fig.clf() + plt.close(fig) def report_histograms(sample: Sample) -> None: @@ -409,6 +413,15 @@ def report_change_different_n(sample: Sample) -> None: False, n) +def report_change_graphs(sample: Sample) -> None: + graph_line_plot(sample.dates, sample.date_pops, f'change/pop/{sample.name}.png', + f'COVID-posting popularity ratio over time for {sample.name} IIR(10)', + False, 10) + graph_line_plot(sample.dates, sample.date_freqs, f'change/freq/{sample.name}.png', + f'COVID-posting frequency over time for {sample.name} IIR(10)', + True, 10) + + def report_all() -> None: """ Generate all reports @@ -429,6 +442,7 @@ def report_all() -> None: for s in samples: report_top_20_tables(s) report_histograms(s) + report_change_graphs(s) report_change_different_n(samples[0]) diff --git a/src/report/report_document.md b/src/report/report_document.md index 5ea2dc0..e69f948 100644 --- a/src/report/report_document.md +++ b/src/report/report_document.md @@ -73,17 +73,25 @@ After we answered how frequently people posted about COVID-19 and how interested ## Method -This analysis is separate for each of our samples, just like the previous analysis. However, unlike how tweets are separated for each user in the previous analysis, we combine the tweets of all users in each sample in this analysis. 
In this analysis, we defined the start of COVID-19 as `2020-01-01` and ignored all posts prior to this date. Then, we calculate the average frequency and popularity ratio for every day since `2020-01-01`. To reduce random variability, instead of using only the data from the day in the calculation, we used the average of the last 7 days for each day, and calculated the frequency and popularity of that interval. This calculation gave us a list `freqs` and a list `pops` where, for every date `dates[i]`, +This analysis is performed separately for each of our samples, just like the previous analysis. However, unlike the previous analysis, where tweets were separated by user, here we combine the tweets of all users in each sample. We defined the start of COVID-19 as _2020-01-01_ and ignored all posts prior to this date. Then, we calculated the average frequency and popularity ratio for every day since _2020-01-01_. This calculation gave us a list `freqs` and a list `pops` where, for every date `dates[i]`,
-$$ \text{freqs}_i = -\sum_{j = 0}^{7} \frac{|\text{COVID-posts on dates}_{i - j}|}{|\text{All posts on dates}_{i - j}|} $$ +$$ \text{freqs}_i = \frac{|\text{COVID-posts on date}_{i}|}{|\text{All posts on date}_{i}|} $$
-$$ \text{pops}_i = \sum_{j = 0}^{7} \left(\frac{\sum\text{Popularity of COVID-posts}}{|\text{COVID-posts}|}\right) / \left(\frac{\sum \text{Popularity of all posts}}{|\text{All posts}|}\right) $$ +$$ \text{pops}_i = \left(\frac{\sum\text{Popularity of COVID-posts on date}_i}{|\text{COVID-posts on date}_i|}\right) / \left(\frac{\sum \text{Popularity of all posts on date}_i}{|\text{All posts on date}_i|}\right) $$
+After the calculation, we decided to plot line charts of `freqs` and `pops` against `dates`. When we plotted the graphs without a filter, we found that they were very noisy, as shown in the first graph below. So, we experimented with different filters from the `scipy` library and different parameter values, and chose to use an IIR filter with `n = 10`. + +
+
hist
+
hist
+
hist
+
+ + **_TODO_** ## References