[U] Rename, restructure

2021-11-24 21:34:34 -05:00
parent 3c2ad5462a
commit cc90af631d
2 changed files with 46 additions and 35 deletions
@@ -99,7 +99,7 @@ def get_user_popularity_ranking(user: str) -> int:


@dataclass()
-class Sample:
+class UserSample:
    """
    This is a data class storing our different samples.
    """
@@ -147,7 +147,7 @@ def select_user_sample() -> None:
    sample = random.sample(filtered, 500)

    # Save
-    write(file, json_stringify(Sample(most_popular, sample, get_english_news_channels())))
+    write(file, json_stringify(UserSample(most_popular, sample, get_english_news_channels())))


 def get_english_news_channels() -> list[str]:
@@ -202,16 +202,16 @@ def filter_news_channels() -> None:
    write(f'{USER_DIR}/processed/sample.json', json_stringify(sample))


-def load_user_sample() -> Sample:
+def load_user_sample() -> UserSample:
    """
    Load the selected sample

    :return: The loaded user sample
    """
    j = json.loads(read(f'{USER_DIR}/processed/sample.json'))
-    return Sample([ProcessedUser(*u) for u in j['most_popular']],
-                  [ProcessedUser(*u) for u in j['random']],
-                  j['english_news'])
+    return UserSample([ProcessedUser(*u) for u in j['most_popular']],
+                      [ProcessedUser(*u) for u in j['random']],
+                      j['english_news'])


 class Posting(NamedTuple):
@@ -53,7 +53,7 @@ class UserFloat:


@dataclass()
-class UserSample:
+class Sample:
    name: str
    users: list[str]
    frequencies: list[UserFloat] = field(default_factory=list)
@@ -62,20 +62,15 @@ class UserSample:
    tweets: list[Posting] = field(default_factory=list)


-def view_covid_tweets_freq(users: list[str],
-                           sample_name: str) -> None:
+def view_covid_tweets_freq(sample: Sample) -> None:
    """
-    Visualize the frequency that the sampled users post about COVID. For example, someone who
-    posted every single tweet about COVID will have a frequency of 1, and someone who doesn't
-    post about COVID will have a frequency of 0.

-    :param users: Sample users
-    :param sample_name: Name of the sample
+    :param sample: Sample to report on; its name labels the report file and the histogram
    :return: None
    """
    # Init reporter
-    r = Reporter(f'{REPORT_DIR}/report.report.1-covid-tweet-frequency/{sample_name}.md')
-    r.print(f"In {sample_name} -")
+    r = Reporter(f'{REPORT_DIR}/report.report.1-covid-tweet-frequency/{sample.name}.md')
+    r.print(f"In {sample.name} -")

    # How many people didn't post about COVID?
    r.print("How many people didn't post about COVID:",
@@ -93,24 +88,17 @@ def view_covid_tweets_freq(users: list[str],
    r.save()

    # Graph histogram
-    plt.title(f'COVID-related posting frequency for {sample_name}')
+    plt.title(f'COVID-related posting frequency for {sample.name}')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.hist([f[1] for f in user_frequency], bins=100, color='#ffcccc')
-    plt.savefig(f'{REPORT_DIR}/report.report.1-covid-tweet-frequency/{sample_name}.png')
+    plt.savefig(f'{REPORT_DIR}/report.report.1-covid-tweet-frequency/{sample.name}.png')


 def view_covid_tweets_pop(users: list[str],
                          sample_name: str) -> None:
    """
-    Visualize the relative popularity of the sampled users' posts about COVID. For example, if one
-    person posted a COVID post and got 1000 likes, while their other posts (including this one) got
-    an average of 1 like, they will have a relative popularity of 1000. If, on the other hand, one
-    person posted a COVID post and got 1 like, while their other posts (including this one) got an
-    average of 1000 likes, they will have a relative popularity of 1/1000.

-    To prevent divide-by-zero, we ignored everyone who didn't post about covid and who didn't post
-    at all.

    :param users: Sample users
    :param sample_name: Name of the sample
@@ -163,12 +151,44 @@ def view_covid_tweets_pop(users: list[str],
    plt.savefig(f'{REPORT_DIR}/2-covid-tweet-popularity/{sample_name}.png')


+def load_samples() -> list[Sample]:
+    """
+    Load samples and calculate their data
+
+    :return: Samples
+    """
+    # Load sample, convert format
+    samples = load_user_sample()
+    samples = [Sample('500-pop', [u.username for u in samples.most_popular]),
+               Sample('500-rand', [u.username for u in samples.random]),
+               Sample('eng-news', list(samples.english_news))]
+
+    # Calculate frequencies and popularity ratios
+    for s in samples:
+        s.frequencies, s.popularity_ratios, s.tweets = calculate_sample_data(s.users)
+
+    return samples
+
+
 def calculate_sample_data(users: list[str]) -> tuple[list[UserFloat], list[UserFloat], list[Posting]]:
    """
    This function loads and calculates the frequency with which the users in a list post about
    COVID, and also calculates the relative popularity of their COVID posts.

-    This function also creates a combined list of all users in a sample
+    This function also creates a combined list of the tweets of all users in a sample.
+
+    Frequency: how frequently the sampled users post about COVID. For example, someone whose
+    every single tweet is about COVID will have a frequency of 1, and someone who doesn't
+    post about COVID will have a frequency of 0.
+
+    Popularity ratio: the relative popularity of the sampled users' posts about COVID. If one
+    person posted a COVID post and got 1000 likes, while their other posts (including this one) got
+    an average of 1 like, they will have a relative popularity of 1000. If, on the other hand, one
+    person posted a COVID post and got 1 like, while their other posts (including this one) got an
+    average of 1000 likes, they will have a relative popularity of 1/1000.
+
+    To prevent divide-by-zero, we ignore everyone who didn't post about COVID or who didn't post
+    at all.

    :param users: Users in a sample
    :return: Frequencies, Popularity ratios, Combined tweets list for the sample
@@ -225,15 +245,6 @@ def view_covid_tweets_date(tweets: list[Posting]):


 if __name__ == '__main__':
-    # Load sample, convert format
-    samples = load_user_sample()
-    samples = [UserSample('500-pop', [u.username for u in samples.most_popular]),
-               UserSample('500-rand', [u.username for u in samples.random]),
-               UserSample('eng-news', list(samples.english_news))]
-
-    # Calculate frequencies and popularity ratios
-    for s in samples:
-        s.frequencies, s.popularity_ratios, s.tweets = calculate_sample_data(s.users)

    view_covid_tweets_freq([u.username for u in samples.most_popular], '500-pop')
    # view_covid_tweets_freq(sample.random, '500-rand')