CSC111/assignments/A3/a3_part2_predictions.py

"""CSC111 Winter 2022 Assignment 3: Graphs, Recommender Systems, and Clustering (Part 2)

Instructions (READ THIS FIRST!)
===============================

This Python module contains classes responsible for making predictions of book review scores.
We've provided the abstract class and some example subclasses, and you'll complete one new
subclass and a new function to evaluate the different classes.

Copyright and Usage Information
===============================

This file is provided solely for the personal and private use of students
taking CSC111 at the University of Toronto St. George campus. All forms of
distribution of this code, whether as given or with any changes, are
expressly prohibited. For more information on copyright for CSC111 materials,
please consult our Course Syllabus.

This file is Copyright (c) 2022 Mario Badr, David Liu, and Isaac Waller.
"""
from __future__ import annotations
import csv
from typing import Union

import a3_part2_recommendations


class ReviewScorePredictor:
    """A graph-based entity that predicts scores for book reviews.

    This is an abstract class, and should be subclasses to implement different review
    prediction algorithms.

    Instance Attributes:
        - graph: The book review graph that this entity uses to make predictions.
    """
    graph: a3_part2_recommendations.WeightedGraph

    def __init__(self, graph: a3_part2_recommendations.WeightedGraph) -> None:
        """Initialize a new ReviewScorePredictor."""
        self.graph = graph

    def predict_review_score(self, user: str, book: str) -> int:
        """Predict the score (1-5) that the given user would give the given book.

        If there is already an edge between the given user and book in the graph,
        return that score. Otherwise, return a predicted score.

        Preconditions:
            - user in self.graph._vertices
            - book in self.graph._vertices
        """
        raise NotImplementedError


class FiveStarPredictor(ReviewScorePredictor):
    """A book review predictor that always predicts a five-star review,
    ignoring the actual book and user.
    """
    def predict_review_score(self, user: str, book: str) -> int:
        """Predict the score that the given user would give the given book.

        If there is already an edge between the given user and book in the graph,
        return that score. Otherwise, return 5 as the predicted score.

        Preconditions:
            - user in self.graph._vertices
            - book in self.graph._vertices
        """
        if self.graph.adjacent(user, book):
            return self.graph.get_weight(user, book)
        else:
            return 5


class BookAverageScorePredictor(ReviewScorePredictor):
    """A book review predictor that always predicts based on the book's average score,
    ignoring any user preferences.
    """
    def predict_review_score(self, user: str, book: str) -> int:
        """Predict the score that the given user would give the given book.

        If there is already an edge between the given user and book in the graph,
        return that score. Otherwise, return the book's average review score in
        the graph, rounded to the nearest integer (using the built-in `round` function).

        Preconditions:
            - user in self.graph._vertices
            - book in self.graph._vertices
            - the given book has at least one review
        """
        if self.graph.adjacent(user, book):
            return self.graph.get_weight(user, book)
        else:
            return round(self.graph.average_weight(book))


################################################################################
# Part 2, Q3
################################################################################
class SimilarUserPredictor(ReviewScorePredictor):
    """A book review predictor that makes a prediction based on how similar users rated the book.

    Representation Invariants:
        - self._score_type in {'unweighted', 'strict'}
    """
    # Private Instance Attributes:
    #   - _score_type: the type of similarity score to use when computing similarity score
    _score_type: str

    def __init__(self, graph: a3_part2_recommendations.WeightedGraph,
                 score_type: str = 'unweighted') -> None:
        """Initialize a new SimilarUserPredictor.

        You may want to review Section 10.4 of the Course Notes for a reminder on
        how to properly override a superclass initializer. To avoid a python_ta.contracts error,
        initialize self._score_type at the TOP of this method body.
        """
        self._score_type = score_type
        ReviewScorePredictor.__init__(self, graph)

    def predict_review_score(self, user: str, book: str) -> int:
        """Predict the score that the given user would give the given book.

        If there is already an edge between the given user and book in the graph,
        return that score. Otherwise, return the book's WEIGHTED review score among
        all users who have read the book, where the weight used is the similarity
        score of the reviewing user with the given user. self._score_type is used
        to determine which similarity score to use for the weights

        As usual, round this score using the built-in `round` function.

        For example, suppose there are three users A, B, C who have read the book,
        and one, D, who has not. We want to use the review scores of A, B, and C to predict
        the rating for D. The three user ratings and weighted similarity score with D
        are shown in this table:

            | User | Review score | Weighted similarity score with D |
            | ---- | ------------ | -------------------------------- |
            | A    | 3            | 0.4                              |
            | B    | 5            | 0.1                              |
            | C    | 2            | 0.3                              |

        Then the predicted review for D equals:

            (3 * 0.4 + 5 * 0.1 + 2 * 0.3) / (0.4 + 0.1 + 0.3) = 2.875

        and so this function would return 3.

        If the total similarity score from all of the book's reviewers is 0,
        then instead return the book's average review score (same as BookAverageScorePredictor).

        Preconditions:
            - user in self.graph._vertices
            - book in self.graph._vertices
        """
        if self.graph.adjacent(user, book):  # if the user already made a review, use that score
            return self.graph.get_weight(user, book)
        users = self.graph.get_neighbours(book)
        total_weighted = 0
        total = 0
        all_zero = True
        for u in users:
            weight = self.graph.get_similarity_score(u, book, 'strict')
            score = self.graph.get_weight(u, book)
            if weight > 0:
                all_zero = False
            total_weighted += score * weight
            total += score
        if all_zero:
            return round(total / len(users))
        return round(total_weighted / len(users))


################################################################################
# Part 2, Q4
################################################################################
def evaluate_predictor(predictor: ReviewScorePredictor,
                       test_file: str, book_names_file: str) -> dict[str, Union[int, float]]:
    """Evaluate the given ReviewScorePredictor on the given test file.

    Read in each row of the given test_file (which contains a book, user, and
    review score). For each row, use the given predictor to make a prediction of the review
    score, and compare that prediction against the actual given review score from the file.

    Return a dictionary summarizing the performance of the predictor. This dictionary
    has the following keys:
        - 'num_reviews': the total number of predicted review scores (equal to the
            number of lines in the CSV file)
        - 'num_correct': the number of predicted review scores that exactly matched the
            actual review score
        - 'average_error': the average of the *absolute value difference* between
            predicted and actual review scores across all reviews in the test file

    Preconditions:
        - test_file is the path to a CSV file corresponding to the book review data
          format described on the assignment handout
        - book_names_file is the path to a CSV file corresponding to the book data
        - test_file has at least one row
        - all users and books in test_file are in predictor.graph
          format described on the assignment handout
    """
    num_reviews = 0
    num_correct = 0
    total_error = 0
    mp: dict[str, str]  # maps book ID to book name
    with open(book_names_file, 'r', newline='', encoding='UTF-8') as f:
        reader = csv.reader(f)
        mp = dict(reader)
    with open(test_file, 'r', newline='', encoding='UTF-8') as f:
        reader = csv.reader(f)
        for book, user, score in reader:
            book = mp[book]
            num_reviews += 1
            actual = predictor.predict_review_score(user, book)
            if actual == score:
                num_correct += 1
            total_error += abs(score - actual)
    return {
        'num_reviews': num_reviews,
        'num_correct': num_correct,
        'average_error': total_error / num_reviews,
    }


if __name__ == '__main__':
    # You can uncomment the following lines for code checking/debugging purposes.
    # However, we recommend commenting out these lines when working with the large
    # datasets, as checking representation invariants and preconditions greatly
    # increases the running time of the functions/methods.
    # import python_ta.contracts
    # python_ta.contracts.check_all_contracts()

    import python_ta
    python_ta.check_all(config={
        'max-line-length': 1000,
        'disable': ['E1136'],
        'extra-imports': ['csv', 'a3_part2_recommendations'],
        'allowed-io': ['evaluate_predictor'],
        'max-nested-blocks': 4
    })