Files
CSC111/assignments/A3/a3_part2_predictions.py
T
2022-03-24 00:50:30 -04:00

242 lines
9.9 KiB
Python

"""CSC111 Winter 2022 Assignment 3: Graphs, Recommender Systems, and Clustering (Part 2)
Instructions (READ THIS FIRST!)
===============================
This Python module contains classes responsible for making predictions of book review scores.
We've provided the abstract class and some example subclasses, and you'll complete one new
subclass and a new function to evaluate the different classes.
Copyright and Usage Information
===============================
This file is provided solely for the personal and private use of students
taking CSC111 at the University of Toronto St. George campus. All forms of
distribution of this code, whether as given or with any changes, are
expressly prohibited. For more information on copyright for CSC111 materials,
please consult our Course Syllabus.
This file is Copyright (c) 2022 Mario Badr, David Liu, and Isaac Waller.
"""
from __future__ import annotations
import csv
from typing import Union
import a3_part2_recommendations
class ReviewScorePredictor:
    """Abstract base for graph-backed predictors of book review scores.

    This class only stores the review graph; it should be subclassed, with
    each subclass overriding predict_review_score to implement a particular
    prediction algorithm.

    Instance Attributes:
        - graph: The book review graph that this entity uses to make predictions.
    """
    graph: a3_part2_recommendations.WeightedGraph

    def __init__(self, graph: a3_part2_recommendations.WeightedGraph) -> None:
        """Initialize a new ReviewScorePredictor backed by the given graph."""
        self.graph = graph

    def predict_review_score(self, user: str, book: str) -> int:
        """Return the score (1-5) that the given user would give the given book.

        Contract for subclasses: when the graph already has an edge between
        the user and the book, return that edge's score; otherwise return a
        predicted score.

        This base implementation is abstract and always raises
        NotImplementedError.

        Preconditions:
            - user in self.graph._vertices
            - book in self.graph._vertices
        """
        raise NotImplementedError
class FiveStarPredictor(ReviewScorePredictor):
    """A book review predictor whose prediction is always five stars.

    Neither the user nor the book influences the predicted value.
    """

    def predict_review_score(self, user: str, book: str) -> int:
        """Return the score that the given user would give the given book.

        An existing review (an edge between user and book in the graph) takes
        priority and its score is returned unchanged. Without one, the
        prediction is always 5.

        Preconditions:
            - user in self.graph._vertices
            - book in self.graph._vertices
        """
        if not self.graph.adjacent(user, book):
            # No recorded review: fall back to the constant prediction.
            return 5
        return self.graph.get_weight(user, book)
class BookAverageScorePredictor(ReviewScorePredictor):
    """A book review predictor that predicts the book's average score.

    The prediction is the same for every user; user preferences are ignored.
    """

    def predict_review_score(self, user: str, book: str) -> int:
        """Return the score that the given user would give the given book.

        An existing review (an edge between user and book in the graph) takes
        priority and its score is returned unchanged. Without one, the
        prediction is the book's average review score in the graph, rounded
        to the nearest integer with the built-in `round` function.

        Preconditions:
            - user in self.graph._vertices
            - book in self.graph._vertices
            - the given book has at least one review
        """
        already_reviewed = self.graph.adjacent(user, book)
        if already_reviewed:
            return self.graph.get_weight(user, book)

        book_average = self.graph.average_weight(book)
        return round(book_average)
################################################################################
# Part 2, Q3
################################################################################
class SimilarUserPredictor(ReviewScorePredictor):
    """A book review predictor that makes a prediction based on how similar users rated the book.

    Representation Invariants:
        - self._score_type in {'unweighted', 'strict'}
    """
    # Private Instance Attributes:
    #   - _score_type: the type of similarity score to use when computing similarity score
    _score_type: str

    def __init__(self, graph: a3_part2_recommendations.WeightedGraph,
                 score_type: str = 'unweighted') -> None:
        """Initialize a new SimilarUserPredictor.

        score_type selects which similarity score is used to weight other
        users' reviews; it is passed through to the graph's
        get_similarity_score method.

        Preconditions:
            - score_type in {'unweighted', 'strict'}
        """
        # Set _score_type first so the representation invariant already holds
        # when the superclass initializer runs (avoids a python_ta.contracts error).
        self._score_type = score_type
        super().__init__(graph)

    def predict_review_score(self, user: str, book: str) -> int:
        """Predict the score that the given user would give the given book.

        If there is already an edge between the given user and book in the graph,
        return that score. Otherwise, return the book's WEIGHTED review score among
        all users who have read the book, where the weight used is the similarity
        score of the reviewing user with the given user. self._score_type is used
        to determine which similarity score to use for the weights.

        As usual, round this score using the built-in `round` function.

        For example, suppose there are three users A, B, C who have read the book,
        and one, D, who has not. We want to use the review scores of A, B, and C to predict
        the rating for D. The three user ratings and weighted similarity score with D
        are shown in this table:

        | User | Review score | Weighted similarity score with D |
        | ---- | ------------ | -------------------------------- |
        | A    | 3            | 0.4                              |
        | B    | 5            | 0.1                              |
        | C    | 2            | 0.3                              |

        Then the predicted review for D equals:

            (3 * 0.4 + 5 * 0.1 + 2 * 0.3) / (0.4 + 0.1 + 0.3) = 2.875

        and so this function would return 3.

        If the total similarity score from all of the book's reviewers is 0,
        then instead return the book's average review score (same as BookAverageScorePredictor).

        Preconditions:
            - user in self.graph._vertices
            - book in self.graph._vertices
            - the given book has at least one review
        """
        if self.graph.adjacent(user, book):
            # The user already reviewed this book: use the recorded score.
            return self.graph.get_weight(user, book)

        reviewers = self.graph.get_neighbours(book)

        weighted_score_sum = 0  # sum of (review score * similarity to `user`)
        similarity_sum = 0      # sum of similarities; the weighted-mean denominator
        plain_score_sum = 0     # unweighted sum, for the zero-similarity fallback

        for reviewer in reviewers:
            # The weight is how similar this reviewer is to the *target user*
            # (not to the book), using the configured similarity score type.
            similarity = self.graph.get_similarity_score(reviewer, user, self._score_type)
            review_score = self.graph.get_weight(reviewer, book)

            weighted_score_sum += review_score * similarity
            similarity_sum += similarity
            plain_score_sum += review_score

        if similarity_sum == 0:
            # No reviewer is similar to the user at all: fall back to the
            # book's plain average review score.
            return round(plain_score_sum / len(reviewers))

        return round(weighted_score_sum / similarity_sum)
################################################################################
# Part 2, Q4
################################################################################
def evaluate_predictor(predictor: ReviewScorePredictor,
                       test_file: str, book_names_file: str) -> dict[str, Union[int, float]]:
    """Evaluate the given ReviewScorePredictor on the given test file.

    Read in each row of the given test_file (which contains a book, user, and
    review score). For each row, use the given predictor to make a prediction of the review
    score, and compare that prediction against the actual given review score from the file.

    The book column of test_file holds a book ID; it is translated to the name
    used in the graph via book_names_file before the predictor is called.

    Return a dictionary summarizing the performance of the predictor. This dictionary
    has the following keys:
        - 'num_reviews': the total number of predicted review scores (equal to the
          number of lines in the CSV file)
        - 'num_correct': the number of predicted review scores that exactly matched the
          actual review score
        - 'average_error': the average of the *absolute value difference* between
          predicted and actual review scores across all reviews in the test file

    Preconditions:
        - test_file is the path to a CSV file corresponding to the book review data
          format described on the assignment handout
        - book_names_file is the path to a CSV file corresponding to the book data
          format described on the assignment handout
        - test_file has at least one row
        - all users and books in test_file are in predictor.graph
    """
    # Map each book ID to its book name (each row of the file is one pair).
    with open(book_names_file, 'r', newline='', encoding='UTF-8') as names_f:
        id_to_name: dict[str, str] = dict(csv.reader(names_f))

    num_reviews = 0
    num_correct = 0
    total_error = 0

    with open(test_file, 'r', newline='', encoding='UTF-8') as reviews_f:
        for book_id, user, raw_score in csv.reader(reviews_f):
            # csv.reader yields strings; convert so the score can be compared
            # with, and subtracted from, the integer prediction.
            actual = int(raw_score)
            predicted = predictor.predict_review_score(user, id_to_name[book_id])

            num_reviews += 1
            if predicted == actual:
                num_correct += 1
            total_error += abs(actual - predicted)

    return {
        'num_reviews': num_reviews,
        'num_correct': num_correct,
        'average_error': total_error / num_reviews,
    }
if __name__ == '__main__':
    # Contract checking is left disabled below. It can be switched on for
    # code checking/debugging, but checking representation invariants and
    # preconditions greatly increases running time, so keep it commented out
    # when working with the large datasets.
    # import python_ta.contracts
    # python_ta.contracts.check_all_contracts()

    import python_ta

    # Options passed to the python_ta checker for this module.
    ta_config = {
        'max-line-length': 1000,
        'disable': ['E1136'],
        'extra-imports': ['csv', 'a3_part2_recommendations'],
        'allowed-io': ['evaluate_predictor'],
        'max-nested-blocks': 4
    }
    python_ta.check_all(config=ta_config)