# Book_recommender / recommender_system.py
# Wintersmith's picture
# Upload recommender_system.py
# 88d6858 verified
import pandas as pd
import numpy as np
from fuzzywuzzy import process
def match_books(user_input: str, df: pd.DataFrame, min_score: float = 0.8):
    """Return the title in ``df`` that best fuzzy-matches ``user_input``.

    Args:
        user_input: Free-text title typed by the user.
        df: Frame with a ``'Book-Title'`` column to search.
        min_score: Minimum similarity required to accept the best match.
            Values in ``[0, 1]`` are read as a fraction (``0.8`` means 80 %);
            values above ``1`` are read as already being on the 0-100 scale
            used by fuzzywuzzy.

    Returns:
        The best-matching title, or ``None`` when the input is empty, there
        are no titles to search, or the best score is below ``min_score``.
    """
    # fuzzywuzzy reports scores as integers in 0..100.  Comparing them
    # directly with the fractional default (0.8) accepted almost any match,
    # so bring the threshold onto the same scale first.
    cutoff = min_score * 100 if 0 <= min_score <= 1 else min_score

    # Missing titles are dropped: they are not strings and cannot be scored.
    book_titles = df['Book-Title'].dropna().unique()
    if not user_input or len(book_titles) == 0:
        return None

    # extractOne returns a (title, score) pair, or None if nothing was scored.
    best_match = process.extractOne(user_input, book_titles)
    if best_match is not None and best_match[1] >= cutoff:
        return best_match[0]
    return None
def recommend_books(
    df: pd.DataFrame,
    book_to_be_recommended: str,
    *,
    top_n: int = 10,
) -> pd.DataFrame:
    """Recommend books whose ratings correlate with those of a given book.

    The function finds the users who rated ``book_to_be_recommended``, keeps
    the books most often rated by those users, and computes the Pearson
    correlation between each of those books and the given one.

    Args:
        df: Ratings frame with ``'User-ID'``, ``'Book-Title'`` and
            ``'Book-Rating'`` columns.
        book_to_be_recommended: Exact title to base the recommendations on.
        top_n: Maximum number of recommendations to return.

    Returns:
        A frame with the columns ``'Book-Title'``, ``'Correlation [%]'``
        (Pearson correlation times 100, rounded to two decimals) and
        ``'Average ratings'``, sorted by descending correlation and limited
        to ``top_n`` rows.  The frame is empty when the title has no readers
        in ``df``.

    Raises:
        ValueError: If ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError("top_n must be a positive integer")

    result_columns = ['Book-Title', 'Correlation [%]', 'Average ratings']

    # Users who rated the requested book.
    is_target = df['Book-Title'] == book_to_be_recommended
    reader_ids = df.loc[is_target, 'User-ID'].unique()
    if len(reader_ids) == 0:
        # Unknown title: nothing to correlate against.
        return pd.DataFrame(columns=result_columns)

    # Everything those users rated, and how many rating rows each book has.
    readers_df = df[df['User-ID'].isin(reader_ids)]
    ratings_per_book = readers_df.groupby('Book-Title')['User-ID'].count()

    # Use the highest rating-count cutoff that still leaves at least
    # ``top_n + 2`` books (the target, ``top_n`` results and one spare).
    # That cutoff is the count of the (top_n + 2)-th most-rated book; with
    # fewer books than that, keep them all.
    pool_size = top_n + 2
    if len(ratings_per_book) >= pool_size:
        cutoff = ratings_per_book.nlargest(pool_size).iloc[-1]
        candidates = ratings_per_book.index[ratings_per_book >= cutoff]
    else:
        candidates = ratings_per_book.index
    # The target must stay in the set even if duplicate rating rows pushed
    # other books' counts above its own.
    candidates = candidates.union([book_to_be_recommended])

    ratings = readers_df.loc[
        readers_df['Book-Title'].isin(candidates),
        ['User-ID', 'Book-Rating', 'Book-Title'],
    ]
    # Collapse repeated ratings of one book by one user to their mean so the
    # pivot below has a unique (user, book) key.
    user_book_ratings = (
        ratings.groupby(['User-ID', 'Book-Title'])['Book-Rating']
        .mean()
        .reset_index()
    )
    rating_matrix = user_book_ratings.pivot(
        index='User-ID', columns='Book-Title', values='Book-Rating'
    )
    if book_to_be_recommended not in rating_matrix.columns:
        # Can happen when the target's rows carry no usable User-ID.
        return pd.DataFrame(columns=result_columns)

    # Correlate every candidate with the target over the users they share.
    correlations = rating_matrix.corrwith(
        rating_matrix[book_to_be_recommended], method='pearson'
    )
    average_ratings = (
        user_book_ratings.groupby('Book-Title')['Book-Rating']
        .mean()
        .reset_index()
    )

    recommendations = pd.DataFrame({
        'Book-Title': correlations.index,
        'Correlation [%]': correlations.values,
    })
    recommendations = recommendations.merge(average_ratings, on='Book-Title')
    recommendations = recommendations.rename(
        columns={'Book-Rating': 'Average ratings'}
    )
    recommendations = recommendations.sort_values(
        'Correlation [%]', ascending=False
    )
    # Express the correlation as a percentage with two decimals.
    recommendations['Correlation [%]'] = (
        recommendations['Correlation [%]'] * 100
    ).round(2)
    # The target correlates perfectly with itself; it is not a recommendation.
    recommendations = recommendations[
        recommendations['Book-Title'] != book_to_be_recommended
    ]
    return recommendations.head(top_n)