Spaces:
Runtime error
Runtime error
Wintersmith
committed on
Commit
•
3b528be
1
Parent(s):
dd9a11d
Upload 3 files
Browse files- app.py +44 -0
- recommender_system.py +80 -0
- requirements.txt +0 -0
app.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import pandas as pd
|
3 |
+
import boto3
|
4 |
+
import dotenv
|
5 |
+
import os
|
6 |
+
|
7 |
+
from recommender_system import match_books, recommend_books
|
8 |
+
|
9 |
+
# Load variables from a .env file into the process environment; the AWS
# credentials below are then read from the environment.
dotenv.load_dotenv()

# Initialize S3 client and load data.
# This runs at import time: the ratings table is fetched once and kept in
# the module-level `dataframe`, which the Gradio callback reads.
s3 = boto3.client(
    's3',
    aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY'),
)
bucket_name = 'martinbucket1'
obj_data = s3.get_object(Bucket=bucket_name, Key="Processed_data.csv")
dataframe = pd.read_csv(
    obj_data["Body"],
    encoding='cp1251',
    sep=',',
    low_memory=False,
)
|
18 |
+
|
19 |
+
|
20 |
+
def recommend_books_interface(selected_book) -> tuple:
    """Gradio callback: resolve the typed title and build its recommendations.

    Args:
        selected_book: Text entered by the user in the title box.

    Returns:
        A ``(table, message)`` pair. When a title is matched, ``table`` is the
        DataFrame returned by ``recommend_books`` and ``message`` names the
        matched title. Otherwise ``table`` is a one-row DataFrame with an
        ``Error`` column and ``message`` reports that nothing was found.
    """
    title = match_books(selected_book, dataframe)

    # Guard clause: nothing matched, so return the error table right away.
    if not title:
        return pd.DataFrame({"Error": ["No matching book found"]}), "No books found"

    recommendations = recommend_books(dataframe, title)
    return recommendations, f"Recommending these books based on your interest in: {title}"
|
28 |
+
|
29 |
+
|
30 |
+
# Gradio interface: one text box in, a results table plus a status message out.
inputs = gr.Textbox(lines=1, placeholder="Type a book title here...")
message_output = gr.Markdown()
outputs = gr.Dataframe()

# Presentation options are collected in one place and unpacked into the
# Interface constructor below.
interface_options = {
    "title": "Book Recommender System",
    "description": "Enter a book title to get recommendations based on similarity.",
    "fill_width": True,
    "flagging_mode": 'never',
    "theme": gr.themes.Soft(),
}

demo = gr.Interface(
    fn=recommend_books_interface,
    inputs=inputs,
    # Order matters: it must match the (table, message) tuple the callback returns.
    outputs=[outputs, message_output],
    **interface_options,
)


if __name__ == "__main__":
    # NOTE(review): share=True asks Gradio for a public share link; confirm
    # this is wanted in the environment where the app is deployed.
    demo.launch(share=True)
|
recommender_system.py
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
from fuzzywuzzy import process
|
4 |
+
|
5 |
+
|
6 |
+
def match_books(user_input: str, df: pd.DataFrame, min_score: float = 0.8):
    """Return the title in ``df`` that best fuzzy-matches ``user_input``.

    Args:
        user_input: Free text typed by the user.
        df: DataFrame with a ``'Book-Title'`` column to search.
        min_score: Minimum acceptable similarity. Values in ``[0, 1]`` are
            read as a fraction (``0.8`` means 80 out of 100); larger values
            are read directly on fuzzywuzzy's 0-100 scale.

    Returns:
        The best-matching title, or ``None`` when the input is blank, there
        are no titles to search, or the best score is below the minimum.
    """
    # Blank input cannot identify a book; skip the (expensive) fuzzy search.
    if not user_input or not user_input.strip():
        return None

    # fuzzywuzzy reports scores as integers from 0 to 100. Comparing such a
    # score against a raw 0.8 would accept almost any match, so fractional
    # thresholds are scaled onto the same 0-100 range first.
    cutoff = min_score * 100 if 0 <= min_score <= 1 else min_score

    # Missing titles are not strings and cannot be scored, so drop them.
    book_titles = df['Book-Title'].dropna().unique()

    # Use process.extractOne to get the best match as (title, score);
    # it yields None when there are no candidates at all.
    best_match = process.extractOne(user_input, book_titles)
    if best_match is None or best_match[1] < cutoff:
        return None
    return best_match[0]
|
16 |
+
|
17 |
+
|
18 |
+
def recommend_books(df: pd.DataFrame, book_to_be_recommended: str) -> pd.DataFrame:
    """Recommend books whose ratings correlate with those of a given book.

    The function finds the users who rated ``book_to_be_recommended``, keeps
    the books those users rated most often, and computes the Pearson
    correlation between each kept book's ratings and the given book's ratings
    across those users.

    Args:
        df: Ratings table with ``'User-ID'``, ``'Book-Title'`` and
            ``'Book-Rating'`` columns.
        book_to_be_recommended: Exact title to base the recommendations on.

    Returns:
        A DataFrame with columns ``'Book-Title'``, ``'Correlation'`` and
        ``'Average ratings'``, sorted by correlation in descending order,
        without the given book itself and limited to 10 rows.
        ``'Average ratings'`` is computed only from the ratings given by
        readers of the given book. Books whose correlation is undefined
        (NaN) sort last. An empty DataFrame with the same columns is
        returned when nobody in ``df`` rated the given book.
    """
    result_columns = ['Book-Title', 'Correlation', 'Average ratings']

    # Get relevant dataset of book's readers
    is_target = df['Book-Title'] == book_to_be_recommended
    book_readers = df.loc[is_target, 'User-ID'].unique()
    if len(book_readers) == 0:
        # Unknown title: nothing to correlate against.
        return pd.DataFrame(columns=result_columns)

    # Every rating made by those readers, and how many rows each book has.
    books_of_book_readers = df[df['User-ID'].isin(book_readers)]
    ratings_per_book = books_of_book_readers.groupby('Book-Title')['User-ID'].count()

    # Keep only the most-rated books while retaining at least 12 of them
    # (the target plus enough candidates for 10 final records). Using the
    # 12th-largest count as the cutoff selects the same set as raising a
    # threshold one step at a time until fewer than 12 books remain; ties at
    # the cutoff are all kept.
    keep_at_least = 12
    if len(ratings_per_book) >= keep_at_least:
        cutoff = ratings_per_book.sort_values(ascending=False).iloc[keep_at_least - 1]
        keep = (ratings_per_book >= cutoff).to_numpy()
        # The target must survive the filter, otherwise there is no column
        # to correlate the other books with.
        keep = keep | (ratings_per_book.index == book_to_be_recommended)
        books_to_compare = ratings_per_book.index[keep]
    else:
        books_to_compare = ratings_per_book.index

    ratings_data_raw = books_of_book_readers.loc[
        books_of_book_readers['Book-Title'].isin(books_to_compare),
        ['User-ID', 'Book-Rating', 'Book-Title'],
    ]

    # A user may have several rows for the same title; collapse them to the
    # mean so the pivot below has exactly one value per (user, book) cell.
    ratings_data_raw_nodup = (
        ratings_data_raw.groupby(['User-ID', 'Book-Title'])['Book-Rating']
        .mean()
        .reset_index()
    )

    # Rows are users, columns are books, cells are ratings (NaN if unrated).
    dataset_for_corr = ratings_data_raw_nodup.pivot(
        index='User-ID', columns='Book-Title', values='Book-Rating'
    )
    if book_to_be_recommended not in dataset_for_corr.columns:
        return pd.DataFrame(columns=result_columns)

    # Pearson correlation of every book column with the target's column.
    correlations = dataset_for_corr.corrwith(
        dataset_for_corr[book_to_be_recommended], method='pearson'
    )

    # Average rating of each compared book among the target's readers.
    average_ratings = (
        ratings_data_raw_nodup.groupby('Book-Title')['Book-Rating'].mean().reset_index()
    )

    correlations_df = pd.DataFrame({
        'Book-Title': correlations.index,
        'Correlation': correlations.values,
    })
    correlations_df = pd.merge(correlations_df, average_ratings, on='Book-Title')
    correlations_df = correlations_df.rename(columns={'Book-Rating': 'Average ratings'})

    # Sort by correlation value, then drop the target itself from the list.
    correlations_df = correlations_df.sort_values('Correlation', ascending=False)
    correlations_df = correlations_df[correlations_df['Book-Title'] != book_to_be_recommended]
    return correlations_df.head(10)
|
requirements.txt
ADDED
Binary file (2.33 kB). View file
|
|