Spaces:
Sleeping
Sleeping
Upload 2 files
Browse files- .gitattributes +1 -0
- BX-Book-Ratings.csv +3 -0
- app.py +128 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
BX-Book-Ratings.csv filter=lfs diff=lfs merge=lfs -text
|
BX-Book-Ratings.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f359084e5f350151b61f3d07acbe3c6ff0e2bd057d88901fa60507449d0e0e57
|
3 |
+
size 30682276
|
app.py
ADDED
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
|
5 |
+
from typing import List, Tuple
|
6 |
+
import faiss
|
7 |
+
from faiss import write_index, read_index
|
8 |
+
import gradio as gr
|
9 |
+
from fuzzywuzzy import process
|
10 |
+
|
11 |
+
# Module-level cache shared by load_and_prepare_data() (writer) and
# recommend_books() (reader). Every slot is None until the first load:
#   dataset         - merged ratings/books DataFrame
#   faiss_index     - FAISS index over the per-book rating vectors
#   normalized_data - the row-normalised per-book rating vectors
#   book_titles     - list of book titles (columns of the rating matrix)
dataset = faiss_index = normalized_data = book_titles = None
|
16 |
+
|
17 |
+
|
18 |
+
def load_data(ratings_path: str, books_path: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Read the ratings and books CSV files.

    Both files are parsed as ``;``-separated, cp1251-encoded text.

    Args:
        ratings_path: Path to the ratings CSV; must contain a
            ``Book-Rating`` column.
        books_path: Path to the books CSV.

    Returns:
        ``(ratings, books)`` where ``ratings`` only keeps rows whose
        ``Book-Rating`` is not 0 and ``books`` omits lines pandas could
        not parse.
    """
    csv_format = {'encoding': 'cp1251', 'sep': ';'}

    all_ratings = pd.read_csv(ratings_path, **csv_format)
    # Rows rated 0 are discarded (presumably "no explicit rating" in this
    # dataset -- confirm against the data description).
    has_rating = all_ratings['Book-Rating'] != 0
    explicit_ratings = all_ratings[has_rating]

    # Malformed lines in the books file are skipped instead of raising.
    book_table = pd.read_csv(books_path, on_bad_lines='skip', **csv_format)
    return explicit_ratings, book_table
|
23 |
+
|
24 |
+
|
25 |
+
def preprocess_data(ratings: pd.DataFrame, books: pd.DataFrame) -> pd.DataFrame:
    """Join ratings with book metadata and lower-case every text column.

    Args:
        ratings: Ratings frame; must contain an ``ISBN`` column.
        books: Books frame; must contain an ``ISBN`` column.

    Returns:
        The inner merge of both frames on ``ISBN`` in which every
        text column (``object`` dtype or a pandas string dtype) has been
        lower-cased. Other columns are returned unchanged.
    """
    merged = pd.merge(ratings, books, on=['ISBN'])

    def _lower_if_text(column: pd.Series) -> pd.Series:
        # Text may be stored either as ``object`` or as a dedicated pandas
        # string dtype; checking only for ``object`` would silently leave
        # string-dtype columns in their original case.
        dtype = column.dtype
        if dtype == 'object' or isinstance(dtype, pd.StringDtype):
            # NOTE: ``.str.lower()`` turns non-string values inside an
            # object column into NaN.
            return column.str.lower()
        return column

    return merged.apply(_lower_if_text)
|
28 |
+
|
29 |
+
|
30 |
+
def get_books_to_compare(data: pd.DataFrame, min_ratings: int = 8) -> List[str]:
    """List the titles that have at least ``min_ratings`` ratings.

    Args:
        data: Frame with ``Book-Title`` and ``User-ID`` columns, one row
            per rating.
        min_ratings: Minimum number of non-null ``User-ID`` entries a
            title needs in order to be kept.

    Returns:
        The qualifying titles, in the order produced by the group-by.
    """
    ratings_per_title = data.groupby('Book-Title')['User-ID'].count()
    enough_ratings = ratings_per_title.ge(min_ratings)
    kept = ratings_per_title.loc[enough_ratings]
    return kept.index.tolist()
|
33 |
+
|
34 |
+
|
35 |
+
def prepare_correlation_dataset(data: pd.DataFrame, books_to_compare: List[str]) -> pd.DataFrame:
    """Build a user-by-book rating matrix for the selected titles.

    Args:
        data: Frame with ``User-ID``, ``Book-Rating`` and ``Book-Title``
            columns.
        books_to_compare: Titles to keep; all other rows are ignored.

    Returns:
        A frame indexed by ``User-ID`` with one column per title. Each
        cell is the mean rating that user gave the title, or 0 where the
        user has no rating for it.
    """
    wanted_columns = ['User-ID', 'Book-Rating', 'Book-Title']
    is_selected = data['Book-Title'].isin(books_to_compare)
    selected = data.loc[is_selected, wanted_columns]

    # Collapse repeated (user, title) pairs to a single averaged rating so
    # the pivot below has unique index/column combinations.
    per_user_title = selected.groupby(['User-ID', 'Book-Title'])['Book-Rating'].mean()
    averaged = per_user_title.reset_index()

    matrix = averaged.pivot(index='User-ID', columns='Book-Title', values='Book-Rating')
    return matrix.fillna(0)
|
39 |
+
|
40 |
+
|
41 |
+
def build_faiss_index(data: pd.DataFrame) -> Tuple[faiss.IndexFlatIP, np.ndarray]:
    """Create (or reuse) an inner-product FAISS index over the columns of ``data``.

    Each column of ``data`` becomes one vector, scaled to unit L2 norm so
    that the inner product computed by the index is a cosine similarity.
    The index is cached on disk as ``books.index`` in the current working
    directory.

    Args:
        data: Matrix whose columns are the items to index.

    Returns:
        ``(index, normalized_data)`` where ``normalized_data`` has one
        unit-length row per column of ``data``.
    """
    vectors = data.T.values
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    # An all-zero vector has norm 0; dividing by it would yield NaNs that
    # poison every similarity. Leave such rows as zeros instead.
    safe_norms = np.where(norms == 0, 1.0, norms)
    normalized_data = vectors / safe_norms

    index_file = "books.index"
    vector_count, dimension = normalized_data.shape

    if os.path.exists(index_file):
        cached = read_index(index_file)
        # Only trust the cache when its shape matches the current matrix.
        # A stale file (different data or thresholds) would return positions
        # that no longer line up with the caller's titles. This is a shape
        # check only; same-shaped but different content is not detected.
        if cached.ntotal == vector_count and cached.d == dimension:
            return cached, normalized_data

    index = faiss.IndexFlatIP(dimension)
    index.add(normalized_data.astype('float32'))
    write_index(index, index_file)
    return index, normalized_data
|
54 |
+
|
55 |
+
|
56 |
+
def compute_correlations_faiss(index: faiss.IndexFlatIP, data: np.ndarray, book_titles: List[str],
                               target_book: str) -> pd.DataFrame:
    """Rank every indexed book by similarity to ``target_book``.

    Args:
        index: FAISS index to query.
        data: Per-book vectors; row ``i`` belongs to ``book_titles[i]``.
        book_titles: Titles in the same order as the rows of ``data``.
        target_book: Title to compare against; must be in ``book_titles``.

    Returns:
        A frame with columns ``book``, ``corr`` (the score returned by the
        index) and ``avg_rating``, sorted by ``corr`` descending.

    Raises:
        ValueError: If ``target_book`` is not in ``book_titles``.
    """
    query_position = book_titles.index(target_book)
    query = data[query_position].reshape(1, -1)

    # Ask for as many neighbours as there are titles, i.e. a full ranking.
    neighbour_count = len(book_titles)
    scores, positions = index.search(query.astype('float32'), neighbour_count)
    ranked_positions = positions[0]

    # NOTE(review): this is the row mean of ``data`` as passed in. When the
    # caller supplies normalised vectors it is not an average of raw ratings.
    row_means = np.mean(data, axis=1)

    ranking = pd.DataFrame({
        'book': [book_titles[position] for position in ranked_positions],
        'corr': scores[0],
        'avg_rating': row_means[ranked_positions],
    })
    return ranking.sort_values('corr', ascending=False)
|
69 |
+
|
70 |
+
|
71 |
+
def load_and_prepare_data():
    """Populate the module-level recommender state.

    Reads the two CSV files, builds the user-by-book rating matrix and the
    FAISS index, and stores the results in the globals ``dataset``,
    ``faiss_index``, ``normalized_data`` and ``book_titles``.
    """
    global dataset, faiss_index, normalized_data, book_titles

    # Both files are read from paths relative to the working directory.
    ratings_file = "BX-Book-Ratings.csv"
    books_file = "BX-Books.csv"

    raw_ratings, raw_books = load_data(ratings_file, books_file)
    dataset = preprocess_data(raw_ratings, raw_books)

    rating_matrix = prepare_correlation_dataset(dataset, get_books_to_compare(dataset))

    faiss_index, normalized_data = build_faiss_index(rating_matrix)
    # Column order of the matrix is the row order of the indexed vectors.
    book_titles = rating_matrix.columns.tolist()
|
84 |
+
|
85 |
+
|
86 |
+
def recommend_books(target_book: str, num_recommendations: int = 10) -> str:
    """Return a formatted list of books similar to ``target_book``.

    The input is fuzzy-matched against the known titles; recommendations
    are computed for the closest match.

    Args:
        target_book: Title typed by the user (case-insensitive).
        num_recommendations: Maximum number of recommendations to list.

    Returns:
        A human-readable, newline-separated ranking, or a "no close match"
        message when the best fuzzy score is below 50 or there are no
        titles to match against.
    """
    global dataset, faiss_index, normalized_data, book_titles

    # Load lazily on first use.
    if dataset is None or faiss_index is None or normalized_data is None or book_titles is None:
        load_and_prepare_data()

    target_book = (target_book or "").strip().lower()
    # UI sliders may deliver a float; DataFrame.head() needs an int.
    num_recommendations = int(num_recommendations)

    # Fuzzy match the input to the closest book title. extractOne returns
    # None when there is nothing to match against.
    match = process.extractOne(target_book, book_titles)
    if match is None or match[1] < 50:  # You can adjust this threshold
        return f"No close match found for '{target_book}'. Please try a different title."
    closest_match, score = match[0], match[1]

    if closest_match != target_book:
        prefix = f"Closest match: '{closest_match}' (similarity: {score}%)\n\n"
    else:
        prefix = ""

    correlations = compute_correlations_faiss(faiss_index, normalized_data, book_titles, closest_match)

    # Exclude the book the ranking was computed for (the matched title, not
    # the raw user input), otherwise it recommends itself at the top.
    others = correlations[correlations['book'] != closest_match]
    recommendations = others.head(num_recommendations)

    header = f"Top {num_recommendations} recommendations for '{closest_match}':\n\n"
    body = "".join(
        f"{rank}. {row.book} (Correlation: {row.corr:.2f})\n"
        for rank, row in enumerate(recommendations.itertuples(index=False), 1)
    )
    # Keep the closest-match notice in front of the list.
    return prefix + header + body
|
113 |
+
|
114 |
+
|
115 |
+
# Gradio UI: a title box and a result-count slider feed recommend_books(),
# whose text result is shown in a single output box.
iface = gr.Interface(
    title="Book Recommender",
    description="Enter a book title to get recommendations based on user ratings and book similarities.",
    fn=recommend_books,
    inputs=[
        gr.Textbox(label="Enter a book title"),
        gr.Slider(
            label="Number of recommendations",
            minimum=1,
            maximum=20,
            step=1,
            value=10,
        ),
    ],
    outputs=gr.Textbox(label="Recommendations"),
)

# Start serving the interface (a public share link is requested).
iface.launch(share=True)
|