danschnurp committed on
Commit
53f5531
·
verified ·
1 Parent(s): 78535dd

Upload 2 files

Browse files
Files changed (3) hide show
  1. .gitattributes +1 -0
  2. BX-Book-Ratings.csv +3 -0
  3. app.py +128 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ BX-Book-Ratings.csv filter=lfs diff=lfs merge=lfs -text
BX-Book-Ratings.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f359084e5f350151b61f3d07acbe3c6ff0e2bd057d88901fa60507449d0e0e57
3
+ size 30682276
app.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ import numpy as np
4
+
5
+ from typing import List, Tuple
6
+ import faiss
7
+ from faiss import write_index, read_index
8
+ import gradio as gr
9
+ from fuzzywuzzy import process
10
+
11
# Module-level caches. They start empty and are filled in by
# load_and_prepare_data() the first time a recommendation is requested.
dataset: "pd.DataFrame | None" = None          # output of preprocess_data()
faiss_index: "faiss.IndexFlatIP | None" = None  # index from build_faiss_index()
normalized_data: "np.ndarray | None" = None    # vectors from build_faiss_index()
book_titles: "List[str] | None" = None         # column labels of the rating matrix
16
+
17
+
18
def load_data(ratings_path: str, books_path: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Read the ratings and books CSV files.

    Both files are parsed as ``;``-separated, cp1251-encoded text. Rows whose
    ``Book-Rating`` is 0 are removed from the ratings table, and malformed
    lines in the books file are skipped instead of raising.

    Args:
        ratings_path: Path to the ratings CSV (needs a ``Book-Rating`` column).
        books_path: Path to the books CSV.

    Returns:
        A ``(ratings, books)`` tuple of DataFrames.
    """
    csv_options = {'encoding': 'cp1251', 'sep': ';'}

    all_ratings = pd.read_csv(ratings_path, **csv_options)
    has_rating = all_ratings['Book-Rating'] != 0
    rated_only = all_ratings[has_rating]

    book_table = pd.read_csv(books_path, on_bad_lines='skip', **csv_options)
    return rated_only, book_table
23
+
24
+
25
def _lowercase_text(column: pd.Series) -> pd.Series:
    """Lower-case the string values of *column*, leaving everything else as is."""
    if isinstance(column.dtype, pd.StringDtype):
        # Dedicated string dtype: every non-missing value is a str.
        return column.str.lower()
    if column.dtype == object:
        # Object columns may mix strings with numbers; ``.str.lower()`` would
        # turn the non-string values into NaN, so lower-case element-wise.
        return column.map(lambda value: value.lower() if isinstance(value, str) else value)
    return column


def preprocess_data(ratings: pd.DataFrame, books: pd.DataFrame) -> pd.DataFrame:
    """Join ratings with book metadata and lower-case all text values.

    Args:
        ratings: Ratings table containing an ``ISBN`` column.
        books: Books table containing an ``ISBN`` column.

    Returns:
        The rows of ``ratings`` and ``books`` merged on ``ISBN`` (pandas'
        default inner join), with every string value lower-cased. Non-string
        values in text columns are preserved unchanged.
    """
    merged = pd.merge(ratings, books, on=['ISBN'])
    return merged.apply(_lowercase_text)
28
+
29
+
30
def get_books_to_compare(data: pd.DataFrame, min_ratings: int = 8) -> List[str]:
    """List the titles that have at least ``min_ratings`` ratings.

    Args:
        data: Table with ``Book-Title`` and ``User-ID`` columns, one row per
            rating.
        min_ratings: Minimum number of non-null ``User-ID`` entries a title
            needs in order to be kept.

    Returns:
        The qualifying titles, in the order produced by the group-by.
    """
    rating_counts = data.groupby('Book-Title')['User-ID'].count()
    enough_ratings = rating_counts.to_numpy() >= min_ratings
    return rating_counts.index[enough_ratings].tolist()
33
+
34
+
35
def prepare_correlation_dataset(data: pd.DataFrame, books_to_compare: List[str]) -> pd.DataFrame:
    """Build the user-by-title rating matrix for the selected books.

    Args:
        data: Table with ``User-ID``, ``Book-Title`` and ``Book-Rating``
            columns.
        books_to_compare: Titles to keep; all other rows are dropped.

    Returns:
        A DataFrame indexed by ``User-ID`` with one column per title. Each
        cell holds the user's mean rating for that title, or 0 where the user
        has no rating for it.
    """
    needed_columns = ['User-ID', 'Book-Rating', 'Book-Title']
    is_selected = data['Book-Title'].isin(books_to_compare)
    selected = data.loc[is_selected, needed_columns]

    # A user may have rated the same title more than once; collapse to the mean.
    mean_per_pair = selected.groupby(['User-ID', 'Book-Title'], as_index=False)['Book-Rating'].mean()

    rating_matrix = mean_per_pair.pivot(index='User-ID', columns='Book-Title', values='Book-Rating')
    return rating_matrix.fillna(0)
39
+
40
+
41
def build_faiss_index(data: pd.DataFrame) -> Tuple[faiss.IndexFlatIP, np.ndarray]:
    """Create (or reuse) an inner-product FAISS index over the columns of *data*.

    Each column of ``data`` becomes one vector, scaled to unit L2 norm so that
    the inner product computed by the index equals cosine similarity.

    The index is cached on disk as ``books.index``. A cached file is reused
    only when its dimension and vector count match the current data;
    otherwise it is rebuilt and overwritten.

    Args:
        data: Matrix whose columns are the items to index.

    Returns:
        ``(index, unit_vectors)`` where ``unit_vectors`` has one row per
        column of ``data``.
    """
    item_vectors = data.T.values
    norms = np.linalg.norm(item_vectors, axis=1)[:, np.newaxis]
    # An all-zero vector has norm 0; dividing by it would fill the row with
    # NaN. Dividing by 1 instead keeps the row as zeros.
    norms[norms == 0] = 1.0
    unit_vectors = item_vectors / norms
    n_items, dimension = unit_vectors.shape

    index_file = "books.index"
    if os.path.exists(index_file):
        cached_index = read_index(index_file)
        # Only trust the cache if it was built for data of the same shape;
        # a mismatched index would return wrong or out-of-range neighbours.
        if cached_index.d == dimension and cached_index.ntotal == n_items:
            return cached_index, unit_vectors

    index = faiss.IndexFlatIP(dimension)
    # FAISS expects C-contiguous float32 input.
    index.add(np.ascontiguousarray(unit_vectors, dtype='float32'))
    write_index(index, index_file)
    return index, unit_vectors
54
+
55
+
56
def compute_correlations_faiss(index: "faiss.IndexFlatIP", data: np.ndarray, book_titles: List[str],
                               target_book: str) -> pd.DataFrame:
    """Rank every indexed book by similarity to ``target_book``.

    Args:
        index: Search index exposing ``search(queries, k)``.
        data: One vector per title, row ``i`` belonging to ``book_titles[i]``.
        book_titles: Titles in the same order as the rows of ``data``.
        target_book: Title to compare against; must be in ``book_titles``.

    Returns:
        A DataFrame with columns ``book``, ``corr`` (the similarity score
        returned by the index) and ``avg_rating`` (the mean of that book's row
        in ``data``), sorted by ``corr`` in descending order.

    Raises:
        ValueError: If ``target_book`` is not in ``book_titles``.
    """
    try:
        target_index = book_titles.index(target_book)
    except ValueError:
        raise ValueError(f"Unknown book title: {target_book!r}") from None

    target_vector = data[target_index].reshape(1, -1)
    k = len(book_titles)
    similarities, neighbor_ids = index.search(target_vector.astype('float32'), k)

    # FAISS pads the result with label -1 when it holds fewer than k vectors.
    # Drop those (and any label beyond the title list) so they are not
    # silently mapped onto a real title through negative indexing.
    labels = neighbor_ids[0]
    valid = (labels >= 0) & (labels < k)
    labels = labels[valid]
    scores = similarities[0][valid]

    avg_ratings = np.mean(data, axis=1)
    corr_df = pd.DataFrame({
        'book': [book_titles[i] for i in labels],
        'corr': scores,
        'avg_rating': avg_ratings[labels],
    })
    # Stable sort keeps the index's own ordering among equal scores.
    return corr_df.sort_values('corr', ascending=False, kind='stable')
69
+
70
+
71
def load_and_prepare_data():
    """Load the CSV data and fill the module-level caches.

    Reads ``BX-Book-Ratings.csv`` and ``BX-Books.csv`` by relative path, then
    sets the globals ``dataset``, ``faiss_index``, ``normalized_data`` and
    ``book_titles`` from the processing pipeline.
    """
    global dataset, faiss_index, normalized_data, book_titles

    # Plain relative file names: nothing is downloaded here, both files must
    # already be present in the working directory.
    ratings_csv, books_csv = "BX-Book-Ratings.csv", "BX-Books.csv"

    dataset = preprocess_data(*load_data(ratings_csv, books_csv))

    # User-by-title rating matrix restricted to sufficiently rated titles.
    rating_matrix = prepare_correlation_dataset(dataset, get_books_to_compare(dataset))

    faiss_index, normalized_data = build_faiss_index(rating_matrix)
    # Column order of the matrix is the row order of the indexed vectors.
    book_titles = rating_matrix.columns.tolist()
84
+
85
+
86
def recommend_books(target_book: str, num_recommendations: int = 10) -> str:
    """Return a formatted list of books similar to ``target_book``.

    The input is lower-cased and fuzzy-matched against the known titles. If
    the best match scores below 50, a "no match" message is returned.
    Otherwise the most similar titles (excluding the matched book itself) are
    listed, preceded by a note when the match differs from what was typed.

    Args:
        target_book: Title (or approximate title) entered by the user.
        num_recommendations: Maximum number of titles to list.

    Returns:
        A human-readable, newline-separated text block.
    """
    if dataset is None or faiss_index is None or normalized_data is None or book_titles is None:
        load_and_prepare_data()

    target_book = (target_book or "").strip().lower()
    # UI sliders can hand over a float such as 10.0; head() and the header
    # both want a plain integer.
    num_recommendations = int(num_recommendations)

    # Fuzzy match the input to the closest book title. extractOne returns
    # None when there is nothing to match against.
    match = process.extractOne(target_book, book_titles) if target_book else None
    if match is None or match[1] < 50:  # You can adjust this threshold
        return f"No close match found for '{target_book}'. Please try a different title."
    closest_match, score = match[0], match[1]

    parts = []
    if closest_match != target_book:
        parts.append(f"Closest match: '{closest_match}' (similarity: {score}%)\n\n")

    correlations = compute_correlations_faiss(faiss_index, normalized_data, book_titles, closest_match)

    # Exclude the matched title itself: it is always its own best neighbour.
    recommendations = correlations[correlations['book'] != closest_match].head(num_recommendations)

    parts.append(f"Top {num_recommendations} recommendations for '{closest_match}':\n\n")
    parts.extend(
        f"{rank}. {row['book']} (Correlation: {row['corr']:.2f})\n"
        for rank, (_, row) in enumerate(recommendations.iterrows(), 1)
    )
    return "".join(parts)
113
+
114
+
115
# Build the Gradio UI: a free-text title box and a 1-20 slider feed
# recommend_books(), whose text result is shown in a single output box.
_title_box = gr.Textbox(label="Enter a book title")
_count_slider = gr.Slider(minimum=1, maximum=20, step=1, label="Number of recommendations", value=10)
_result_box = gr.Textbox(label="Recommendations")

iface = gr.Interface(
    fn=recommend_books,
    inputs=[_title_box, _count_slider],
    outputs=_result_box,
    title="Book Recommender",
    description="Enter a book title to get recommendations based on user ratings and book similarities.",
)

# Start serving the interface (share=True asks Gradio for a public link).
iface.launch(share=True)