import os
import re
from typing import List, Tuple

import faiss
import gradio as gr
import numpy as np
import pandas as pd
import torch
from faiss import write_index, read_index
from fuzzywuzzy import process
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

# Global variables to store loaded data
dataset = None
faiss_index = None
book_titles = None
ratings_by_isbn = None

def is_valid_isbn(isbn):
    # Format-only check (no check-digit arithmetic): matches a 13-digit ISBN
    # starting with 978/979, or a 10-digit ISBN whose check digit may be 'X'.
    pattern = r'^(?:(?:978|979)\d{10}|\d{9}[0-9X])$'
    return bool(re.match(pattern, isbn))
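
# A quick sanity check of the validator (hypothetical values; the regex accepts
# well-formed strings but does not verify the actual check digit):
#   is_valid_isbn("9780306406157")  # 13-digit ISBN starting with 978 -> True
#   is_valid_isbn("043942089X")     # 10-digit ISBN with 'X' check digit -> True
#   is_valid_isbn("12345")          # wrong length -> False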

def load_data(ratings_path, books_path) -> Tuple[pd.DataFrame, pd.DataFrame]:
    ratings = pd.read_csv(ratings_path, encoding='cp1251', sep=';', on_bad_lines='skip')
    # Rating 0 marks an implicit interaction in Book-Crossing; keep explicit ratings only.
    ratings = ratings[ratings['Book-Rating'] != 0]
    books = pd.read_csv(books_path, encoding='cp1251', sep=';', on_bad_lines='skip')
    return ratings, books
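
# Note: the Book-Crossing CSV dumps are semicolon-separated and use a legacy
# one-byte encoding, hence sep=';' and the explicit encoding above;
# on_bad_lines='skip' drops the few malformed rows instead of aborting the load.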

def preprocess_data(ratings: pd.DataFrame, books: pd.DataFrame) -> pd.DataFrame:
    dataset = pd.merge(ratings, books, on=['ISBN'])
    # Lowercase text columns for matching; leave ISBN as-is ('X' check digit).
    cols = dataset.columns.difference(['ISBN'])
    dataset[cols] = dataset[cols].apply(lambda x: x.str.lower() if x.dtype == 'object' else x)
    return dataset

def create_embedding(dataset):
    model_name = "mrm8488/bert-tiny-finetuned-sms-spam-detection"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model.eval()
    print("creating tokens")
    tokens = [tokenizer(i, padding="max_length", truncation=True, max_length=10, return_tensors='pt')
              for i in dataset]
    print("\ncreating embedding\n")
    emb = []
    with torch.no_grad():
        for i in tqdm(tokens):
            # Flatten the (seq_len, hidden_size) activations into one vector per title.
            emb.append(model(**i)["last_hidden_state"].numpy().squeeze().reshape(-1))
    emb = np.asarray(emb, dtype='float32')
    # L2-normalize each row (not the whole matrix at once) so that
    # inner-product search over these vectors is cosine similarity.
    emb /= np.linalg.norm(emb, axis=1, keepdims=True)
    return emb
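
# With rows L2-normalized as above, the inner product equals cosine similarity,
# which is exactly what IndexFlatIP scores. A minimal sketch:
#   a = np.array([3.0, 4.0]); a /= np.linalg.norm(a)
#   b = np.array([4.0, 3.0]); b /= np.linalg.norm(b)
#   float(a @ b)  # == cosine similarity, 24/25 = 0.96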

def build_faiss_index(dataset: pd.DataFrame) -> faiss.IndexFlatIP:
    # Reuse a cached index when one exists on disk.
    if os.path.exists("books.index"):
        return read_index("books.index")
    embeddings = create_embedding(dataset["Book-Title"])
    print("creating index")
    # Create a FAISS inner-product index
    dimension = embeddings.shape[-1]
    index = faiss.IndexFlatIP(dimension)
    # Add vectors to the index (FAISS expects float32, not float16)
    index.add(embeddings.astype('float32'))
    write_index(index, "books.index")
    return index
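
# Sketch of querying the index directly (assuming it was built above):
#   query = create_embedding(["the hobbit"])                 # shape (1, dimension)
#   scores, ids = index.search(query.astype('float32'), 5)   # top-5 matches
# The returned ids are positions in the order the titles were added, which is
# why `book_titles` below must preserve `dataset` row order.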

def compute_correlations_faiss(index: faiss.IndexFlatIP, book_titles: List[str],
                               target_book: str) -> pd.DataFrame:
    emb = create_embedding([target_book])
    # Exhaustive search: rank every indexed title against the query.
    k = len(book_titles)
    similarities, indices = index.search(emb.astype('float32'), k)
    corr_df = pd.DataFrame({
        'book': [book_titles[i] for i in indices[0]],
        'corr': similarities[0]
    })
    # FAISS already returns hits in decreasing-score order; the sort keeps
    # that invariant explicit.
    return corr_df.sort_values('corr', ascending=False)
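
# Example (hypothetical): when the queried title is itself in the index, it
# surfaces at the top with corr ~= 1.0, since a unit vector attains the maximal
# inner product with itself:
#   compute_correlations_faiss(faiss_index, book_titles, "the hobbit").head(3)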

def load_and_prepare_data():
    global dataset, faiss_index, book_titles, ratings_by_isbn
    # Local copies of the Book-Crossing CSV dumps
    ratings = "BX-Book-Ratings.csv"
    books = "BX-Books.csv"
    ratings, books = load_data(ratings, books)
    dataset = preprocess_data(ratings, books)
    # Keep only plausibly well-formed ISBNs
    ratings = ratings[ratings['ISBN'].apply(is_valid_isbn)]
    dataset = dataset[dataset['ISBN'].apply(is_valid_isbn)]
    # Mean explicit rating per ISBN (groupby already yields unique ISBNs)
    ratings_by_isbn = ratings.drop(columns="User-ID")
    ratings_by_isbn = ratings_by_isbn[ratings_by_isbn["Book-Rating"] > 0]
    ratings_by_isbn = ratings_by_isbn.groupby('ISBN')["Book-Rating"].mean().reset_index()
    # One row per book, merged back with its mean rating
    dataset = dataset.drop(columns=["User-ID", "Book-Rating"])
    dataset = dataset[dataset['ISBN'].isin(ratings_by_isbn['ISBN'])]
    dataset = dataset.drop_duplicates(subset=['ISBN'])
    dataset = preprocess_data(dataset, ratings_by_isbn)
    # Build the FAISS index; keep titles as a list so positional FAISS ids
    # line up with dataset row order.
    faiss_index = build_faiss_index(dataset)
    book_titles = dataset["Book-Title"].tolist()

def recommend_books(target_book: str):
    num_recommendations: int = 15
    global dataset, faiss_index, book_titles, ratings_by_isbn
    if dataset is None or faiss_index is None or book_titles is None:
        load_and_prepare_data()
    target_book = target_book.lower()
    # Fuzzy match the input to the closest indexed book title
    closest_match = process.extractOne(target_book, book_titles)
    matched_title = closest_match[0]
    print("closest match:", matched_title)
    correlations = compute_correlations_faiss(faiss_index, book_titles, matched_title)
    # Drop the queried book itself, then keep the top hits
    recommendations = correlations[correlations['book'] != matched_title]
    recommendations = recommendations.head(num_recommendations)
    # Emit one row per distinct ISBN, joining in the mean rating
    seen_isbns = set()
    rows = []
    for _, row in recommendations.iterrows():
        book = dataset.loc[dataset['Book-Title'] == row['book']].iloc[0]
        if book['ISBN'] in seen_isbns:
            continue
        seen_isbns.add(book['ISBN'])
        rating = ratings_by_isbn.loc[ratings_by_isbn['ISBN'] == book['ISBN'], 'Book-Rating']
        rows.append({
            "Title": book['Book-Title'],
            "Author": book['Book-Author'],
            "Year": book['Year-Of-Publication'],
            "Publisher": book['Publisher'],
            "ISBN": book['ISBN'],
            "Rating": rating.values[0] if len(rating) else None,
        })
    return pd.DataFrame(rows)
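
# Usage sketch: recommend_books("The Hobbit") fuzzy-matches the title, then
# returns a DataFrame with Title/Author/Year/Publisher/ISBN/Rating columns for
# up to 15 similar books, with duplicate ISBNs filtered out.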
# Create Gradio interface
iface = gr.Interface(
fn=recommend_books,
inputs=[
gr.Textbox(label="Enter a book title"),
],
outputs=[
gr.Dataframe(
headers=["Title", "Author", "Year", "Publisher", "ISBN", "Rating"],
type="pandas",
)
],
title="Book Recommender",
description="Enter a book title to get recommendations based on user ratings and book similarities."
)
# Launch the app
iface.launch()
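
# When run locally, launch() serves the UI on http://127.0.0.1:7860 by default;
# a Hugging Face Space executes this module and exposes the same interface.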