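"""Book recommender Gradio app.

Embeds Book-Crossing book titles with a small BERT model, indexes the
embeddings with FAISS, and returns the most similar titles (with author,
year, publisher, ISBN and mean user rating) for a user-supplied title.
"""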
import os
import re
from typing import List, Tuple

import faiss
import gradio as gr
import numpy as np
import pandas as pd
from faiss import write_index, read_index
from fuzzywuzzy import process
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
# Global variables to store loaded data
dataset = None
faiss_index = None
book_titles = None
ratings_by_isbn = None


def is_valid_isbn(isbn):
    # Accept hyphen-free ISBN-13 (978/979 prefix) or ISBN-10 (nine digits plus a digit or 'X')
    pattern = r'^(?:(?:978|979)\d{10}|\d{9}[0-9X])$'
    return bool(re.match(pattern, isbn))


def load_data(ratings_path, books_path) -> Tuple[pd.DataFrame, pd.DataFrame]:
    ratings = pd.read_csv(ratings_path, encoding='cp1251', sep=';', on_bad_lines='skip')
    # Keep only explicit ratings (the Book-Crossing dataset uses 0 for implicit feedback)
    ratings = ratings[ratings['Book-Rating'] != 0]
    books = pd.read_csv(books_path, encoding='cp1251', sep=';', on_bad_lines='skip')
    return ratings, books


def preprocess_data(ratings: pd.DataFrame, books: pd.DataFrame) -> pd.DataFrame:
    # Join ratings with book metadata and lower-case every string column
    dataset = pd.merge(ratings, books, on=['ISBN'])
    return dataset.apply(lambda x: x.str.lower() if x.dtype == 'object' else x)


def create_embedding(texts):
    # Encode each title with a tiny BERT model and flatten its token embeddings into one vector
    model_name = "mrm8488/bert-tiny-finetuned-sms-spam-detection"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    print("creating tokens")
    tokens = [tokenizer(text, padding="max_length", truncation=True, max_length=10, return_tensors='pt')
              for text in texts]
    print("\ncreating embedding\n")
    emb = []
    for t in tqdm(tokens):
        emb.append(model(**t)["last_hidden_state"].detach().numpy().squeeze().reshape(-1))
    emb = np.array(emb)
    # L2-normalize each vector so inner-product search is equivalent to cosine similarity
    normalized_data = emb / np.linalg.norm(emb, axis=1, keepdims=True)
    return normalized_data


def build_faiss_index(dataset: pd.DataFrame) -> faiss.IndexFlatIP:
    # Reuse a previously saved index if one exists on disk
    if os.path.exists("books.index"):
        return read_index("books.index")
    print("creating index")
    embeddings = create_embedding(dataset["Book-Title"].tolist())
    # Create a FAISS inner-product index over the normalized title embeddings
    dimension = embeddings.shape[-1]
    index = faiss.IndexFlatIP(dimension)
    # FAISS expects float32 vectors
    index.add(embeddings.astype('float32'))
    write_index(index, "books.index")
    return index


def compute_correlations_faiss(index: faiss.IndexFlatIP, book_titles: List[str],
                               target_book) -> pd.DataFrame:
    print(target_book, type(target_book))
    # target_book comes from fuzzywuzzy's extractOne, i.e. a (title, score, ...) tuple
    emb = create_embedding([target_book[0]])
    # Search against every indexed title
    k = len(book_titles)
    similarities, I = index.search(emb.astype('float32'), k)
    # (Optional) a PCA scatter plot of the indexed vectors vs. the query vector
    # could be added here for visualization/debugging.
    corr_df = pd.DataFrame({
        'book': [book_titles[i] for i in I[0]],
        'corr': similarities[0]
    })
    return corr_df.sort_values('corr', ascending=False)


def load_and_prepare_data():
    global dataset, faiss_index, book_titles, ratings_by_isbn
    # Load the Book-Crossing data files
    ratings_path = "BX-Book-Ratings.csv"
    books_path = "BX-Books.csv"
    ratings, books = load_data(ratings_path, books_path)
    dataset = preprocess_data(ratings, books)
    # Keep only rows with syntactically valid ISBNs
    ratings = ratings[ratings['ISBN'].apply(is_valid_isbn)]
    dataset = dataset[dataset['ISBN'].apply(is_valid_isbn)]
    # Mean rating per ISBN
    ratings_by_isbn = ratings.loc[ratings["Book-Rating"] > 0, ["ISBN", "Book-Rating"]]
    ratings_by_isbn = ratings_by_isbn.groupby('ISBN')["Book-Rating"].mean().reset_index()
    ratings_by_isbn = ratings_by_isbn.drop_duplicates(subset=['ISBN'])
    dataset = dataset.drop(columns=["User-ID", "Book-Rating"])
    dataset = dataset[dataset['ISBN'].isin(ratings_by_isbn['ISBN'])]
    dataset = dataset.drop_duplicates(subset=['ISBN'])
    # Attach the mean rating back onto the per-book table
    dataset = preprocess_data(dataset, ratings_by_isbn)
    # Build the FAISS index over the book titles
    faiss_index = build_faiss_index(dataset)
    book_titles = dataset["Book-Title"].tolist()


def recommend_books(target_book: str):
    num_recommendations: int = 15
    global dataset, faiss_index, book_titles, ratings_by_isbn
    # Lazily load the data and the FAISS index on the first request
    if dataset is None or faiss_index is None or book_titles is None:
        load_and_prepare_data()
    dataset['ISBN'] = dataset['ISBN'].str.strip()
    print("Before dropping duplicates:", len(dataset))
    dataset = dataset.drop_duplicates(subset=['ISBN'])
    print("After dropping duplicates:", len(dataset))
    target_book = target_book.lower()
    # Fuzzy match the input to the closest known book title
    closest_match = process.extractOne(target_book, book_titles)
    correlations = compute_correlations_faiss(faiss_index, book_titles, closest_match)
    # Drop the queried title itself from the recommendations
    recommendations = correlations[correlations['book'] != closest_match[0]]
    recommendations = recommendations.head(num_recommendations)
    # Build the result table, skipping any ISBN that has already been emitted
    seen_isbns = set()
    rows = []
    for _, row in recommendations.iterrows():
        book = dataset[dataset['Book-Title'] == row['book']].iloc[0]
        if book['ISBN'] in seen_isbns:
            continue
        seen_isbns.add(book['ISBN'])
        rating = ratings_by_isbn.loc[ratings_by_isbn['ISBN'] == book['ISBN'], 'Book-Rating'].values[0]
        rows.append({
            "Title": book['Book-Title'],
            "Author": book['Book-Author'],
            "Year": book['Year-Of-Publication'],
            "Publisher": book['Publisher'],
            "ISBN": book['ISBN'],
            "Rating": rating,
        })
    result_df = pd.DataFrame(rows)
    return result_df


# Create Gradio interface
iface = gr.Interface(
    fn=recommend_books,
    inputs=[
        gr.Textbox(label="Enter a book title"),
    ],
    outputs=[
        gr.Dataframe(
            headers=["Title", "Author", "Year", "Publisher", "ISBN", "Rating"],
            type="pandas",
        )
    ],
    title="Book Recommender",
    description="Enter a book title to get recommendations based on user ratings and book similarities."
)

# Launch the app
iface.launch()