"""Streamlit app: semantic movie search over precomputed sentence embeddings.

The movie table is read from a CSV file, the matching document embeddings
from a pickle file, and the embeddings are indexed with FAISS. A user query
is encoded with the same SentenceTransformer model and the nearest rows are
displayed.
"""

import os
import pickle

import faiss
import numpy as np
import pandas as pd
import streamlit as st
from sentence_transformers import SentenceTransformer

# Columns shown for every matching movie, in display order.
_DISPLAY_FIELDS = ("Title", "Year", "Director", "Cast", "Summary")


# Streamlit re-executes this whole script on every widget interaction, so the
# expensive loads below are cached instead of being repeated on each click.
@st.cache_data
def _load_movies(path):
    """Read the movie CSV at *path* into a DataFrame."""
    return pd.read_csv(path)


@st.cache_resource
def _load_index(path):
    """Load pickled embeddings from *path* and build a FAISS L2 index.

    Returns:
        A ``(embedding_matrix, index)`` tuple: the float32 matrix and the
        ``faiss.IndexFlatL2`` populated with its rows.

    Raises:
        ValueError: if the pickled object is not a non-empty 2-D array.
    """
    # SECURITY: pickle.load executes arbitrary code from the file. Only load
    # embeddings files that were produced by a trusted process.
    with open(path, "rb") as f:
        doc_embeddings = pickle.load(f)

    # FAISS requires a C-contiguous float32 matrix.
    matrix = np.ascontiguousarray(np.asarray(doc_embeddings, dtype="float32"))
    if matrix.ndim != 2 or matrix.shape[0] == 0:
        raise ValueError(
            f"Expected a non-empty 2-D embedding matrix, got shape {matrix.shape}"
        )

    flat_index = faiss.IndexFlatL2(matrix.shape[1])
    flat_index.add(matrix)
    return matrix, flat_index


@st.cache_resource
def _load_model(name):
    """Load the SentenceTransformer model called *name*."""
    return SentenceTransformer(name)


# Load CSV file
csv_file = "Hydra-Movie-Scrape.csv"
df = _load_movies(csv_file)

# Load the precomputed embeddings and build the FAISS similarity index
embedding_matrix, index = _load_index("embeddings.pkl")

# Load the SentenceTransformer model for encoding the query
model = _load_model('all-MiniLM-L6-v2')


def retrieve(query, top_k=10):
    """Return the row indices of the *top_k* embeddings nearest to *query*.

    Args:
        query: Text to encode with the module-level ``model``.
        top_k: Number of neighbours to request from the FAISS index.

    Returns:
        The first row of the index array returned by ``index.search``.
        Entries may be ``-1`` (the caller filters these out), which FAISS
        uses when fewer than *top_k* neighbours are available.
    """
    # index.search expects a 2-D float32 batch; this is a batch of one.
    query_vector = np.asarray(model.encode(query), dtype="float32").reshape(1, -1)
    _distances, indices = index.search(query_vector, top_k)
    return indices[0]


def _format_movie(movie_details):
    """Render one movie row as a Markdown paragraph, one field per line."""
    # Two trailing spaces force a Markdown hard line break; a bare "\n" is a
    # soft break and would run all fields together in a single paragraph.
    return "  \n".join(
        f"*{field}*: {movie_details[field]}" for field in _DISPLAY_FIELDS
    )


# Streamlit app layout
st.title("Movie Dataset RAG Application")

query = st.text_input("Ask a question about movies:")

if st.button("Submit"):
    cleaned_query = query.strip()
    if cleaned_query:
        indices = retrieve(cleaned_query)
        # Skip FAISS "no result" markers (-1) and any index beyond the CSV,
        # which would mean the embeddings file and the CSV are out of sync.
        entries = [
            _format_movie(df.iloc[int(idx)])
            for idx in indices
            if 0 <= idx < len(df)
        ]
        if entries:
            st.write("Here are some movies that match your query:")
            # Blank line between movies starts a new Markdown paragraph.
            st.markdown("\n\n".join(entries))
        else:
            st.write("No relevant documents found.")
    else:
        st.write("Please enter a query.")