# Spaces: Sleeping (status banner captured from the hosting page; not part of the program)
import numpy as np
import pandas as pd
import streamlit as st
# CrossEncoder is provided by sentence_transformers; transformers does not export it.
from sentence_transformers import CrossEncoder, SentenceTransformer
# Load the dataset
def load_dataset(path: str = 'dolly_15k.csv') -> pd.DataFrame:
    """Read the Databricks Dolly 15K dataset from a CSV file.

    Args:
        path: Location of the CSV file. Defaults to ``'dolly_15k.csv'``,
            resolved relative to the current working directory.

    Returns:
        The file contents as a DataFrame, exactly as parsed by
        ``pandas.read_csv`` with its default options.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    return pd.read_csv(path)
# Load models
@st.cache_resource
def _load_models():
    """Instantiate the bi-encoder and the cross-encoder once per server process.

    Streamlit re-executes the script from the top on every widget
    interaction; ``st.cache_resource`` keeps the two model objects alive
    between those reruns instead of constructing them again each time.

    Returns:
        A ``(SentenceTransformer, CrossEncoder)`` tuple: the embedding model
        used for first-stage retrieval and the cross-encoder used for
        re-ranking.
    """
    bi_encoder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
    return bi_encoder, cross_encoder


embedding_model, ranking_model = _load_models()
# Streamlit UI
@st.cache_data
def _load_passage_index():
    """Load the dataset passages and embed them once, not once per question.

    The result is cached by Streamlit, so the CSV read and the encoding of
    every passage happen on the first question only; later reruns reuse the
    stored passages and embeddings.

    Returns:
        A ``(passages, passage_embeddings)`` pair. ``passages`` is the list of
        non-missing values of the ``'response'`` column as strings, and
        ``passage_embeddings`` is whatever ``embedding_model.encode`` returns
        for that list (``None`` when there are no passages).
    """
    dataset = load_dataset()
    # Rows without a response have nothing to retrieve; dropping them also
    # keeps NaN values away from the encoder and the re-ranker.
    passages = dataset['response'].dropna().astype(str).tolist()  # Adjust this according to your dataset's structure
    if not passages:
        return passages, None
    return passages, embedding_model.encode(passages)


st.title("Multi-Stage Text Retrieval Pipeline for QA")
# Strip so that a whitespace-only entry does not trigger a search.
question = st.text_input("Enter a question:").strip()

if question:
    passages, passage_embeddings = _load_passage_index()

    if not passages:
        # np.inner cannot be applied to an empty embedding set.
        st.warning("The dataset contains no passages to search.")
    else:
        question_embedding = embedding_model.encode(question)

        # Stage 1: retrieve top-k passages based on embedding similarity
        top_k = 5
        similarities = np.inner(question_embedding, passage_embeddings)
        # argsort is ascending: take the last k entries and reverse them so
        # the best match comes first.
        top_k_indices = np.argsort(similarities)[-top_k:][::-1]
        relevant_passages = [passages[i] for i in top_k_indices]

        st.subheader("Relevant passages:")
        for passage in relevant_passages:
            st.write(passage)

        # Stage 2: re-rank the retrieved passages with the cross-encoder
        ranked_scores = ranking_model.predict(
            [[question, passage] for passage in relevant_passages]
        )
        ranked_passages = sorted(
            zip(relevant_passages, ranked_scores),
            key=lambda pair: pair[1],
            reverse=True,
        )

        st.subheader("Ranked passages:")
        for passage, score in ranked_passages:
            st.write(f"{passage} (Score: {score:.2f})")