import streamlit as st
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
import pdfplumber
from sklearn.metrics.pairwise import cosine_similarity

# Load the pre-trained BERT model and tokenizer once per server process.
# Streamlit re-executes this script from top to bottom on every widget
# interaction, so the load is wrapped in st.cache_resource; without it the
# model would be re-instantiated on every rerun.
model_name = "bert-base-uncased"


@st.cache_resource
def _load_bert(name):
    """Return the ``(tokenizer, model)`` pair for the pretrained checkpoint ``name``."""
    bert_tokenizer = BertTokenizer.from_pretrained(name)
    bert_model = BertModel.from_pretrained(name)
    bert_model.eval()  # the model is only used for inference in this app
    return bert_tokenizer, bert_model


tokenizer, model = _load_bert(model_name)

# Function to get BERT embeddings
def get_embeddings(text):
    """Embed ``text`` with BERT and return the position-0 hidden state.

    The text is tokenized with the module-level ``tokenizer`` (special tokens
    added, truncated to at most 512 tokens) and passed through the
    module-level ``model`` with gradient tracking disabled.

    Returns:
        A NumPy array holding ``last_hidden_state[:, 0, :]``, i.e. the first
        token's vector with the leading batch axis kept.

    Raises:
        ValueError: if ``text`` is empty or whitespace-only, or if the model
            output has no ``last_hidden_state`` attribute.
    """
    # Guard: nothing to embed.
    if not text.strip():
        raise ValueError("Input text is empty.")

    # Tokenize; truncation keeps the sequence within the 512-token limit.
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    # Inference only, so no gradients are needed.
    with torch.no_grad():
        model_output = model(**encoded)

    if not hasattr(model_output, 'last_hidden_state'):
        raise ValueError("Model output does not contain 'last_hidden_state'.")

    # Take the vector at sequence position 0 and hand it back as NumPy on CPU.
    first_token_state = model_output.last_hidden_state[:, 0, :]
    return first_token_state.detach().cpu().numpy()

# Extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of ``pdf_file``, one page per chunk.

    The PDF is opened with ``pdfplumber``. Each page that yields text
    contributes that text followed by a newline; each page that yields
    nothing triggers a Streamlit warning and contributes nothing.

    Returns:
        The concatenated page texts (an empty string if no page had text).
    """
    chunks = []
    with pdfplumber.open(pdf_file) as document:
        for pdf_page in document.pages:
            extracted = pdf_page.extract_text()
            if not extracted:
                # Page has no extractable text; tell the user.
                st.warning("No extractable text found on a page.")
                continue
            # Trailing newline keeps consecutive pages separated.
            chunks.append(extracted + "\n")
    return "".join(chunks)

# Split text into line-level "sentences" for matching
def split_text_into_sentences(text):
    """Split ``text`` into its non-blank lines.

    Despite the name, the split is line-based, not a linguistic sentence
    segmentation. Each line is stripped of surrounding whitespace, and lines
    that are empty after stripping are dropped. Dropping them matters because
    the embedding step rejects empty input, and text that ends in a newline
    would otherwise always produce a trailing empty element.

    Args:
        text: The text to split.

    Returns:
        A list of non-empty, stripped lines in their original order; an empty
        list if ``text`` has no non-blank line.
    """
    stripped_lines = (line.strip() for line in text.splitlines())
    return [line for line in stripped_lines if line]

# Streamlit app
st.title("PDF Chatbot using BERT")

# PDF file upload
pdf_file = st.file_uploader("Upload a PDF file", type=["pdf"])

# Text, sentences and embeddings of the current PDF. All three get defaults
# here so the response handler below can test them safely even when nothing
# was uploaded or processing failed.
pdf_text = ""
pdf_sentences = []
pdf_embeddings = None

if pdf_file:
    # Streamlit reruns the whole script on every interaction. Reuse the
    # result for an unchanged upload instead of re-embedding the PDF on every
    # button click.
    upload_key = (pdf_file.name, pdf_file.size)
    pdf_index = st.session_state.get("pdf_index")
    if pdf_index is not None and pdf_index["key"] != upload_key:
        pdf_index = None  # a different file was uploaded; rebuild

    if pdf_index is None:
        pdf_text = extract_text_from_pdf(pdf_file)

        # Check if the extracted text is empty
        if not pdf_text.strip():
            st.error("The extracted PDF text is empty. Please upload a PDF with extractable text.")
        else:
            try:
                # Drop blank lines: get_embeddings rejects empty input.
                sentences = [
                    sentence
                    for sentence in split_text_into_sentences(pdf_text)
                    if sentence.strip()
                ]
                # get_embeddings returns one row per call; stack the rows into
                # a 2-D matrix, which is what cosine_similarity expects.
                embeddings = np.vstack([get_embeddings(sentence) for sentence in sentences])
            except Exception as e:
                st.error(f"Error while processing PDF: {e}")
            else:
                pdf_index = {
                    "key": upload_key,
                    "text": pdf_text,
                    "sentences": sentences,
                    "embeddings": embeddings,
                }
                st.session_state["pdf_index"] = pdf_index

    if pdf_index is not None:
        pdf_text = pdf_index["text"]
        pdf_sentences = pdf_index["sentences"]
        pdf_embeddings = pdf_index["embeddings"]
        st.success("PDF loaded successfully!")

# User input for chatbot
user_input = st.text_input("Ask a question about the PDF:")

if st.button("Get Response"):
    if not pdf_sentences or pdf_embeddings is None:
        st.warning("Please upload a PDF file first.")
    elif not user_input.strip():
        st.warning("Please enter a question.")
    else:
        try:
            # Single query row for the similarity computation.
            user_embeddings = get_embeddings(user_input).reshape(1, -1)

            # Cosine similarity between the question and every PDF sentence.
            similarities = cosine_similarity(user_embeddings, pdf_embeddings)
            best_match_index = int(np.argmax(similarities))

            # Display the most relevant sentence
            st.write("### Response:")
            st.write(pdf_sentences[best_match_index])
        except Exception as e:
            st.error(f"Error while processing user input: {e}")