"""Streamlit app that answers questions about an uploaded PDF.

Every non-blank line of the PDF is embedded with BERT (the [CLS] vector of
the last hidden state).  The line whose embedding has the highest cosine
similarity to the embedding of the user's question is shown as the response.
"""

import numpy as np
import pdfplumber
import streamlit as st
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertModel, BertTokenizer

model_name = "bert-base-uncased"


@st.cache_resource
def _load_bert(name: str):
    """Return ``(tokenizer, model)`` for the pre-trained checkpoint *name*.

    Streamlit re-executes the whole script on every widget interaction;
    caching the loaded objects avoids reloading the weights on each rerun.
    """
    return BertTokenizer.from_pretrained(name), BertModel.from_pretrained(name)


tokenizer, model = _load_bert(model_name)


def get_embeddings(text: str) -> np.ndarray:
    """Return the BERT [CLS] embedding of *text*.

    Args:
        text: Text to embed.  Input longer than 512 tokens is truncated.

    Returns:
        Array with one row: the first-token slice of the last hidden state.

    Raises:
        ValueError: If *text* is empty or whitespace-only, or if the model
            output has no ``last_hidden_state`` attribute.
    """
    if not text.strip():
        raise ValueError("Input text is empty.")

    # Calling the tokenizer directly is the documented replacement for
    # ``encode_plus`` and accepts the same arguments.
    inputs = tokenizer(
        text,
        add_special_tokens=True,
        max_length=512,
        truncation=True,  # keep the input within max_length
        return_attention_mask=True,
        return_tensors="pt",
    )

    with torch.no_grad():  # inference only; no gradients needed
        outputs = model(**inputs)

    if not hasattr(outputs, "last_hidden_state"):
        raise ValueError("Model output does not contain 'last_hidden_state'.")

    # Position 0 of the sequence is the [CLS] token added by the tokenizer.
    return outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()


def extract_text_from_pdf(pdf_file) -> str:
    """Return the text of every page of *pdf_file*, one page per chunk.

    Each page's text is followed by a newline.  A Streamlit warning is shown
    for every page from which no text could be extracted.

    Args:
        pdf_file: Anything accepted by ``pdfplumber.open``.
    """
    chunks = []
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                chunks.append(page_text + "\n")
            else:
                st.warning("No extractable text found on a page.")
    return "".join(chunks)


def split_text_into_sentences(text: str) -> list:
    """Split *text* into its non-blank lines, stripped of outer whitespace.

    Blank lines are dropped because ``get_embeddings`` rejects empty input;
    the extracted PDF text always ends with a newline, so a plain split
    would always produce at least one empty entry.
    """
    return [line.strip() for line in text.split("\n") if line.strip()]


@st.cache_data(max_entries=8)
def _embed_sentences(sentences: list) -> np.ndarray:
    """Return a 2-D array with one embedding row per entry of *sentences*.

    Cached on the sentence list so that the script reruns triggered by
    typing a question or pressing the button do not re-embed the same PDF.
    """
    # vstack turns the per-sentence single-row arrays into one 2-D matrix,
    # the layout cosine_similarity requires.
    return np.vstack([get_embeddings(sentence) for sentence in sentences])


# ---------------------------------------------------------------- Streamlit UI
st.title("PDF Chatbot using BERT")

pdf_file = st.file_uploader("Upload a PDF file", type=["pdf"])

# Defaults for the "nothing loaded" state, so the button handler below can
# always inspect these names.
pdf_text = ""
pdf_sentences = []
pdf_embeddings = None

if pdf_file:
    pdf_text = extract_text_from_pdf(pdf_file)

    if not pdf_text.strip():
        st.error(
            "The extracted PDF text is empty. "
            "Please upload a PDF with extractable text."
        )
    else:
        try:
            sentences = split_text_into_sentences(pdf_text)
            embeddings = _embed_sentences(sentences)
        except Exception as e:  # UI boundary: report instead of crashing
            st.error(f"Error while processing PDF: {e}")
        else:
            # Publish both together so they can never get out of sync.
            pdf_sentences, pdf_embeddings = sentences, embeddings
            st.success("PDF loaded successfully!")

user_input = st.text_input("Ask a question about the PDF:")

if st.button("Get Response"):
    if pdf_embeddings is None:
        st.warning("Please upload a PDF file first.")
    elif not user_input.strip():
        st.warning("Please enter a question.")
    else:
        try:
            user_embeddings = get_embeddings(user_input).reshape(1, -1)

            # One row of similarities: question vs. every PDF line.
            similarities = cosine_similarity(user_embeddings, pdf_embeddings)
            best_match_index = int(np.argmax(similarities[0]))

            st.write("### Response:")
            st.write(pdf_sentences[best_match_index])
        except Exception as e:  # UI boundary: report instead of crashing
            st.error(f"Error while processing user input: {e}")