import os

import numpy as np
import pdfplumber
import streamlit as st
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline

# Set up HuggingFace API token (only needed for gated/private models;
# google/flan-t5-xl is public, so this may be absent)
HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")

# Model and generation settings
model_name = "google/flan-t5-xl"
# temperature only takes effect when sampling is enabled
model_kwargs = {"temperature": 0.2, "max_length": 100, "do_sample": True}

# Chunking settings: overlapping chunks so sentences are not cut off
# at chunk boundaries
CHUNK_SIZE = 10_000
CHUNK_OVERLAP = 1_000
TOP_K = 3  # number of chunks retrieved per question


# Set up the generation model (cached so Streamlit reruns don't reload it).
# flan-t5-xl has ~3B parameters; a smaller checkpoint such as
# google/flan-t5-base can be substituted on limited hardware.
@st.cache_resource
def load_generator():
    return pipeline("text2text-generation", model=model_name,
                    token=HUGGINGFACEHUB_API_TOKEN)


# Set up the sentence-embedding model used for the vector database
@st.cache_resource
def load_embedder():
    return SentenceTransformer("all-MiniLM-L6-v2")


generator = load_generator()
embedder = load_embedder()


# Function to extract text from PDF documents
def extract_text_from_pdfs(pdfs):
    texts = []
    for pdf in pdfs:
        with pdfplumber.open(pdf) as pdf_file:
            for page in pdf_file.pages:
                # extract_text() returns None for pages without text
                texts.append(page.extract_text() or "")
    return "\n".join(texts)


# Function to split text into overlapping chunks
def split_text_into_chunks(text):
    step = CHUNK_SIZE - CHUNK_OVERLAP
    return [text[i:i + CHUNK_SIZE] for i in range(0, len(text), step)]


# Function to create and save the vector database
def create_vector_db(chunks):
    # encode() on a list returns a (num_chunks, dim) array in one call
    vectors = embedder.encode(chunks, convert_to_numpy=True)
    np.save("vector_db.npy", vectors)
    return vectors


# Function to retrieve the chunks most relevant to the question,
# ranked by cosine similarity between question and chunk embeddings
def retrieve_relevant_chunks(chunks, vectors, question, top_k=TOP_K):
    question_vector = embedder.encode(question, convert_to_numpy=True)
    scores = util.cos_sim(question_vector, vectors)[0]
    top_indices = scores.argsort(descending=True)[:top_k]
    return [chunks[int(i)] for i in top_indices]


# Function to create conversational chain
def create_conversational_chain(chunks, question):
    prompt_template = """Answer the question concisely, focusing on the most \
relevant and important details from the PDF context. Refrain from mentioning \
any mathematical equations, even if they are present in the provided context. \
Focus on the textual information available. Please provide direct quotations \
or references from the PDF to back up your response. If the answer is not \
found within the PDF, please state "answer is not available in the context."

Context:
{context}

Question:
{question}

Example response format:
Overview: (brief summary or introduction)
Key points:
(point 1: paragraph for key details)
(point 2: paragraph for key details)
...
Use a mix of paragraphs and points to effectively convey the information."""
    responses = []
    for chunk in chunks:
        prompt = prompt_template.format(context=chunk, question=question)
        # truncation=True keeps prompts within the model's input limit
        # (T5 accepts 512 tokens, far less than a 10,000-character chunk)
        result = generator(prompt, truncation=True, **model_kwargs)
        responses.append(result[0]["generated_text"])
    return responses


# Streamlit UI creation
st.title("PDF Chatbot")
st.write("Upload multiple PDF files and ask a question to get a response "
         "based on the content of the PDFs.")
pdfs = st.file_uploader("Select PDF files", type=["pdf"],
                        accept_multiple_files=True)
question = st.text_input("Enter your question")

if st.button("Get Response"):
    if not pdfs or not question:
        st.warning("Please upload at least one PDF and enter a question.")
    else:
        # Extract text from PDFs and split it into chunks
        text = extract_text_from_pdfs(pdfs)
        chunks = split_text_into_chunks(text)
        if not chunks:
            st.warning("No extractable text found in the uploaded PDFs.")
        else:
            # Create and save vector database, then use it to retrieve
            # the chunks most relevant to the question
            vectors = create_vector_db(chunks)
            relevant_chunks = retrieve_relevant_chunks(chunks, vectors,
                                                       question)
            # Create conversational chain over the retrieved chunks
            responses = create_conversational_chain(relevant_chunks, question)
            # Display responses
            for response in responses:
                st.write(response)
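
# Usage (a minimal sketch, assuming this file is saved as app.py and, for
# gated models, HUGGINGFACEHUB_API_TOKEN is exported in the environment):
#   pip install streamlit pdfplumber numpy sentence-transformers transformers
#   streamlit run app.py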