# Import necessary libraries
import PyPDF2
import chromadb
import gradio as gr
from langchain.text_splitter import CharacterTextSplitter
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Load the models once at import time so they are not reloaded on every request
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
qa_model = pipeline("text2text-generation", model="google/flan-t5-base")


# Step 1: Extract text from the uploaded PDF
def extract_text_from_pdf(pdf_file):
    reader = PyPDF2.PdfReader(pdf_file)
    text = ""
    for page in reader.pages:
        # extract_text() can return None for pages without a text layer
        text += page.extract_text() or ""
    return text


# Step 2: Chunk the text
def chunk_text(text, chunk_size=500, overlap=50):
    splitter = CharacterTextSplitter(
        separator=" ",
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        length_function=len,
    )
    return splitter.split_text(text)


# Step 3: Generate embeddings for each chunk
def generate_embeddings(chunks):
    return embedding_model.encode(chunks, show_progress_bar=False)


# Step 4: Store the chunks and embeddings in a Chroma collection
def create_retriever(chunks, embeddings):
    client = chromadb.Client()
    # Drop any collection left over from a previous upload so stale chunks
    # don't leak into the next query
    try:
        client.delete_collection("pdf_chunks")
    except Exception:
        pass
    collection = client.create_collection("pdf_chunks")
    collection.add(
        ids=[str(i) for i in range(len(chunks))],
        documents=chunks,
        embeddings=[embedding.tolist() for embedding in embeddings],
    )
    return collection


# Step 5: Answer questions using RAG
def answer_question(question, retriever):
    query_embedding = embedding_model.encode([question])[0].tolist()
    results = retriever.query(query_embeddings=[query_embedding], n_results=3)
    # Chroma returns a dict of lists keyed by field; the documents for the
    # first (and only) query live at results["documents"][0]
    retrieved_docs = results["documents"][0]
    # Combine the retrieved chunks into the context for the generator
    context = " ".join(retrieved_docs)
    # Ask the seq2seq model to answer from the retrieved context; truncation
    # keeps long contexts within the model's input limit
    prompt = f"Context: {context} Question: {question}"
    answer = qa_model(prompt, max_length=200, truncation=True)[0]["generated_text"]
    return answer


# Main function for the app: run the full pipeline for one PDF and question
def process_pdf_and_answer_question(pdf_file, question):
    text = extract_text_from_pdf(pdf_file)
    chunks = chunk_text(text)
    embeddings = generate_embeddings(chunks)
    retriever = create_retriever(chunks, embeddings)
    return answer_question(question, retriever)


# Gradio interface
with gr.Blocks() as app:
    gr.Markdown("# PDF Question Answering with RAG")
    with gr.Row():
        pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        question_input = gr.Textbox(
            label="Enter your question", placeholder="What do you want to know?"
        )
    answer_output = gr.Textbox(label="Answer")
    submit_button = gr.Button("Get Answer")
    submit_button.click(
        process_pdf_and_answer_question,
        inputs=[pdf_input, question_input],
        outputs=answer_output,
    )

# Run the app
if __name__ == "__main__":
    app.launch()
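# ---------------------------------------------------------------------------
# Dependency note (a sketch; the original script does not pin versions): the
# imports above assume roughly these packages are installed, e.g.
#
#   pip install PyPDF2 langchain sentence-transformers chromadb transformers gradio
#
# torch is pulled in transitively by sentence-transformers / transformers.
#
# Quick smoke test of the pipeline without the UI ("sample.pdf" is a
# hypothetical placeholder path, not a file shipped with this script):
#
#   print(process_pdf_and_answer_question("sample.pdf", "What is this PDF about?"))
# ---------------------------------------------------------------------------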