"""Gradio app that answers questions about legal PDF documents using Groq.

The selected document is loaded from the "akazmi/legal-documents" dataset,
split into fixed-size chunks, and the chunk sharing the most words with the
user's question is sent to a Groq chat model together with the question.
"""

import os
import re
from functools import lru_cache

import gradio as gr
from datasets import load_dataset
from groq import Groq
from PyPDF2 import PdfReader


@lru_cache(maxsize=8)
def _load_pdf_text(file_name):
    """Return the extracted text of one PDF from the dataset.

    Raises whatever the dataset lookup, file access, or PDF parsing raises.
    Successful results are cached so the same document is not re-loaded and
    re-parsed for every question; exceptions are never cached, so a failed
    load is retried on the next call.
    """
    # Load the dataset containing the PDF files
    dataset = load_dataset("akazmi/legal-documents")

    # NOTE(review): indexing a dataset split with a string key normally
    # selects a column rather than a row -- confirm that `file_name` is a
    # valid key for this dataset's schema.
    document = dataset["train"][file_name]
    file_path = document["file"]

    # Read the PDF file content
    with open(file_path, "rb") as file:
        reader = PdfReader(file)
        # A page without extractable text contributes an empty string
        # instead of breaking the concatenation.
        return "".join(page.extract_text() or "" for page in reader.pages)


def read_pdf_from_dataset(file_name):
    """Read a PDF from the dataset and return its text.

    Args:
        file_name: Key used to look the document up in the "train" split.

    Returns:
        The concatenated text of all pages, or a string starting with
        "Error reading PDF:" if anything goes wrong while loading or parsing.
    """
    try:
        return _load_pdf_text(file_name)
    except Exception as e:
        return f"Error reading PDF: {str(e)}"


def chunk_text(text, chunk_size=3000):
    """Split text into consecutive chunks of at most `chunk_size` characters.

    Used to keep the context sent to the Groq model within token limits.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of chunks in document order (empty if `text` is empty).

    Raises:
        ValueError: If `chunk_size` is not positive.
    """
    if chunk_size <= 0:
        # A negative step would silently produce no chunks at all.
        raise ValueError("chunk_size must be a positive integer")
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]


def similarity(query, text):
    """Return the number of distinct lowercase words shared by both strings.

    A simple word-overlap score; a more advanced measure (e.g. cosine
    similarity with embeddings) could be substituted.
    """
    query_words = set(query.lower().split())
    text_words = set(text.lower().split())
    return len(query_words & text_words)


def retrieve_relevant_document(user_question, document_text):
    """Return the chunk of `document_text` most relevant to the question.

    Relevance is measured with `similarity`. Returns an empty string when
    the document has no text, instead of failing on an empty chunk list.
    """
    text_chunks = chunk_text(document_text)
    return max(
        text_chunks,
        key=lambda chunk: similarity(user_question, chunk),
        default="",
    )


def initialize_groq():
    """Create a Groq client using the GROQ_API_KEY environment variable."""
    return Groq(api_key=os.getenv("GROQ_API_KEY"))


def answer_question(selected_document, user_question):
    """Answer a question about the selected document using retrieval + Groq.

    Args:
        selected_document: Document chosen in the UI, or None.
        user_question: The question typed by the user.

    Returns:
        The model's answer, or a human-readable message describing why no
        answer could be produced (nothing selected, blank question, unreadable
        or empty document, or a failure while calling the model).
    """
    # Check if document is selected
    if selected_document is None:
        return "Please select a document before asking a question."

    # A blank question would send the document chunk with nothing to answer.
    if not user_question or not user_question.strip():
        return "Please enter a question."

    # Read the content from the selected document. A read failure is reported
    # to the user directly rather than being passed on as document text.
    try:
        document_text = _load_pdf_text(selected_document)
    except Exception as e:
        return f"Error reading PDF: {str(e)}"

    # If document text is empty, return an error message
    if not document_text or not document_text.strip():
        return "Error: The document content is empty or could not be extracted."

    # Perform document retrieval: get the most relevant chunk
    relevant_chunk = retrieve_relevant_document(user_question, document_text)

    # Prepare the query for the model, including the relevant chunk of text
    query = f"{user_question} \n\n Relevant Document: {relevant_chunk}"

    try:
        # Client creation is inside the try so that a configuration problem
        # is reported as a message instead of crashing the handler.
        client = initialize_groq()

        # Generate the answer from the Groq model
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": query}],
            model="llama3-8b-8192",  # Use your chosen model
        )
        # Return the model's response
        return chat_completion.choices[0].message.content
    except Exception as e:
        return f"Error generating answer: {str(e)}"


def create_interface():
    """Build and return the Gradio Blocks UI for asking document questions."""
    with gr.Blocks() as demo:
        gr.Markdown("### Ask questions based on the selected document")

        # Dropdown to select the document
        document_dropdown = gr.Dropdown(
            label="Select Document",
            choices=["Income Tax Ordinance.pdf", "Companies Act 1984.pdf"],
            value="Income Tax Ordinance.pdf",
        )

        # Input for the user's question
        question_input = gr.Textbox(
            label="Enter your question",
            placeholder="Ask something related to the selected document...",
        )

        # Output area for the answer
        answer_output = gr.Textbox(label="Answer", interactive=False)

        # Button to submit the question and get the answer
        submit_button = gr.Button("Ask")
        submit_button.click(
            fn=answer_question,
            inputs=[document_dropdown, question_input],
            outputs=answer_output,
        )

    return demo


# Run the interface
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()