import gradio as gr
import os
from groq import Groq
from PyPDF2 import PdfReader
import re
from datasets import load_dataset

# Read a PDF referenced by the legal-documents dataset and return its text
def read_pdf_from_dataset(file_name):
    """Extract the text of one document from the ``akazmi/legal-documents`` dataset.

    Args:
        file_name: Key used to index the dataset's ``train`` split. The
            selected entry must expose a ``"file"`` field holding the path
            of a PDF that can be opened from disk.

    Returns:
        The concatenated text of every page. If anything goes wrong
        (dataset loading, lookup, file access, PDF parsing) a string
        starting with ``"Error reading PDF: "`` is returned instead; this
        function does not raise.
    """
    try:
        # Load the dataset containing the PDF files
        dataset = load_dataset("akazmi/legal-documents")

        # Get the entry for the selected document and the path of its PDF
        document = dataset["train"][file_name]
        file_path = document["file"]

        # Extraction must finish while the file handle is still open.
        with open(file_path, "rb") as file:
            reader = PdfReader(file)
            # `or ""` is a defensive guard: a page that yields no text
            # (None or empty) is skipped instead of failing the whole
            # document. join() avoids repeated string re-allocation.
            return "".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        # Callers expect a string in every case, so failures are reported
        # in-band rather than propagated.
        return f"Error reading PDF: {str(e)}"

# Split large text into fixed-size pieces so a single piece stays within the Groq model's token limits
def chunk_text(text, chunk_size=3000):
    """Slice *text* into consecutive pieces of at most *chunk_size* characters.

    Pieces are taken in order without overlap; the last one may be shorter.
    Empty input produces an empty list.
    """
    starts = range(0, len(text), chunk_size)
    return [text[start:start + chunk_size] for start in starts]

# Function to perform document retrieval (find the most relevant chunk)
def retrieve_relevant_document(user_question, document_text):
    """Return the chunk of *document_text* that best matches *user_question*.

    The document is split with ``chunk_text`` and every chunk is scored
    against the question with ``similarity``. The highest-scoring chunk is
    returned; when several chunks tie, the earliest one wins.

    Args:
        user_question: The question to match against.
        document_text: Full text of the document to search.

    Returns:
        The best-matching chunk, or ``""`` when the document produces no
        chunks (for example, when ``document_text`` is empty).
    """
    text_chunks = chunk_text(document_text)

    # default="" keeps max() from raising ValueError on an empty chunk list.
    return max(
        text_chunks,
        key=lambda chunk: similarity(user_question, chunk),
        default="",
    )

# A simple similarity function (you can use a more advanced one, e.g., cosine similarity with embeddings)
def similarity(query, text):
    """Count the distinct words that *query* and *text* have in common.

    Both strings are lower-cased and tokenized into runs of word
    characters, so surrounding punctuation does not prevent a match
    (``"tax?"`` and ``"tax."`` both count as ``"tax"``).

    Args:
        query: The user's question.
        text: The candidate passage to score.

    Returns:
        The number of distinct shared words; 0 when there is no overlap.
    """
    # Splitting on whitespace alone would leave punctuation glued to words,
    # which makes the last word of almost every question unmatchable.
    query_words = set(re.findall(r"\w+", query.lower()))
    text_words = set(re.findall(r"\w+", text.lower()))
    return len(query_words & text_words)

# Build the Groq client
def initialize_groq():
    """Create a Groq client using the key stored in ``GROQ_API_KEY``.

    The environment variable is read at call time; if it is unset, ``None``
    is passed to the client as the key.
    """
    api_key = os.getenv("GROQ_API_KEY")
    return Groq(api_key=api_key)

# Function to handle document selection and answer generation using RAG
def answer_question(selected_document, user_question):
    """Answer *user_question* from the most relevant part of the selected document.

    Steps: validate the inputs, read the document text, pick the chunk that
    best matches the question, and send question plus chunk to the Groq
    chat model.

    Args:
        selected_document: Name of the document chosen in the UI.
        user_question: The question typed by the user.

    Returns:
        The model's answer, or a human-readable message describing what went
        wrong. Every failure is reported as a string so the UI always has
        something to display.
    """
    # Validate inputs before doing any expensive work.
    if not selected_document:
        return "Please select a document before asking a question."

    if not user_question or not user_question.strip():
        return "Please enter a question."

    # Read the content from the selected document
    document_text = read_pdf_from_dataset(selected_document)

    # If document text is empty, return an error message
    if not document_text or not document_text.strip():
        return "Error: The document content is empty or could not be extracted."

    # read_pdf_from_dataset reports failures as an "Error reading PDF: ..."
    # string instead of raising. Surface that message to the user rather than
    # sending it to the model as if it were the document.
    if document_text.startswith("Error reading PDF:"):
        return document_text

    # Perform document retrieval: get the most relevant chunk
    relevant_chunk = retrieve_relevant_document(user_question, document_text)

    # Prepare the query for the model, including the relevant chunk of text
    query = f"{user_question} \n\n Relevant Document: {relevant_chunk}"

    try:
        # Client creation is inside the try so that a configuration problem
        # (such as a missing API key) is reported like any other failure.
        client = initialize_groq()

        # Generate the answer from the Groq model
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": query}],
            model="llama3-8b-8192",  # Use your chosen model
        )
        # Return the model's response
        return chat_completion.choices[0].message.content
    except Exception as e:
        return f"Error generating answer: {str(e)}"

# Create Gradio Interface
def create_interface():
    """Assemble the Gradio Blocks UI and return it without launching.

    The layout is, top to bottom: a heading, a dropdown for choosing the
    document, a textbox for the question, a read-only textbox for the
    answer, and an "Ask" button that calls ``answer_question`` with the
    dropdown and question values and writes the result to the answer box.
    """
    document_choices = ["Income Tax Ordinance.pdf", "Companies Act 1984.pdf"]

    with gr.Blocks() as demo:
        gr.Markdown("### Ask questions based on the selected document")

        # Document picker; the first entry is preselected.
        selector = gr.Dropdown(
            label="Select Document",
            choices=document_choices,
            value=document_choices[0],
        )

        # Free-text question from the user.
        question_box = gr.Textbox(
            label="Enter your question",
            placeholder="Ask something related to the selected document...",
        )

        # Read-only area where the answer is shown.
        answer_box = gr.Textbox(label="Answer", interactive=False)

        # Clicking the button runs the question-answering pipeline.
        ask_button = gr.Button("Ask")
        ask_button.click(
            fn=answer_question,
            inputs=[selector, question_box],
            outputs=answer_box,
        )

    return demo

# Run the interface: build the UI and launch it, but only when this file is
# executed as a script (nothing is launched when the module is imported).
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()