"""Streamlit app: upload a PDF, OCR each page, and answer questions about it via Ollama.

Pipeline: PDF -> per-page rasterization (PyMuPDF) -> OCR (Tesseract) ->
TF-IDF page ranking against the user's question -> LLM answer (Ollama chat).
"""

import io

import fitz  # PyMuPDF
import pytesseract
import requests
import streamlit as st
from ollama import Client
from PIL import Image
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Chat client for the local Ollama server.
client = Client(host='http://localhost:11434')

# Raw generate-endpoint details used by `query`.
ollama_url = "http://localhost:11434/api/generate"
ollama_headers = {"Content-Type": "application/json"}

# Tesseract binary location (Windows install default). Set once at import time
# instead of on every OCR call as the original did.
# NOTE(review): hard-coded Windows path — confirm for the deployment platform.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'


def find_most_relevant_context(contexts, question, max_features=10000):
    """Return the page text from `contexts` most similar to `question`.

    Ranks pages by cosine similarity over a TF-IDF vectorization of the
    question plus all page texts.

    Args:
        contexts: non-empty list of page-text strings.
        question: the user's question (must contain at least one term,
            otherwise TfidfVectorizer raises on an empty vocabulary).
        max_features: cap on the TF-IDF vocabulary size.

    Returns:
        The single most similar page text.
    """
    tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
    # Row 0 is the question; rows 1..n are the page texts.
    tfidf_matrix = tfidf_vectorizer.fit_transform([question] + contexts)
    similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
    return contexts[similarity_scores.argmax()]


def query(payload):
    """POST `payload` to the raw Ollama generate endpoint and return the parsed JSON.

    Fixes vs. original: adds a request timeout (requests has none by default,
    so a hung server would block forever) and surfaces HTTP errors explicitly.
    """
    response = requests.post(
        ollama_url, headers=ollama_headers, json=payload, timeout=60
    )
    response.raise_for_status()
    return response.json()


def answer_question_from_pdf(pdf_text, question):
    """Ask the chat model to answer `question` using `pdf_text` as context.

    Returns the full Ollama chat response object (the answer text lives in
    the response's message content — presumably `['message']['content']`;
    verify against the ollama client version in use).
    """
    # Typo fixed in the prompt: "lenghth" -> "length".
    prompt = (
        "Based on this content: " + pdf_text
        + " The Question is: " + question
        + " Provide the answer with max length of about 100"
    )
    return client.chat(
        model='mixtral:8x7b',
        messages=[
            {
                'role': 'user',
                'content': prompt,
            },
        ],
    )


def extract_text_from_pdf(pdf_file):
    """OCR every page of an uploaded PDF and return a list of per-page text strings.

    Each page is rasterized to an image via PyMuPDF, then run through
    Tesseract OCR. The document handle is closed even if OCR fails
    (the original leaked it).
    """
    pdf_document = fitz.open(stream=pdf_file.read(), filetype="pdf")
    try:
        pdf_arr = []
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            pix = page.get_pixmap()
            img = Image.open(io.BytesIO(pix.tobytes()))
            pdf_arr.append(pytesseract.image_to_string(img))
        return pdf_arr
    finally:
        pdf_document.close()


# ---------------------------------------------------------------------------
# Streamlit UI
# ---------------------------------------------------------------------------
st.title("PDF Explorer")

uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

if uploaded_file is not None:
    pdf_arr = extract_text_from_pdf(uploaded_file)
    st.write("PDF Uploaded Successfully.")

    question = st.text_input("Ask a question about the PDF")

    if st.button("Get Answer"):
        if question:
            # Fix: rank pages only once a non-empty question exists — the
            # original called find_most_relevant_context unconditionally,
            # which raises on an empty question string.
            pdf_text = find_most_relevant_context(pdf_arr, question)
            answer = answer_question_from_pdf(pdf_text, question)
            st.write("Answer:", answer)
        else:
            st.write("Please enter a question.")
else:
    st.write("Please upload a PDF file.")