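"""
PDF Explorer: a Streamlit app that OCRs an uploaded PDF page by page,
picks the page most relevant to the user's question with TF-IDF cosine
similarity, and asks a local Ollama model to answer from that page.

Assumed setup (adjust for your environment):
  - An Ollama server running at http://localhost:11434 with the
    'mixtral:8x7b' model pulled.
  - Tesseract OCR installed (the hard-coded Windows path below may need changing).
  - Launch with: streamlit run <this_file>.py
"""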
import streamlit as st
import pytesseract
from PIL import Image
import fitz  # PyMuPDF
import io

import requests

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from ollama import Client

client = Client(host='http://localhost:11434')


def find_most_relevant_context(contexts, question, max_features=10000):
    # Vectorize contexts and question with limited features
    tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
    tfidf_matrix = tfidf_vectorizer.fit_transform([question] + contexts)

    # Compute cosine similarity between question and contexts
    similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

    # Get index of context with highest similarity
    most_relevant_index = similarity_scores.argmax()

    return contexts[most_relevant_index]

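# Example (illustrative only):
#   find_most_relevant_context(
#       ["this page describes cats", "this page describes the GPU used"],
#       "Which GPU is used?")
#   -> "this page describes the GPU used"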

# Direct REST access to Ollama's /api/generate endpoint
# (not used by the Streamlit flow below, which goes through the ollama client)
ollama_url = "http://localhost:11434/api/generate"
ollama_headers = {"Content-Type": "application/json"}


def query(payload):
    # POST the payload to the Ollama server and return its JSON response
    response = requests.post(ollama_url, headers=ollama_headers, json=payload)
    return response.json()

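# Example payload for the raw endpoint (stream=False so the reply arrives as a
# single JSON object that response.json() can parse):
#   query({"model": "mixtral:8x7b", "prompt": "Say hello", "stream": False})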

# Answer a question about the PDF content using the local Ollama model
def answer_question_from_pdf(pdf_text, question):
    # Send the most relevant page text plus the question to the model
    response = client.chat(
        model='mixtral:8x7b',
        messages=[
            {
                'role': 'user',
                'content': "Based on this content: " + pdf_text
                           + " The question is: " + question
                           + " Provide the answer with a maximum length of about 100.",
            },
        ],
    )
    # Return only the generated answer text rather than the full response object
    return response['message']['content']

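# Example (illustrative only; the returned text depends on the model):
#   answer_question_from_pdf("Revenue grew 12% in Q3 2023.", "How much did revenue grow?")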

# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    # Open the PDF file
    pdf_document = fitz.open(stream=pdf_file.read(), filetype="pdf")

    # Point pytesseract at the Tesseract binary (Windows default install path;
    # adjust or remove this line if tesseract is already on PATH)
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

    pdf_arr = []

    # Iterate through each page
    for page_num in range(len(pdf_document)):
        # Load the page
        page = pdf_document.load_page(page_num)

        # Render the page to an image
        pix = page.get_pixmap()
        img = Image.open(io.BytesIO(pix.tobytes()))

        # Run OCR on the rendered page image
        pdf_text = pytesseract.image_to_string(img)
        pdf_arr.append(pdf_text)

    return pdf_arr

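# Note: rendering pages and running OCR handles scanned PDFs; for PDFs with an
# embedded text layer, PyMuPDF's page.get_text() would be faster and more accurate.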

# Streamlit app
st.title("PDF Explorer")

# File uploader
uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

if uploaded_file is not None:
    # Extract text from uploaded PDF
    pdf_arr = extract_text_from_pdf(uploaded_file)

    st.write("PDF Uploaded Successfully.")

    # Text input for entering a question
    question = st.text_input("Ask a question about the PDF")

    if st.button("Get Answer"):
        if question:
            # Pick the page most relevant to the question, then ask the model
            pdf_text = find_most_relevant_context(pdf_arr, question)
            answer = answer_question_from_pdf(pdf_text, question)
            st.write("Answer:", answer)
        else:
            st.write("Please enter a question.")
else:
    st.write("Please upload a PDF file.")