pdf_reader / app.py
ARKamaliD's picture
adjusted to my local ollama instance
c83b1e7
raw
history blame contribute delete
No virus
3.14 kB
import streamlit as st
import pytesseract
from PIL import Image
import fitz
import io
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from ollama import Client
# Chat client pointed at a locally running Ollama server (default port 11434);
# used by answer_question_from_pdf below.
client = Client(host='http://localhost:11434')
def find_most_relevant_context(contexts, question, max_features=10000):
    """Return the context string from *contexts* most similar to *question*.

    Similarity is TF-IDF cosine similarity; the question and all contexts are
    vectorized together so they share one vocabulary.

    Args:
        contexts: Non-empty list of context strings (one per PDF page).
        question: The user's question.
        max_features: Cap on the TF-IDF vocabulary size to bound memory use.

    Returns:
        The context with the highest cosine similarity to the question.
        Falls back to the first context when no vocabulary can be built
        (e.g. an empty or stop-word-only question and contexts).

    Raises:
        ValueError: If *contexts* is empty.
    """
    if not contexts:
        raise ValueError("contexts must be a non-empty list of strings")
    # Row 0 of the matrix is the question; rows 1..n are the contexts.
    tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
    try:
        tfidf_matrix = tfidf_vectorizer.fit_transform([question] + contexts)
    except ValueError:
        # sklearn raises "empty vocabulary" when every input tokenizes to
        # nothing — no basis for ranking, so return a best-effort default.
        return contexts[0]
    # Cosine similarity between the question row and every context row.
    similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
    return contexts[similarity_scores.argmax()]
# Raw HTTP endpoint and headers for Ollama's /api/generate route, used by
# query() as an alternative to the ollama client library.
ollama_url = "http://localhost:11434/api/generate"
ollama_headers = {"Content-Type": "application/json"}
def query(payload, timeout=60):
    """POST *payload* to the Ollama /api/generate endpoint and return the parsed JSON.

    Args:
        payload: JSON-serializable dict (e.g. {"model": ..., "prompt": ...}).
        timeout: Seconds to wait for the server. Without a timeout,
            requests.post can block indefinitely if the server hangs.

    Returns:
        dict: The decoded JSON response body.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.exceptions.Timeout: If no response arrives within *timeout*.
    """
    response = requests.post(ollama_url, headers=ollama_headers, json=payload, timeout=timeout)
    # Surface HTTP errors instead of silently returning an error body as a result.
    response.raise_for_status()
    return response.json()
def answer_question_from_pdf(pdf_text, question):
    """Answer *question* grounded in *pdf_text* via the Ollama chat API.

    Args:
        pdf_text: OCR-extracted text of the most relevant PDF page.
        question: The user's question about the PDF.

    Returns:
        str: The model's answer text.
    """
    response = client.chat(
        model='mixtral:8x7b',
        messages=[
            {
                'role': 'user',
                # Prompt asks the model to keep the answer to roughly
                # 100 units (fixed the original "lenghth" typo).
                'content': (
                    "Based on this content: " + pdf_text
                    + " The Question is: " + question
                    + " Provide the answer with max length of about 100"
                ),
            },
        ],
    )
    # Extract just the assistant's text so callers display a clean answer
    # instead of the raw chat-response structure.
    return response['message']['content']
def extract_text_from_pdf(pdf_file):
    """OCR every page of an uploaded PDF and return a list of page texts.

    Each page is rasterized with PyMuPDF and run through Tesseract, so this
    also works for scanned PDFs with no embedded text layer.

    Args:
        pdf_file: File-like object (e.g. a Streamlit UploadedFile) with PDF bytes.

    Returns:
        list[str]: OCR text for each page, in page order.
    """
    # Loop-invariant setting, hoisted out of the per-page loop.
    # NOTE(review): hard-coded Windows install path for the Tesseract binary —
    # confirm before deploying on a non-Windows host.
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
    pdf_document = fitz.open(stream=pdf_file.read(), filetype="pdf")
    try:
        pdf_arr = []
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            # Rasterize the page, then OCR the rendered image.
            pix = page.get_pixmap()
            img = Image.open(io.BytesIO(pix.tobytes()))
            pdf_arr.append(pytesseract.image_to_string(img))
        return pdf_arr
    finally:
        # fitz documents hold native resources; close explicitly instead of
        # leaking one per upload.
        pdf_document.close()
# --- Streamlit UI ---
st.title("PDF Explorer")

# File uploader for the source document.
uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

if uploaded_file is not None:
    # OCR every page of the uploaded PDF (re-runs on each upload).
    pdf_arr = extract_text_from_pdf(uploaded_file)
    st.write("PDF Uploaded Successfully.")

    # Text input for the user's question.
    question = st.text_input("Ask a question about the PDF")

    if st.button("Get Answer"):
        if question:
            # Select the most relevant page only after confirming the question
            # is non-empty — the original called this unconditionally, which
            # crashed TF-IDF vectorization on an empty question.
            pdf_text = find_most_relevant_context(pdf_arr, question)
            answer = answer_question_from_pdf(pdf_text, question)
            st.write("Answer:", answer)
        else:
            st.write("Please enter a question.")
else:
    st.write("Please upload a PDF file.")