# Spaces: Paused
import io
import os

import fitz
import pytesseract
import requests
import streamlit as st
from ollama import Client
from PIL import Image
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Shared Ollama client used for chat requests; targets the server at localhost:11434.
client = Client(host='http://localhost:11434')
def find_most_relevant_context(contexts, question, max_features=10000):
    """Return the context whose TF-IDF vector is closest to ``question``.

    The question and all contexts are vectorized together, then the question
    is compared to each context by cosine similarity.

    Args:
        contexts: Iterable of text passages to search.
        question: The query text.
        max_features: Vocabulary size cap passed to ``TfidfVectorizer``.

    Returns:
        The best-matching context. Ties (including "nothing matches") resolve
        to the earliest context. Returns ``""`` when ``contexts`` is empty.
    """
    contexts = list(contexts)
    if not contexts:
        # Nothing to rank; avoid calling argmax on an empty score array.
        return ""
    if len(contexts) == 1:
        # A single candidate always wins, so skip vectorization entirely.
        return contexts[0]

    tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
    try:
        tfidf_matrix = tfidf_vectorizer.fit_transform([question, *contexts])
    except ValueError as err:
        # Raised when neither the question nor any context yields a token
        # (e.g. OCR returned only whitespace). Fall back to the first context,
        # which is what an all-zero similarity vector would select anyway.
        if "empty vocabulary" not in str(err):
            raise
        return contexts[0]

    # Row 0 is the question; the remaining rows are the contexts, in order.
    similarity_scores = cosine_similarity(
        tfidf_matrix[0:1], tfidf_matrix[1:]
    ).flatten()
    return contexts[similarity_scores.argmax()]
# Endpoint and headers used by query() to call the Ollama HTTP API directly.
ollama_url = "http://localhost:11434/api/generate"
ollama_headers = {"Content-Type": "application/json"}


def query(payload, timeout=(10, 600)):
    """POST ``payload`` as JSON to ``ollama_url`` and return the decoded reply.

    Args:
        payload: JSON-serializable request body.
        timeout: Forwarded to ``requests.post``; a ``(connect, read)`` tuple
            in seconds, a single number, or ``None`` to wait indefinitely.
            The read limit is generous because generation can be slow.

    Returns:
        The response body parsed as JSON.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # NOTE(review): response.json() expects a single JSON document. Ollama's
    # generate endpoint streams newline-delimited JSON unless the payload sets
    # "stream": false -- confirm callers disable streaming.
    response = requests.post(
        ollama_url, headers=ollama_headers, json=payload, timeout=timeout
    )
    # Surface HTTP errors instead of parsing an error body as if it succeeded.
    response.raise_for_status()
    return response.json()
# Ask the Ollama chat model to answer a question from extracted PDF text.
def answer_question_from_pdf(pdf_text, question, model='mixtral:8x7b'):
    """Send ``pdf_text`` and ``question`` to the chat model as one user prompt.

    Args:
        pdf_text: Text to ground the answer in.
        question: The user's question about that text.
        model: Name of the Ollama model to query.

    Returns:
        The object returned by ``client.chat``, unmodified.
    """
    prompt = (
        f"Based on this content: {pdf_text} "
        f"The Question is: {question} "
        "Provide the answer with max length of about 100"
    )
    return client.chat(
        model=model,
        messages=[
            {
                'role': 'user',
                'content': prompt,
            },
        ],
    )
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """OCR every page of a PDF and return the recognized text per page.

    Each page is rendered to an image and passed through Tesseract.

    Args:
        pdf_file: Binary file-like object; its ``read()`` must return the
            PDF bytes.

    Returns:
        A list of strings, one per page, in page order.
    """
    # Only override the Tesseract binary when the Windows install is actually
    # present; otherwise keep pytesseract's current setting so the function
    # also works where tesseract is found another way. Done once, not per page.
    windows_tesseract = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
    if os.path.isfile(windows_tesseract):
        pytesseract.pytesseract.tesseract_cmd = windows_tesseract

    pdf_arr = []
    # Context managers release the document and each page image promptly.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as pdf_document:
        for page in pdf_document:
            # Render the page to a pixmap, then hand the encoded bytes to PIL.
            pix = page.get_pixmap()
            with Image.open(io.BytesIO(pix.tobytes())) as img:
                pdf_arr.append(pytesseract.image_to_string(img))
    return pdf_arr
# Streamlit app
st.title("PDF Explorer")

# File uploader
uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

if uploaded_file is not None:
    # Extract text from uploaded PDF (one string per page)
    pdf_arr = extract_text_from_pdf(uploaded_file)
    st.write("PDF Uploaded Successfully.")

    # Text input for entering a question
    question = st.text_input("Ask a question about the PDF")

    if st.button("Get Answer"):
        if not question.strip():
            st.write("Please enter a question.")
        elif not any(page_text.strip() for page_text in pdf_arr):
            # Without any recognized text there is nothing to rank or answer from.
            st.write("No text could be extracted from this PDF.")
        else:
            # Pick the relevant page only once a real question was submitted,
            # rather than on every rerun of the script.
            pdf_text = find_most_relevant_context(pdf_arr, question)
            # Get the answer from the backend
            answer = answer_question_from_pdf(pdf_text, question)
            # Show just the reply text when the response has the usual
            # message/content shape; otherwise show the value as returned.
            try:
                answer_text = answer["message"]["content"]
            except (KeyError, TypeError):
                answer_text = answer
            st.write("Answer:", answer_text)
else:
    st.write("Please upload a PDF file.")