"""Streamlit app: upload a PDF, OCR each page, and answer questions about it via Ollama.

Pipeline: PDF -> per-page rasterization (PyMuPDF) -> OCR (Tesseract) ->
TF-IDF page ranking against the user's question -> LLM answer (Ollama chat).
"""

import io

import fitz  # PyMuPDF
import pytesseract
import requests
import streamlit as st
from ollama import Client
from PIL import Image
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Chat client for the local Ollama server.
client = Client(host='http://localhost:11434')

# Raw generate-endpoint details used by `query`.
ollama_url = "http://localhost:11434/api/generate"
ollama_headers = {"Content-Type": "application/json"}

# Tesseract binary location (Windows install default). Set once at import time
# instead of on every OCR call as the original did.
# NOTE(review): hard-coded Windows path — confirm for the deployment platform.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'


def find_most_relevant_context(contexts, question, max_features=10000):
    """Return the page text from `contexts` most similar to `question`.

    Ranks pages by cosine similarity over a TF-IDF vectorization of the
    question plus all page texts.

    Args:
        contexts: non-empty list of page-text strings.
        question: the user's question (must contain at least one term,
            otherwise TfidfVectorizer raises on an empty vocabulary).
        max_features: cap on the TF-IDF vocabulary size.

    Returns:
        The single most similar page text.
    """
    tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
    # Row 0 is the question; rows 1..n are the page texts.
    tfidf_matrix = tfidf_vectorizer.fit_transform([question] + contexts)
    similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
    return contexts[similarity_scores.argmax()]


def query(payload):
    """POST `payload` to the raw Ollama generate endpoint and return the parsed JSON.

    Fixes vs. original: adds a request timeout (requests has none by default,
    so a hung server would block forever) and surfaces HTTP errors explicitly.
    """
    response = requests.post(
        ollama_url, headers=ollama_headers, json=payload, timeout=60
    )
    response.raise_for_status()
    return response.json()


def answer_question_from_pdf(pdf_text, question):
    """Ask the chat model to answer `question` using `pdf_text` as context.

    Returns the full Ollama chat response object (the answer text lives in
    the response's message content — presumably `['message']['content']`;
    verify against the ollama client version in use).
    """
    # Typo fixed in the prompt: "lenghth" -> "length".
    prompt = (
        "Based on this content: " + pdf_text
        + " The Question is: " + question
        + " Provide the answer with max length of about 100"
    )
    return client.chat(
        model='mixtral:8x7b',
        messages=[
            {
                'role': 'user',
                'content': prompt,
            },
        ],
    )


def extract_text_from_pdf(pdf_file):
    """OCR every page of an uploaded PDF and return a list of per-page text strings.

    Each page is rasterized to an image via PyMuPDF, then run through
    Tesseract OCR. The document handle is closed even if OCR fails
    (the original leaked it).
    """
    pdf_document = fitz.open(stream=pdf_file.read(), filetype="pdf")
    try:
        pdf_arr = []
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            pix = page.get_pixmap()
            img = Image.open(io.BytesIO(pix.tobytes()))
            pdf_arr.append(pytesseract.image_to_string(img))
        return pdf_arr
    finally:
        pdf_document.close()


# ---------------------------------------------------------------------------
# Streamlit UI
# ---------------------------------------------------------------------------
st.title("PDF Explorer")

uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

if uploaded_file is not None:
    pdf_arr = extract_text_from_pdf(uploaded_file)
    st.write("PDF Uploaded Successfully.")

    question = st.text_input("Ask a question about the PDF")

    if st.button("Get Answer"):
        if question:
            # Fix: rank pages only once a non-empty question exists — the
            # original called find_most_relevant_context unconditionally,
            # which raises on an empty question string.
            pdf_text = find_most_relevant_context(pdf_arr, question)
            answer = answer_question_from_pdf(pdf_text, question)
            st.write("Answer:", answer)
        else:
            st.write("Please enter a question.")
else:
    st.write("Please upload a PDF file.")