Spaces:
Paused
Paused
Adjusted to my local Ollama instance
Browse files
app.py
CHANGED
@@ -1,46 +1,40 @@
|
|
1 |
import streamlit as st
|
2 |
-
from PyPDF2 import PdfReader
|
3 |
import pytesseract
|
4 |
from PIL import Image
|
5 |
import fitz
|
6 |
import io
|
7 |
|
8 |
import requests
|
9 |
-
import os
|
10 |
|
11 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
12 |
from sklearn.metrics.pairwise import cosine_similarity
|
13 |
|
|
|
|
|
|
|
|
|
14 |
|
15 |
-
my_token = os.getenv('my_repo_token')
|
16 |
def find_most_relevant_context(contexts, question, max_features=10000):
|
17 |
# Vectorize contexts and question with limited features
|
18 |
tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
|
19 |
tfidf_matrix = tfidf_vectorizer.fit_transform([question] + contexts)
|
20 |
-
|
21 |
# Compute cosine similarity between question and contexts
|
22 |
similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
|
23 |
-
|
24 |
# Get index of context with highest similarity
|
25 |
most_relevant_index = similarity_scores.argmax()
|
26 |
-
|
27 |
-
return contexts[most_relevant_index]
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
|
|
|
33 |
|
34 |
|
|
|
|
|
35 |
|
36 |
-
API_URL = "https://api-inference.huggingface.co/models/google/gemma-7b"
|
37 |
-
headers = {"Authorization": f"Bearer {my_token}"}
|
38 |
|
39 |
def query(payload):
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
|
45 |
|
46 |
# Mock function for answering questions from the PDF
|
@@ -48,30 +42,41 @@ def query(payload):
|
|
48 |
def answer_question_from_pdf(pdf_text, question):
|
49 |
# This function should return the answer to the question based on the PDF content
|
50 |
# Here we just return a mock response
|
51 |
-
|
52 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
|
54 |
# Function to extract text from PDF
|
55 |
def extract_text_from_pdf(pdf_file):
|
56 |
# Open the PDF file
|
57 |
pdf_document = fitz.open(stream=pdf_file.read(), filetype="pdf")
|
58 |
-
|
59 |
pdf_arr = []
|
60 |
-
|
61 |
# Iterate through each page
|
62 |
for page_num in range(len(pdf_document)):
|
63 |
# Get the page
|
64 |
page = pdf_document.load_page(page_num)
|
65 |
-
|
66 |
# Get the page as an image
|
67 |
pix = page.get_pixmap()
|
68 |
img = Image.open(io.BytesIO(pix.tobytes()))
|
69 |
-
|
70 |
# Perform OCR on the image
|
|
|
71 |
pdf_text = pytesseract.image_to_string(img)
|
72 |
pdf_arr.append(pdf_text)
|
73 |
-
|
74 |
return pdf_arr
|
|
|
|
|
75 |
# Streamlit app
|
76 |
st.title("PDF Explorer")
|
77 |
|
@@ -81,13 +86,13 @@ uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
|
|
81 |
if uploaded_file is not None:
|
82 |
# Extract text from uploaded PDF
|
83 |
pdf_arr = extract_text_from_pdf(uploaded_file)
|
84 |
-
|
85 |
st.write("PDF Uploaded Successfully.")
|
86 |
-
|
87 |
# Text area for entering a question
|
88 |
question = st.text_input("Ask a question about the PDF")
|
89 |
-
pdf_text = find_most_relevant_context(pdf_arr,question)
|
90 |
-
|
91 |
if st.button("Get Answer"):
|
92 |
if question:
|
93 |
# Get the answer from the backend
|
|
|
1 |
import streamlit as st
|
|
|
2 |
import pytesseract
|
3 |
from PIL import Image
|
4 |
import fitz
|
5 |
import io
|
6 |
|
7 |
import requests
|
|
|
8 |
|
9 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
10 |
from sklearn.metrics.pairwise import cosine_similarity
|
11 |
|
12 |
+
from ollama import Client
|
13 |
+
|
14 |
+
# Chat client for the locally running Ollama server (used by answer_question_from_pdf).
client = Client(host='http://localhost:11434')
|
15 |
+
|
16 |
|
|
|
17 |
def find_most_relevant_context(contexts, question, max_features=10000):
    """Return the context passage most similar to *question*.

    Similarity is measured by fitting a TF-IDF vocabulary over the
    question plus all candidate contexts, then taking cosine similarity
    between the question vector and each context vector.

    Args:
        contexts: List of candidate text passages (one per PDF page).
        question: The user's question.
        max_features: Cap on the TF-IDF vocabulary size.

    Returns:
        The element of *contexts* with the highest cosine similarity
        to *question*.
    """
    # Fit a single vocabulary over question + contexts so every vector
    # lives in the same feature space.
    vectorizer = TfidfVectorizer(max_features=max_features)
    matrix = vectorizer.fit_transform([question] + contexts)

    # Row 0 is the question; rows 1.. are the contexts.
    scores = cosine_similarity(matrix[0:1], matrix[1:]).flatten()

    # Index of the best-scoring context.
    best = scores.argmax()
    return contexts[best]
|
29 |
|
30 |
|
31 |
+
# Endpoint and headers for the local Ollama HTTP generate API (used by query()).
ollama_url = "http://localhost:11434/api/generate"
ollama_headers = {"Content-Type": "application/json"}
|
33 |
|
|
|
|
|
34 |
|
35 |
def query(payload):
    """POST *payload* to the local Ollama generate endpoint.

    Args:
        payload: JSON-serializable request body for /api/generate.

    Returns:
        The decoded JSON response from the server.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    # A timeout keeps the Streamlit app from hanging forever when the
    # local Ollama server is down or unresponsive (requests has no
    # default timeout).
    response = requests.post(
        ollama_url, headers=ollama_headers, json=payload, timeout=120
    )
    return response.json()
|
|
|
|
|
38 |
|
39 |
|
40 |
# Answer a question about the PDF by prompting the local Ollama model.
def answer_question_from_pdf(pdf_text, question):
    """Ask the Ollama chat model to answer *question* given *pdf_text*.

    Args:
        pdf_text: The most relevant page of extracted PDF text.
        question: The user's question.

    Returns:
        The raw chat response object returned by the Ollama client.
    """
    # Fixed typo in the prompt ("lenghth" -> "length"); the instruction
    # is sent to the model verbatim, so spelling matters.
    prompt = (
        "Based on this content: " + pdf_text
        + " The Question is: " + question
        + " Provide the answer with max length of about 100"
    )
    return client.chat(
        model='mixtral:8x7b',
        messages=[
            {
                'role': 'user',
                'content': prompt,
            },
        ],
    )
|
54 |
+
|
55 |
|
56 |
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """OCR every page of an uploaded PDF and return the text per page.

    Each page is rendered to an image first so scanned (image-only)
    pages are handled as well as text pages.

    Args:
        pdf_file: A file-like object with the PDF bytes
            (e.g. a Streamlit UploadedFile).

    Returns:
        list[str]: OCR'd text, one entry per page.
    """
    # NOTE(review): hard-coded Windows install path makes the app
    # non-portable — consider moving this to configuration.
    # Hoisted out of the per-page loop: it only needs to be set once.
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

    # Open the PDF from the uploaded bytes.
    pdf_document = fitz.open(stream=pdf_file.read(), filetype="pdf")
    try:
        pdf_arr = []
        # Iterate through each page.
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            # Render the page to an image for OCR.
            pix = page.get_pixmap()
            img = Image.open(io.BytesIO(pix.tobytes()))
            # Perform OCR on the rendered page image.
            pdf_arr.append(pytesseract.image_to_string(img))
        return pdf_arr
    finally:
        # Release the document's resources even if OCR fails mid-way
        # (the original never closed the document).
        pdf_document.close()
|
78 |
+
|
79 |
+
|
80 |
# Streamlit app
|
81 |
st.title("PDF Explorer")
|
82 |
|
|
|
86 |
if uploaded_file is not None:
|
87 |
# Extract text from uploaded PDF
|
88 |
pdf_arr = extract_text_from_pdf(uploaded_file)
|
89 |
+
|
90 |
st.write("PDF Uploaded Successfully.")
|
91 |
+
|
92 |
# Text area for entering a question
|
93 |
question = st.text_input("Ask a question about the PDF")
|
94 |
+
pdf_text = find_most_relevant_context(pdf_arr, question)
|
95 |
+
|
96 |
if st.button("Get Answer"):
|
97 |
if question:
|
98 |
# Get the answer from the backend
|