Spaces:
Paused
Paused
Adjusted to my local Ollama instance
Browse files
app.py
CHANGED
@@ -1,46 +1,40 @@
|
|
1 |
import streamlit as st
|
2 |
-
from PyPDF2 import PdfReader
|
3 |
import pytesseract
|
4 |
from PIL import Image
|
5 |
import fitz
|
6 |
import io
|
7 |
|
8 |
import requests
|
9 |
-
import os
|
10 |
|
11 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
12 |
from sklearn.metrics.pairwise import cosine_similarity
|
13 |
|
|
|
|
|
|
|
|
|
14 |
|
15 |
-
my_token = os.getenv('my_repo_token')
|
16 |
def find_most_relevant_context(contexts, question, max_features=10000):
|
17 |
# Vectorize contexts and question with limited features
|
18 |
tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
|
19 |
tfidf_matrix = tfidf_vectorizer.fit_transform([question] + contexts)
|
20 |
-
|
21 |
# Compute cosine similarity between question and contexts
|
22 |
similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
|
23 |
-
|
24 |
# Get index of context with highest similarity
|
25 |
most_relevant_index = similarity_scores.argmax()
|
26 |
-
|
27 |
-
return contexts[most_relevant_index]
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
|
|
|
33 |
|
34 |
|
|
|
|
|
35 |
|
36 |
-
API_URL = "https://api-inference.huggingface.co/models/google/gemma-7b"
|
37 |
-
headers = {"Authorization": f"Bearer {my_token}"}
|
38 |
|
39 |
def query(payload):
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
|
45 |
|
46 |
# Mock function for answering questions from the PDF
|
@@ -48,30 +42,41 @@ def query(payload):
|
|
48 |
def answer_question_from_pdf(pdf_text, question):
|
49 |
# This function should return the answer to the question based on the PDF content
|
50 |
# Here we just return a mock response
|
51 |
-
|
52 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
|
54 |
# Function to extract text from PDF
|
55 |
def extract_text_from_pdf(pdf_file):
|
56 |
# Open the PDF file
|
57 |
pdf_document = fitz.open(stream=pdf_file.read(), filetype="pdf")
|
58 |
-
|
59 |
pdf_arr = []
|
60 |
-
|
61 |
# Iterate through each page
|
62 |
for page_num in range(len(pdf_document)):
|
63 |
# Get the page
|
64 |
page = pdf_document.load_page(page_num)
|
65 |
-
|
66 |
# Get the page as an image
|
67 |
pix = page.get_pixmap()
|
68 |
img = Image.open(io.BytesIO(pix.tobytes()))
|
69 |
-
|
70 |
# Perform OCR on the image
|
|
|
71 |
pdf_text = pytesseract.image_to_string(img)
|
72 |
pdf_arr.append(pdf_text)
|
73 |
-
|
74 |
return pdf_arr
|
|
|
|
|
75 |
# Streamlit app
|
76 |
st.title("PDF Explorer")
|
77 |
|
@@ -81,13 +86,13 @@ uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
|
|
81 |
if uploaded_file is not None:
|
82 |
# Extract text from uploaded PDF
|
83 |
pdf_arr = extract_text_from_pdf(uploaded_file)
|
84 |
-
|
85 |
st.write("PDF Uploaded Successfully.")
|
86 |
-
|
87 |
# Text area for entering a question
|
88 |
question = st.text_input("Ask a question about the PDF")
|
89 |
-
pdf_text = find_most_relevant_context(pdf_arr,question)
|
90 |
-
|
91 |
if st.button("Get Answer"):
|
92 |
if question:
|
93 |
# Get the answer from the backend
|
|
|
1 |
import streamlit as st
|
|
|
2 |
import pytesseract
|
3 |
from PIL import Image
|
4 |
import fitz
|
5 |
import io
|
6 |
|
7 |
import requests
|
|
|
8 |
|
9 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
10 |
from sklearn.metrics.pairwise import cosine_similarity
|
11 |
|
12 |
+
from ollama import Client
|
13 |
+
|
14 |
+
# Chat client for the locally running Ollama server (used by answer_question_from_pdf).
client = Client(host='http://localhost:11434')
|
15 |
+
|
16 |
|
|
|
17 |
def find_most_relevant_context(contexts, question, max_features=10000):
    """Return the context passage most similar to *question*.

    Similarity is measured by fitting a TF-IDF vocabulary over the
    question plus all candidate contexts, then taking cosine similarity
    between the question vector and each context vector.

    Args:
        contexts: List of candidate text passages (one per PDF page).
        question: The user's question.
        max_features: Cap on the TF-IDF vocabulary size.

    Returns:
        The element of *contexts* with the highest cosine similarity
        to *question*.
    """
    # Fit a single vocabulary over question + contexts so every vector
    # lives in the same feature space.
    vectorizer = TfidfVectorizer(max_features=max_features)
    matrix = vectorizer.fit_transform([question] + contexts)

    # Row 0 is the question; rows 1.. are the contexts.
    scores = cosine_similarity(matrix[0:1], matrix[1:]).flatten()

    # Index of the best-scoring context.
    best = scores.argmax()
    return contexts[best]
|
29 |
|
30 |
|
31 |
+
# Endpoint and headers for the local Ollama HTTP generate API (used by query()).
ollama_url = "http://localhost:11434/api/generate"
ollama_headers = {"Content-Type": "application/json"}
|
33 |
|
|
|
|
|
34 |
|
35 |
def query(payload):
    """POST *payload* to the local Ollama generate endpoint.

    Args:
        payload: JSON-serializable request body for /api/generate.

    Returns:
        The decoded JSON response from the server.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    # A timeout keeps the Streamlit app from hanging forever when the
    # local Ollama server is down or unresponsive (requests has no
    # default timeout).
    response = requests.post(
        ollama_url, headers=ollama_headers, json=payload, timeout=120
    )
    return response.json()
|
|
|
|
|
38 |
|
39 |
|
40 |
# Answer a question about the PDF by prompting the local Ollama model.
def answer_question_from_pdf(pdf_text, question):
    """Ask the Ollama chat model to answer *question* given *pdf_text*.

    Args:
        pdf_text: The most relevant page of extracted PDF text.
        question: The user's question.

    Returns:
        The raw chat response object returned by the Ollama client.
    """
    # Fixed typo in the prompt ("lenghth" -> "length"); the instruction
    # is sent to the model verbatim, so spelling matters.
    prompt = (
        "Based on this content: " + pdf_text
        + " The Question is: " + question
        + " Provide the answer with max length of about 100"
    )
    return client.chat(
        model='mixtral:8x7b',
        messages=[
            {
                'role': 'user',
                'content': prompt,
            },
        ],
    )
|
54 |
+
|
55 |
|
56 |
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """OCR every page of an uploaded PDF and return the text per page.

    Each page is rendered to an image first so scanned (image-only)
    pages are handled as well as text pages.

    Args:
        pdf_file: A file-like object with the PDF bytes
            (e.g. a Streamlit UploadedFile).

    Returns:
        list[str]: OCR'd text, one entry per page.
    """
    # NOTE(review): hard-coded Windows install path makes the app
    # non-portable — consider moving this to configuration.
    # Hoisted out of the per-page loop: it only needs to be set once.
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

    # Open the PDF from the uploaded bytes.
    pdf_document = fitz.open(stream=pdf_file.read(), filetype="pdf")
    try:
        pdf_arr = []
        # Iterate through each page.
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            # Render the page to an image for OCR.
            pix = page.get_pixmap()
            img = Image.open(io.BytesIO(pix.tobytes()))
            # Perform OCR on the rendered page image.
            pdf_arr.append(pytesseract.image_to_string(img))
        return pdf_arr
    finally:
        # Release the document's resources even if OCR fails mid-way
        # (the original never closed the document).
        pdf_document.close()
|
78 |
+
|
79 |
+
|
80 |
# Streamlit app
|
81 |
st.title("PDF Explorer")
|
82 |
|
|
|
86 |
if uploaded_file is not None:
|
87 |
# Extract text from uploaded PDF
|
88 |
pdf_arr = extract_text_from_pdf(uploaded_file)
|
89 |
+
|
90 |
st.write("PDF Uploaded Successfully.")
|
91 |
+
|
92 |
# Text area for entering a question
|
93 |
question = st.text_input("Ask a question about the PDF")
|
94 |
+
pdf_text = find_most_relevant_context(pdf_arr, question)
|
95 |
+
|
96 |
if st.button("Get Answer"):
|
97 |
if question:
|
98 |
# Get the answer from the backend
|