ARKamaliD commited on
Commit
c83b1e7
1 Parent(s): fa99cd6

adjusted to my local ollama instance

Browse files
Files changed (1) hide show
  1. app.py +33 -28
app.py CHANGED
@@ -1,46 +1,40 @@
1
  import streamlit as st
2
- from PyPDF2 import PdfReader
3
  import pytesseract
4
  from PIL import Image
5
  import fitz
6
  import io
7
 
8
  import requests
9
- import os
10
 
11
  from sklearn.feature_extraction.text import TfidfVectorizer
12
  from sklearn.metrics.pairwise import cosine_similarity
13
 
 
 
 
 
14
 
15
- my_token = os.getenv('my_repo_token')
16
  def find_most_relevant_context(contexts, question, max_features=10000):
17
  # Vectorize contexts and question with limited features
18
  tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
19
  tfidf_matrix = tfidf_vectorizer.fit_transform([question] + contexts)
20
-
21
  # Compute cosine similarity between question and contexts
22
  similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
23
-
24
  # Get index of context with highest similarity
25
  most_relevant_index = similarity_scores.argmax()
26
-
27
- return contexts[most_relevant_index]
28
-
29
-
30
-
31
-
32
 
 
33
 
34
 
 
 
35
 
36
- API_URL = "https://api-inference.huggingface.co/models/google/gemma-7b"
37
- headers = {"Authorization": f"Bearer {my_token}"}
38
 
39
  def query(payload):
40
- response = requests.post(API_URL, headers=headers, json=payload)
41
- return response.json()
42
-
43
-
44
 
45
 
46
  # Mock function for answering questions from the PDF
@@ -48,30 +42,41 @@ def query(payload):
48
  def answer_question_from_pdf(pdf_text, question):
49
  # This function should return the answer to the question based on the PDF content
50
  # Here we just return a mock response
51
-
52
- return query( {"inputs": "Based on this content: " + pdf_text+" The Question is: "+ question + " Provide the answer with max lenghth of about 100",})
 
 
 
 
 
 
 
 
53
 
54
  # Function to extract text from PDF
55
  def extract_text_from_pdf(pdf_file):
56
  # Open the PDF file
57
  pdf_document = fitz.open(stream=pdf_file.read(), filetype="pdf")
58
-
59
  pdf_arr = []
60
-
61
  # Iterate through each page
62
  for page_num in range(len(pdf_document)):
63
  # Get the page
64
  page = pdf_document.load_page(page_num)
65
-
66
  # Get the page as an image
67
  pix = page.get_pixmap()
68
  img = Image.open(io.BytesIO(pix.tobytes()))
69
-
70
  # Perform OCR on the image
 
71
  pdf_text = pytesseract.image_to_string(img)
72
  pdf_arr.append(pdf_text)
73
-
74
  return pdf_arr
 
 
75
  # Streamlit app
76
  st.title("PDF Explorer")
77
 
@@ -81,13 +86,13 @@ uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
81
  if uploaded_file is not None:
82
  # Extract text from uploaded PDF
83
  pdf_arr = extract_text_from_pdf(uploaded_file)
84
-
85
  st.write("PDF Uploaded Successfully.")
86
-
87
  # Text area for entering a question
88
  question = st.text_input("Ask a question about the PDF")
89
- pdf_text = find_most_relevant_context(pdf_arr,question)
90
-
91
  if st.button("Get Answer"):
92
  if question:
93
  # Get the answer from the backend
 
1
  import streamlit as st
 
2
  import pytesseract
3
  from PIL import Image
4
  import fitz
5
  import io
6
 
7
  import requests
 
8
 
9
  from sklearn.feature_extraction.text import TfidfVectorizer
10
  from sklearn.metrics.pairwise import cosine_similarity
11
 
12
from ollama import Client

# Module-level client bound to a local Ollama instance.
# 11434 is Ollama's default HTTP port — NOTE(review): host is hard-coded;
# confirm this matches the deployment environment.
client = Client(host='http://localhost:11434')
15
+
16
 
 
17
def find_most_relevant_context(contexts, question, max_features=10000):
    """Return the element of *contexts* most similar to *question*.

    Similarity is TF-IDF cosine similarity computed over a vocabulary
    shared between the question and all contexts.

    Parameters
    ----------
    contexts : list[str]
        Candidate text chunks (here: one OCR'd string per PDF page).
    question : str
        The user's question.
    max_features : int, optional
        Cap on the TF-IDF vocabulary size to bound memory use.

    Returns
    -------
    str
        The most similar context, or an empty string when *contexts*
        is empty.
    """
    # Guard: with no contexts, tfidf_matrix[1:] is empty and argmax()
    # would raise ValueError.
    if not contexts:
        return ""

    # Fit question + contexts together so they share one vocabulary.
    tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
    tfidf_matrix = tfidf_vectorizer.fit_transform([question] + contexts)

    # Row 0 is the question; rows 1.. are the contexts.
    similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

    # Index of the context with the highest similarity to the question.
    most_relevant_index = similarity_scores.argmax()

    return contexts[most_relevant_index]
29
 
30
 
31
+ ollama_url = "http://localhost:11434/api/generate"
32
+ ollama_headers = {"Content-Type": "application/json"}
33
 
 
 
34
 
35
def query(payload):
    """POST *payload* as JSON to the local Ollama /api/generate endpoint.

    Parameters
    ----------
    payload : dict
        JSON-serializable request body for Ollama's generate API.

    Returns
    -------
    dict
        The decoded JSON response.

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond in time.
    """
    # A timeout prevents the Streamlit app from hanging forever when the
    # Ollama server is down; 120 s leaves room for slow generations.
    response = requests.post(ollama_url, headers=ollama_headers, json=payload, timeout=120)
    # Fail loudly on HTTP errors instead of silently returning an
    # error-shaped JSON body to the caller.
    response.raise_for_status()
    return response.json()
 
 
38
 
39
 
40
  # Mock function for answering questions from the PDF
 
42
def answer_question_from_pdf(pdf_text, question):
    """Ask the local Ollama model to answer *question* from *pdf_text*.

    Parameters
    ----------
    pdf_text : str
        Context extracted from the PDF (the most relevant page's text).
    question : str
        The user's question.

    Returns
    -------
    The chat response object returned by the Ollama client.
    """
    # Build the prompt in one place; fixed the typo "lenghth" -> "length"
    # so the length instruction reads correctly to the model.
    prompt = (
        "Based on this content: " + pdf_text
        + " The Question is: " + question
        + " Provide the answer with max length of about 100"
    )
    return client.chat(
        model='mixtral:8x7b',
        messages=[
            {
                'role': 'user',
                'content': prompt,
            },
        ],
    )
54
+
55
 
56
  # Function to extract text from PDF
57
  def extract_text_from_pdf(pdf_file):
58
  # Open the PDF file
59
  pdf_document = fitz.open(stream=pdf_file.read(), filetype="pdf")
60
+
61
  pdf_arr = []
62
+
63
  # Iterate through each page
64
  for page_num in range(len(pdf_document)):
65
  # Get the page
66
  page = pdf_document.load_page(page_num)
67
+
68
  # Get the page as an image
69
  pix = page.get_pixmap()
70
  img = Image.open(io.BytesIO(pix.tobytes()))
71
+
72
  # Perform OCR on the image
73
+ pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
74
  pdf_text = pytesseract.image_to_string(img)
75
  pdf_arr.append(pdf_text)
76
+
77
  return pdf_arr
78
+
79
+
80
  # Streamlit app
81
  st.title("PDF Explorer")
82
 
 
86
  if uploaded_file is not None:
87
  # Extract text from uploaded PDF
88
  pdf_arr = extract_text_from_pdf(uploaded_file)
89
+
90
  st.write("PDF Uploaded Successfully.")
91
+
92
  # Text area for entering a question
93
  question = st.text_input("Ask a question about the PDF")
94
+ pdf_text = find_most_relevant_context(pdf_arr, question)
95
+
96
  if st.button("Get Answer"):
97
  if question:
98
  # Get the answer from the backend