Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
@@ -2,6 +2,46 @@ import streamlit as st
|
|
2 |
from PyPDF2 import PdfReader
|
3 |
import io
|
4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
# Mock function for answering questions from the PDF
|
6 |
# Replace this with your actual backend function
|
7 |
def answer_question_from_pdf(pdf_text, question):
|
@@ -13,9 +53,11 @@ def answer_question_from_pdf(pdf_text, question):
|
|
13 |
def extract_text_from_pdf(pdf_file):
|
14 |
pdf_reader = PdfReader(pdf_file)
|
15 |
pdf_text = ""
|
|
|
16 |
for page_num in range(len(pdf_reader.pages)):
|
17 |
-
pdf_text
|
18 |
-
|
|
|
19 |
|
20 |
# Streamlit app
|
21 |
st.title("PDF Explorer")
|
@@ -25,12 +67,13 @@ uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
|
|
25 |
|
26 |
if uploaded_file is not None:
|
27 |
# Extract text from uploaded PDF
|
28 |
-
|
29 |
|
30 |
st.write("PDF Uploaded Successfully.")
|
31 |
|
32 |
# Text input for entering a question
|
33 |
question = st.text_input("Ask a question about the PDF")
|
|
|
34 |
|
35 |
if st.button("Get Answer"):
|
36 |
if question:
|
|
|
2 |
from PyPDF2 import PdfReader
|
3 |
import io
|
4 |
|
5 |
+
import requests
|
6 |
+
import os
|
7 |
+
|
8 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
9 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
10 |
+
from datasets import load_dataset
|
11 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
12 |
+
my_token = os.getenv('my_repo_token')
|
13 |
+
def find_most_relevant_context(contexts, question, max_features=10000):
    """Return the entry of ``contexts`` that best matches ``question``.

    The question and all contexts are vectorized together with TF-IDF, and
    the contexts are ranked by cosine similarity to the question.

    Args:
        contexts: Sequence of text passages to choose from.
        question: Query text to compare each passage against.
        max_features: Upper bound on the TF-IDF vocabulary size.

    Returns:
        The context with the highest similarity score. Returns an empty
        string when ``contexts`` is empty, and the first context when the
        texts cannot be vectorized (nothing to rank by).
    """
    contexts = list(contexts)
    if not contexts:
        # Nothing to rank; argmax over an empty score array would raise.
        return ""

    tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
    try:
        tfidf_matrix = tfidf_vectorizer.fit_transform([question] + contexts)
    except ValueError:
        # scikit-learn raises ValueError when no text yields a token
        # ("empty vocabulary"), e.g. pages with no extractable text.
        # Fall back to the first context, which is also what an all-zero
        # score vector would select.
        return contexts[0]

    # Row 0 is the question; the remaining rows are the contexts.
    similarity_scores = cosine_similarity(
        tfidf_matrix[0:1], tfidf_matrix[1:]
    ).flatten()

    most_relevant_index = similarity_scores.argmax()
    return contexts[most_relevant_index]
|
25 |
+
|
26 |
+
|
27 |
+
|
28 |
+
|
29 |
+
|
30 |
+
|
31 |
+
|
32 |
+
|
33 |
+
API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"
|
34 |
+
headers = {"Authorization": f"Bearer {my_token}"}
|
35 |
+
|
36 |
+
def query(payload, timeout=60):
    """POST ``payload`` as JSON to ``API_URL`` and return the decoded reply.

    Args:
        payload: JSON-serializable request body.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests`` can block indefinitely on a stalled
            endpoint, which would hang the app.

    Returns:
        The response body parsed as JSON. The HTTP status is not checked
        here, so an error payload from the server is returned as-is.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    response = requests.post(
        API_URL, headers=headers, json=payload, timeout=timeout
    )
    return response.json()
|
39 |
+
|
40 |
+
output = query({
|
41 |
+
"inputs": instruction,
|
42 |
+
})
|
43 |
+
|
44 |
+
|
45 |
# Mock function for answering questions from the PDF
|
46 |
# Replace this with your actual backend function
|
47 |
def answer_question_from_pdf(pdf_text, question):
|
|
|
53 |
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF.

    Args:
        pdf_file: PDF source handed straight to ``PdfReader`` (the caller
            passes the uploaded file object).

    Returns:
        A list with one string per page, in page order.
    """
    pdf_reader = PdfReader(pdf_file)
    # `or ""` keeps every entry a string even if a page yields no text,
    # so the result can be vectorized downstream without type errors.
    return [page.extract_text() or "" for page in pdf_reader.pages]
|
61 |
|
62 |
# Streamlit app
|
63 |
st.title("PDF Explorer")
|
|
|
67 |
|
68 |
if uploaded_file is not None:
|
69 |
# Extract text from uploaded PDF
|
70 |
+
pdf_arr = extract_text_from_pdf(uploaded_file)
|
71 |
|
72 |
st.write("PDF Uploaded Successfully.")
|
73 |
|
74 |
# Text input for entering a question
|
75 |
question = st.text_input("Ask a question about the PDF")
|
76 |
+
pdf_text = find_most_relevant_context(pdf_arr,question)
|
77 |
|
78 |
if st.button("Get Answer"):
|
79 |
if question:
|