navid72m commited on
Commit
0d5476d
1 Parent(s): e58aa9d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -3
app.py CHANGED
@@ -2,6 +2,46 @@ import streamlit as st
2
  from PyPDF2 import PdfReader
3
  import io
4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  # Mock function for answering questions from the PDF
6
  # Replace this with your actual backend function
7
  def answer_question_from_pdf(pdf_text, question):
@@ -13,9 +53,11 @@ def answer_question_from_pdf(pdf_text, question):
13
  def extract_text_from_pdf(pdf_file):
14
  pdf_reader = PdfReader(pdf_file)
15
  pdf_text = ""
 
16
  for page_num in range(len(pdf_reader.pages)):
17
- pdf_text += pdf_reader.pages[page_num].extract_text()
18
- return pdf_text
 
19
 
20
  # Streamlit app
21
  st.title("PDF Explorer")
@@ -25,12 +67,13 @@ uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
25
 
26
  if uploaded_file is not None:
27
  # Extract text from uploaded PDF
28
- pdf_text = extract_text_from_pdf(uploaded_file)
29
 
30
  st.write("PDF Uploaded Successfully.")
31
 
32
  # Text area for entering a question
33
  question = st.text_input("Ask a question about the PDF")
 
34
 
35
  if st.button("Get Answer"):
36
  if question:
 
2
  from PyPDF2 import PdfReader
3
  import io
4
 
5
+ import requests
6
+ import os
7
+
8
+ from sklearn.feature_extraction.text import TfidfVectorizer
9
+ from sklearn.metrics.pairwise import cosine_similarity
10
+ from datasets import load_dataset
11
+ from transformers import AutoTokenizer, AutoModelForCausalLM
12
+ my_token = os.getenv('my_repo_token')
13
def find_most_relevant_context(contexts, question, max_features=10000):
    """Return the context string most similar to *question* by TF-IDF cosine similarity.

    Parameters
    ----------
    contexts : list[str]
        Candidate text passages (e.g. one string per PDF page).
    question : str
        The user's question.
    max_features : int, optional
        Cap on the TF-IDF vocabulary size to bound memory use.

    Returns
    -------
    str
        The element of *contexts* with the highest cosine similarity
        to *question*.

    Raises
    ------
    ValueError
        If *contexts* is empty.
    """
    if not contexts:
        # Fail fast with a clear message instead of sklearn's cryptic
        # "empty vocabulary" error / argmax over an empty array.
        raise ValueError("contexts must be a non-empty list of strings")

    # Fit a single vectorizer over the question plus all contexts so that
    # they share one vocabulary and are comparable.
    tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
    tfidf_matrix = tfidf_vectorizer.fit_transform([question] + contexts)

    # Row 0 is the question; rows 1.. are the contexts.
    similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

    # Index (within contexts) of the highest-similarity passage.
    most_relevant_index = similarity_scores.argmax()

    return contexts[most_relevant_index]
25
+
26
+
27
+
28
+
29
+
30
+
31
+
32
+
33
# Hugging Face Inference API endpoint for Mistral-7B-Instruct-v0.2.
API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"
headers = {"Authorization": f"Bearer {my_token}"}

def query(payload, timeout=60):
    """POST *payload* to the HF Inference API and return the decoded JSON.

    Parameters
    ----------
    payload : dict
        JSON-serializable request body, e.g. ``{"inputs": "..."}``.
    timeout : float, optional
        Seconds to wait for the API before giving up. Added because
        ``requests.post`` without a timeout can block indefinitely;
        default keeps existing call sites working unchanged.

    Returns
    -------
    dict | list
        The JSON-decoded API response (may be an API error payload).
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    return response.json()

# NOTE(review): the original module-level call
#     output = query({"inputs": instruction})
# was removed — `instruction` is never defined anywhere in the file, so it
# raised NameError on import, and it also fired a network request at import
# time. Call query(...) with a concrete prompt where a result is needed.
43
+
44
+
45
  # Mock function for answering questions from the PDF
46
  # Replace this with your actual backend function
47
  def answer_question_from_pdf(pdf_text, question):
 
53
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of an uploaded PDF.

    Parameters
    ----------
    pdf_file : file-like
        Binary stream of a PDF (e.g. a Streamlit UploadedFile).

    Returns
    -------
    list[str]
        One extracted-text string per page, in page order.
    """
    pdf_reader = PdfReader(pdf_file)
    # Iterate the pages directly instead of range(len(...)); the original
    # also initialized an unused pdf_text = "" accumulator.
    # extract_text() can return None for image-only pages, so coerce those
    # to "" to keep the result a homogeneous list of strings.
    return [page.extract_text() or "" for page in pdf_reader.pages]
61
 
62
  # Streamlit app
63
  st.title("PDF Explorer")
 
67
 
68
  if uploaded_file is not None:
69
  # Extract text from uploaded PDF
70
+ pdf_arr = extract_text_from_pdf(uploaded_file)
71
 
72
  st.write("PDF Uploaded Successfully.")
73
 
74
  # Text area for entering a question
75
  question = st.text_input("Ask a question about the PDF")
76
+ pdf_text = find_most_relevant_context(pdf_arr,question)
77
 
78
  if st.button("Get Answer"):
79
  if question: