Spaces:
Running
Running
ivyblossom
committed on
Commit
•
0f897d9
1
Parent(s):
7723d60
Update app.py
Browse files
app.py
CHANGED
@@ -2,9 +2,10 @@ import os
|
|
2 |
import fitz # PyMuPDF for parsing PDF
|
3 |
import streamlit as st
|
4 |
from sentence_transformers import SentenceTransformer, util
|
|
|
5 |
|
6 |
# Load a pre-trained SentenceTransformer model
|
7 |
-
model_name = "paraphrase-MiniLM-L6-v2"
|
8 |
model = SentenceTransformer(model_name)
|
9 |
|
10 |
# Function to extract text from a PDF file
|
@@ -17,8 +18,14 @@ def extract_text_from_pdf(pdf_path):
|
|
17 |
text += page_text
|
18 |
yield page_num + 1, page_text # Return the page number (1-based) and the extracted text
|
19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
# Function to perform semantic search
|
21 |
-
def semantic_search(query, documents, top_k=5):
|
22 |
query_embedding = model.encode(query, convert_to_tensor=True)
|
23 |
|
24 |
# Convert the list of documents to embeddings
|
@@ -30,7 +37,9 @@ def semantic_search(query, documents, top_k=5):
|
|
30 |
# Sort the results in decreasing order
|
31 |
results = []
|
32 |
for idx in range(len(cosine_scores)):
|
33 |
-
|
|
|
|
|
34 |
results = sorted(results, key=lambda x: x[2], reverse=True)
|
35 |
|
36 |
return results[:top_k]
|
|
|
2 |
import fitz # PyMuPDF for parsing PDF
|
3 |
import streamlit as st
|
4 |
from sentence_transformers import SentenceTransformer, util
|
5 |
+
import re
|
6 |
|
7 |
# Load a pre-trained SentenceTransformer model
|
8 |
+
model_name = "paraphrase-MiniLM-L6-v2"
|
9 |
model = SentenceTransformer(model_name)
|
10 |
|
11 |
# Function to extract text from a PDF file
|
|
|
18 |
text += page_text
|
19 |
yield page_num + 1, page_text # Return the page number (1-based) and the extracted text
|
20 |
|
21 |
+
# Function to truncate text to the nearest word boundary
def truncate_to_word_boundary(text, max_words=500):
    """Truncate text after at most max_words whitespace-separated words.

    Unlike a naive character slice, this never cuts a word in half.
    Words keep their punctuation: the previous regex-word approach
    (re.findall of word characters) stripped commas, periods, and other
    non-word characters from the output, corrupting the returned text.

    Args:
        text: The input string to truncate. Runs of whitespace are
            collapsed to single spaces in the result (same as before).
        max_words: Maximum number of words to keep (default 500).

    Returns:
        The first max_words words of text, joined by single spaces.
        Empty string when text is empty or all whitespace.
    """
    # str.split() with no argument splits on any whitespace and drops
    # empty strings, so leading/trailing/multiple spaces are handled.
    words = text.split()
    return ' '.join(words[:max_words])
|
26 |
+
|
27 |
# Function to perform semantic search
|
28 |
+
def semantic_search(query, documents, top_k=5, max_words=500):
|
29 |
query_embedding = model.encode(query, convert_to_tensor=True)
|
30 |
|
31 |
# Convert the list of documents to embeddings
|
|
|
37 |
# Sort the results in decreasing order
|
38 |
results = []
|
39 |
for idx in range(len(cosine_scores)):
|
40 |
+
page_num, text = documents[idx]
|
41 |
+
truncated_text = truncate_to_word_boundary(text, max_words)
|
42 |
+
results.append((page_num, truncated_text, cosine_scores[idx].item()))
|
43 |
results = sorted(results, key=lambda x: x[2], reverse=True)
|
44 |
|
45 |
return results[:top_k]
|