ivyblossom committed on
Commit
0f897d9
1 Parent(s): 7723d60

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -3
app.py CHANGED
@@ -2,9 +2,10 @@ import os
2
  import fitz # PyMuPDF for parsing PDF
3
  import streamlit as st
4
  from sentence_transformers import SentenceTransformer, util
 
5
 
6
  # Load a pre-trained SentenceTransformer model
7
- model_name = "paraphrase-MiniLM-L6-v2"
8
  model = SentenceTransformer(model_name)
9
 
10
  # Function to extract text from a PDF file
@@ -17,8 +18,14 @@ def extract_text_from_pdf(pdf_path):
17
  text += page_text
18
  yield page_num + 1, page_text # Return the page number (1-based) and the extracted text
19
 
 
 
 
 
 
 
20
  # Function to perform semantic search
21
- def semantic_search(query, documents, top_k=5):
22
  query_embedding = model.encode(query, convert_to_tensor=True)
23
 
24
  # Convert the list of documents to embeddings
@@ -30,7 +37,9 @@ def semantic_search(query, documents, top_k=5):
30
  # Sort the results in decreasing order
31
  results = []
32
  for idx in range(len(cosine_scores)):
33
- results.append((documents[idx][0], documents[idx][1], cosine_scores[idx].item()))
 
 
34
  results = sorted(results, key=lambda x: x[2], reverse=True)
35
 
36
  return results[:top_k]
 
2
  import fitz # PyMuPDF for parsing PDF
3
  import streamlit as st
4
  from sentence_transformers import SentenceTransformer, util
5
+ import re
6
 
7
  # Load a pre-trained SentenceTransformer model
8
+ model_name = "paraphrase-MiniLM-L6-v2"
9
  model = SentenceTransformer(model_name)
10
 
11
  # Function to extract text from a PDF file
 
18
  text += page_text
19
  yield page_num + 1, page_text # Return the page number (1-based) and the extracted text
20
 
21
# Function to truncate text to the nearest word boundary
def truncate_to_word_boundary(text, max_words=500):
    """Truncate *text* to at most *max_words* words, cutting on word boundaries.

    Splits on whitespace so punctuation stays attached to its word
    (the previous ``re.findall(r'\w+', ...)`` approach stripped all
    punctuation, e.g. "don't" became "don t").

    Args:
        text: The input string to truncate.
        max_words: Maximum number of words to keep (default 500).

    Returns:
        A string containing the first ``max_words`` whitespace-delimited
        words of ``text``, joined by single spaces. Runs of whitespace in
        the input are collapsed; an empty input yields "".
    """
    words = text.split()
    return ' '.join(words[:max_words])
26
+
27
  # Function to perform semantic search
28
+ def semantic_search(query, documents, top_k=5, max_words=500):
29
  query_embedding = model.encode(query, convert_to_tensor=True)
30
 
31
  # Convert the list of documents to embeddings
 
37
  # Sort the results in decreasing order
38
  results = []
39
  for idx in range(len(cosine_scores)):
40
+ page_num, text = documents[idx]
41
+ truncated_text = truncate_to_word_boundary(text, max_words)
42
+ results.append((page_num, truncated_text, cosine_scores[idx].item()))
43
  results = sorted(results, key=lambda x: x[2], reverse=True)
44
 
45
  return results[:top_k]