Update app.py
Browse files
app.py
CHANGED
@@ -140,43 +140,43 @@ def search_documents(query):
|
|
140 |
similarities = cosine_similarity(query_embedding, text_embeddings).flatten()
|
141 |
|
142 |
# Sortieren nach Relevanz
|
143 |
-
|
144 |
-
results = [documents[i]['file'] for i in sorted_indices if similarities[i] > 0]
|
145 |
-
|
|
|
146 |
relevant_text = ""
|
147 |
relevant_docs = {}
|
148 |
num_pages_per_doc = [len(doc['pages']) for doc in documents]
|
149 |
cumulative_pages = [sum(num_pages_per_doc[:i+1]) for i in range(len(num_pages_per_doc))]
|
150 |
|
151 |
for i in related_docs_indices:
|
152 |
-
if
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
header_content = page.get('header', '')
|
159 |
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
|
174 |
-
|
175 |
-
|
176 |
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
|
181 |
results = list(relevant_docs.keys())
|
182 |
return results, relevant_text
|
|
|
140 |
similarities = cosine_similarity(query_embedding, text_embeddings).flatten()
|
141 |
|
142 |
# Sortieren nach Relevanz
|
143 |
+
related_docs_indices = similarities.argsort()[::-1]
|
144 |
+
#results = [documents[i]['file'] for i in sorted_indices if similarities[i] > 0]
|
145 |
+
|
146 |
+
results=[]
|
147 |
relevant_text = ""
|
148 |
relevant_docs = {}
|
149 |
num_pages_per_doc = [len(doc['pages']) for doc in documents]
|
150 |
cumulative_pages = [sum(num_pages_per_doc[:i+1]) for i in range(len(num_pages_per_doc))]
|
151 |
|
152 |
for i in related_docs_indices:
|
153 |
+
doc_index = next(idx for idx, cumulative in enumerate(cumulative_pages) if i < cumulative)
|
154 |
+
page_index = i if doc_index == 0 else i - cumulative_pages[doc_index-1]
|
155 |
+
doc = documents[doc_index]
|
156 |
+
page = doc['pages'][page_index]
|
157 |
+
page_content = page['content']
|
158 |
+
header_content = page.get('header', '')
|
|
|
159 |
|
160 |
+
# Überprüfen, ob der Suchtext in der Überschrift oder im Seiteninhalt enthalten ist
|
161 |
+
index_in_content = page_content.lower().find(query.lower())
|
162 |
+
index_in_header = header_content.lower().find(query.lower())
|
163 |
|
164 |
+
if index_in_content != -1 or index_in_header != -1:
|
165 |
+
# Erstellen Sie einen Snippet für die Suchergebnisse
|
166 |
+
start = max(0, index_in_content - 400) if index_in_content != -1 else 0
|
167 |
+
end = min(len(page_content), index_in_content + 400) if index_in_content != -1 else len(page_content)
|
168 |
+
snippet = f"Aus <span class='doc-name'>{doc['file']}</span> (Seite <span class='page-number'>{page_index + 1}</span>):<br>"
|
169 |
|
170 |
+
# Fügen Sie die Überschrift hinzu, falls vorhanden
|
171 |
+
if header_content:
|
172 |
+
snippet += f"<b>Überschrift:</b> {header_content}<br>"
|
173 |
|
174 |
+
snippet += f"...{page_content[start:end]}...<br><br>"
|
175 |
+
relevant_text += snippet
|
176 |
|
177 |
+
if doc['file'] not in relevant_docs:
|
178 |
+
relevant_docs[doc['file']] = []
|
179 |
+
relevant_docs[doc['file']].append(snippet)
|
180 |
|
181 |
results = list(relevant_docs.keys())
|
182 |
return results, relevant_text
|