alexkueck committed on
Commit
670c350
·
verified ·
1 Parent(s): 74ac068

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -70
app.py CHANGED
@@ -214,82 +214,82 @@ def search_documents(query):
214
  all_texts.append(preprocessed_text)
215
 
216
  if not all_texts:
217
- raise ValueError("Keine gültigen Texte nach der Präprozessierung vorhanden.")
 
 
 
 
 
 
 
 
218
 
219
- #und nun entsprechend auch die Query überarbeiten
220
- prepro_query = preprocess_text(query)
221
-
222
- # TF-IDF-Vektorisierung
223
- vectorizer = TfidfVectorizer()
224
- text_tfidf = vectorizer.fit_transform(all_texts)
225
- query_tfidf = vectorizer.transform([prepro_query])
226
 
227
- # Berechnung der Ähnlichkeit
228
- similarities = cosine_similarity(query_tfidf, text_tfidf).flatten()
229
-
230
- # Sortieren nach Relevanz
231
- related_docs_indices = similarities.argsort()[::-1]
232
-
233
- results = []
234
- relevant_text = ""
235
- relevant_docs = {}
236
- num_pages_per_doc = [len(doc['pages']) for doc in documents]
237
- cumulative_pages = [sum(num_pages_per_doc[:i+1]) for i in range(len(num_pages_per_doc))]
238
-
239
- for i in related_docs_indices:
240
- if similarities[i] > 0.3:
241
- doc_index = None
242
- for idx, cumulative in enumerate(cumulative_pages):
243
- if i < cumulative:
244
- doc_index = idx
245
- break
246
- if doc_index is None:
247
- continue
248
 
249
- page_index = i if doc_index == 0 else i - cumulative_pages[doc_index-1]
250
- doc = documents[doc_index]
251
- page = doc['pages'][page_index]
252
- page_content = page['content']
253
- header_content = page.get('header', '')
254
-
255
- # Überprüfen, ob der Suchtext in der Überschrift oder im Seiteninhalt enthalten ist
256
- index_in_content = page_content.lower().find(prepro_query.lower())
257
- index_in_header = header_content.lower().find(prepro_query.lower())
258
-
259
- # Berücksichtigung der Levenshtein-Distanz
260
- # Berücksichtigung der Levenshtein-Distanz
261
- words_in_query = prepro_query.split()
262
- page_words = preprocess_text(page_content).split()
263
- header_words = preprocess_text(header_content).split()
264
-
265
- if (index_in_content != -1 or index_in_header != -1 or
266
- any(fuzz.ratio(word, page_word) > 80 for word in words_in_query for page_word in page_words) or
267
- any(fuzz.ratio(word, header_word) > 80 for word in words_in_query for header_word in header_words)):
268
-
269
-
270
- # Erstellen Sie einen Snippet für die Suchergebnisse
271
- start = max(0, index_in_content - 400) if index_in_content != -1 else 0
272
- end = min(len(page_content), index_in_content + 400) if index_in_content != -1 else len(page_content)
273
- snippet = f"Aus <span class='doc-name'>{doc['file']}</span> (Seite <span class='page-number'>{page_index + 1}</span>):<br>"
274
-
275
- # Fügen Sie die Überschrift hinzu, falls vorhanden
276
- if header_content:
277
- snippet += f"<span style='color: #0EDC0E; font-weight: bold;'>Überschrift: {header_content}</span> <br>"
278
- snippet += f"{remove_line_breaks(page_content[start:end])}<br><hr>"
 
279
 
280
- relevant_text += snippet
281
-
282
- if doc['file'] not in relevant_docs:
283
- relevant_docs[doc['file']] = []
284
- relevant_docs[doc['file']].append(snippet)
285
-
286
- # Sortieren nach Relevanz
287
- results = sorted(results, key=lambda x: x[1], reverse=True)
288
- results = [res[0] for res in results]
 
 
 
 
 
 
 
289
 
290
- results = list(relevant_docs.keys())
291
- return results, relevant_text
 
292
 
 
 
 
 
 
 
 
293
 
294
 
295
 
 
214
  all_texts.append(preprocessed_text)
215
 
216
  if not all_texts:
217
+ return "", ""
218
+ else:
219
+ #und nun entsprechend auch die Query überarbeiten
220
+ prepro_query = preprocess_text(query)
221
+
222
+ # TF-IDF-Vektorisierung
223
+ vectorizer = TfidfVectorizer()
224
+ text_tfidf = vectorizer.fit_transform(all_texts)
225
+ query_tfidf = vectorizer.transform([prepro_query])
226
 
227
+ # Berechnung der Ähnlichkeit
228
+ similarities = cosine_similarity(query_tfidf, text_tfidf).flatten()
 
 
 
 
 
229
 
230
+ # Sortieren nach Relevanz
231
+ related_docs_indices = similarities.argsort()[::-1]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
 
233
+ results = []
234
+ relevant_text = ""
235
+ relevant_docs = {}
236
+ num_pages_per_doc = [len(doc['pages']) for doc in documents]
237
+ cumulative_pages = [sum(num_pages_per_doc[:i+1]) for i in range(len(num_pages_per_doc))]
238
+
239
+ for i in related_docs_indices:
240
+ if similarities[i] > 0.3:
241
+ doc_index = None
242
+ for idx, cumulative in enumerate(cumulative_pages):
243
+ if i < cumulative:
244
+ doc_index = idx
245
+ break
246
+ if doc_index is None:
247
+ continue
248
+
249
+ page_index = i if doc_index == 0 else i - cumulative_pages[doc_index-1]
250
+ doc = documents[doc_index]
251
+ page = doc['pages'][page_index]
252
+ page_content = page['content']
253
+ header_content = page.get('header', '')
254
+
255
+ # Überprüfen, ob der Suchtext in der Überschrift oder im Seiteninhalt enthalten ist
256
+ index_in_content = page_content.lower().find(prepro_query.lower())
257
+ index_in_header = header_content.lower().find(prepro_query.lower())
258
+
259
+ # Berücksichtigung der Levenshtein-Distanz
260
+ # Berücksichtigung der Levenshtein-Distanz
261
+ words_in_query = prepro_query.split()
262
+ page_words = preprocess_text(page_content).split()
263
+ header_words = preprocess_text(header_content).split()
264
 
265
+ if (index_in_content != -1 or index_in_header != -1 or
266
+ any(fuzz.ratio(word, page_word) > 80 for word in words_in_query for page_word in page_words) or
267
+ any(fuzz.ratio(word, header_word) > 80 for word in words_in_query for header_word in header_words)):
268
+
269
+
270
+ # Erstellen Sie einen Snippet für die Suchergebnisse
271
+ start = max(0, index_in_content - 400) if index_in_content != -1 else 0
272
+ end = min(len(page_content), index_in_content + 400) if index_in_content != -1 else len(page_content)
273
+ snippet = f"Aus <span class='doc-name'>{doc['file']}</span> (Seite <span class='page-number'>{page_index + 1}</span>):<br>"
274
+
275
+ # Fügen Sie die Überschrift hinzu, falls vorhanden
276
+ if header_content:
277
+ snippet += f"<span style='color: #0EDC0E; font-weight: bold;'>Überschrift: {header_content}</span> <br>"
278
+ snippet += f"{remove_line_breaks(page_content[start:end])}<br><hr>"
279
+
280
+ relevant_text += snippet
281
 
282
+ if doc['file'] not in relevant_docs:
283
+ relevant_docs[doc['file']] = []
284
+ relevant_docs[doc['file']].append(snippet)
285
 
286
+ # Sortieren nach Relevanz
287
+ results = sorted(results, key=lambda x: x[1], reverse=True)
288
+ results = [res[0] for res in results]
289
+
290
+ results = list(relevant_docs.keys())
291
+ return results, relevant_text
292
+
293
 
294
 
295