alexkueck commited on
Commit
f985930
·
verified ·
1 Parent(s): face740

Create app2.py

Browse files
Files changed (1) hide show
  1. app2.py +981 -0
app2.py ADDED
@@ -0,0 +1,981 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import fitz # PyMuPDF
3
+ import os
4
+ import requests
5
+ from huggingface_hub import HfApi
6
+ import base64
7
+ from io import BytesIO
8
+ import urllib.parse
9
+ import tempfile
10
+ from sklearn.feature_extraction.text import TfidfVectorizer
11
+ from sklearn.metrics.pairwise import cosine_similarity
12
+
13
+
14
# Hugging Face access tokens, read from environment variables (Space secrets)
HF_WRITE = os.environ.get("HF_WRITE")
HF_READ = os.environ.get("HF_READ")

# Constants: target repository, its type, and the folder holding the PDFs
REPO_ID, REPO_TYPE = "alexkueck/kkg_suche", "space"
SAVE_DIR = "kkg_dokumente"

# Shared Hugging Face Hub API client
api = HfApi()
25
+
26
+
27
+ # Funktion zum Extrahieren des Textes aus einer PDF-Datei
28
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, in page order.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A list with one string per page (the result of ``page.get_text()``).
    """
    # The context manager closes the document once all pages have been read,
    # so the file handle is not left open for the lifetime of the process.
    with fitz.open(pdf_path) as doc:
        return [page.get_text() for page in doc]
34
+
35
# Collect every PDF found in SAVE_DIR together with the text of each page
documents = []
for file_name in os.listdir(SAVE_DIR):
    if not file_name.endswith(".pdf"):
        continue
    pdf_path = os.path.join(SAVE_DIR, file_name)
    pages_text = extract_text_from_pdf(pdf_path)
    documents.append({"file": file_name, "pages": pages_text})

# Fit the TF-IDF vectorizer on all pages of all documents (one row per page)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(
    [page_text for entry in documents for page_text in entry['pages']]
)
46
+ ####################################################
47
+
48
def search_documents(query):
    """Rank the indexed PDF pages against a query via TF-IDF cosine similarity.

    Args:
        query: Search string. A falsy value (e.g. ``""``) lists all documents.

    Returns:
        A tuple ``(results, relevant_text, relevant_pdfs)``:

        * ``results``: file name of every page whose similarity is above
          zero, best match first (a file appears once per matching page).
        * ``relevant_text``: concatenated excerpts from the pages that also
          contain the query verbatim (case-insensitive).
        * ``relevant_pdfs``: ``(file name, zero-based page index)`` for those
          same pages.
    """
    if not query:
        return [entry['file'] for entry in documents], "", []

    scores = cosine_similarity(vectorizer.transform([query]), tfidf_matrix).flatten()
    ranking = scores.argsort()[::-1]

    # page_totals[k] is the total number of pages in documents[0..k]; it maps
    # a flat row of the TF-IDF matrix back to (document, page).
    page_totals = []
    running_total = 0
    for entry in documents:
        running_total += len(entry['pages'])
        page_totals.append(running_total)

    results = []
    excerpts = []
    relevant_pdfs = []
    needle = query.lower()

    for row in ranking:
        if scores[row] > 0:
            doc_index = next(pos for pos, total in enumerate(page_totals) if row < total)
            if doc_index == 0:
                page_index = row
            else:
                page_index = row - page_totals[doc_index - 1]

            doc = documents[doc_index]
            results.append(doc['file'])

            page_content = doc['pages'][page_index]
            hit = page_content.lower().find(needle)
            if hit != -1:
                # Excerpt window: 400 characters before and after the match start
                start = max(0, hit - 400)
                end = min(len(page_content), hit + 400)
                excerpts.append(
                    f"Aus {doc['file']} (Seite {page_index + 1}):\n...{page_content[start:end]}...\n\n"
                )
                relevant_pdfs.append((doc['file'], page_index))

    relevant_text = "".join(excerpts)
    return results, relevant_text, relevant_pdfs
76
+
77
+
78
def update_display(selected_pdf):
    """Render the selected PDF by delegating to ``display_pdf``.

    This used to call ``display_document``, a name that only exists inside
    the disabled string blocks of this file, so every call raised
    ``NameError``.

    Args:
        selected_pdf: File name of the PDF chosen in the dropdown.

    Returns:
        Whatever ``display_pdf`` returns for ``selected_pdf``.
    """
    return display_pdf(selected_pdf)
80
+
81
def update_dropdown():
    """Build a dropdown update whose choices are the names from ``list_pdfs()``."""
    available_pdfs = list_pdfs()
    return gr.Dropdown.update(choices=available_pdfs)
83
+
84
def search_and_update(query):
    """Run a search and render every matching page to a PNG image.

    Args:
        query: Search string passed on to ``search_documents``.

    Returns:
        A tuple ``(images, rel_text)``: the paths of the rendered page
        images and the excerpt text produced by ``search_documents``.
    """
    _, rel_text, relevant_pdfs = search_documents(query)

    images = []
    # The images must outlive this call so the caller can read them from
    # disk, hence a directory that is not removed here.
    temp_dir = tempfile.mkdtemp()

    for pdf, page_index in relevant_pdfs:
        pdf_path = os.path.join(SAVE_DIR, pdf)
        # The page index may arrive as a NumPy integer; convert it to int
        page_num = int(page_index)
        # Close each document as soon as its page has been rendered
        with fitz.open(pdf_path) as document:
            page = document.load_page(page_num)
            pix = page.get_pixmap()
            img_path = os.path.join(temp_dir, f"{pdf}_page_{page.number}.png")
            pix.save(img_path)
        images.append(img_path)

    return images, rel_text
103
+
104
+
105
+
106
+
107
+
108
+
109
+
110
def upload_pdf(file):
    """Upload a PDF chosen in the UI to the Hugging Face Space repository.

    Args:
        file: Uploaded file object whose ``name`` attribute is the local
            path of the file, or ``None`` when nothing was uploaded.

    Returns:
        A single status string. Both branches return exactly one value
        because the click handler is wired to one output component; the
        previous ``None`` branch returned a 2-tuple.
    """
    if file is None:
        return "Keine Datei hochgeladen."

    # Extract the bare file name from the full local path
    filename = os.path.basename(file.name)

    # Upload the file into the documents folder of the Space
    upload_path = f"kkg_dokumente/{filename}"
    api.upload_file(
        path_or_fileobj=file.name,
        path_in_repo=upload_path,
        repo_id=REPO_ID,
        repo_type=REPO_TYPE,
        token=HF_WRITE,
    )
    return f"PDF '{filename}' erfolgreich hochgeladen."
127
+
128
+
129
def list_pdfs():
    """Return the names of all ``.pdf`` files in SAVE_DIR, or ``[]`` if it does not exist."""
    if os.path.exists(SAVE_DIR):
        return [entry for entry in os.listdir(SAVE_DIR) if entry.endswith('.pdf')]
    return []
133
+
134
def display_pdf(selected_pdf):
    """Download the selected PDF from the Space and render its first page.

    Args:
        selected_pdf: File name of the PDF chosen in the dropdown; may be
            ``None`` or empty when nothing is selected.

    Returns:
        A tuple ``(image_path, status)``. ``image_path`` is the path of a
        PNG of the first page, or ``None`` if nothing was selected or the
        download failed; ``status`` is a message for the status textbox.
    """
    # Guard: with no selection, os.path.join(SAVE_DIR, None) would raise
    if not selected_pdf:
        return None, "Keine PDF-Datei ausgewählt."

    pdf_path = os.path.join(SAVE_DIR, selected_pdf)

    # URL of the PDF inside the Hugging Face Space
    encoded_pdf_name = urllib.parse.quote(selected_pdf)
    pdf_url = f"https://huggingface.co/spaces/{REPO_ID}/resolve/main/kkg_dokumente/{encoded_pdf_name}"
    download_error = f"Fehler beim Herunterladen der PDF-Datei von {pdf_url}"

    # Download the PDF; a timeout keeps a stalled connection from hanging
    # the handler, and network errors are reported like a non-200 status.
    headers = {"Authorization": f"Bearer {HF_READ}"}
    try:
        response = requests.get(pdf_url, headers=headers, timeout=30)
    except requests.RequestException:
        return None, download_error
    if response.status_code != 200:
        return None, download_error

    # Make sure the target folder exists before writing the download
    os.makedirs(SAVE_DIR, exist_ok=True)
    with open(pdf_path, 'wb') as f:
        f.write(response.content)

    # Save only the first page as an image; the document is closed afterwards
    temp_dir = tempfile.mkdtemp()
    img_path = os.path.join(temp_dir, "page_0.png")
    with fitz.open(pdf_path) as document:
        page = document.load_page(0)
        pix = page.get_pixmap()
        pix.save(img_path)

    status = f"PDF '{selected_pdf}' erfolgreich geladen und verarbeitet."

    return img_path, status
163
+
164
+ ##############################################################
165
# Gradio UI definition: three tabs (upload, PDF selection/display, search),
# followed by the app launch.
# NOTE(review): indentation is not preserved in this view, so the nesting of
# the `with` blocks below must be confirmed against the original file.
+ with gr.Blocks() as demo:
165
# Tab 1: the "Upload" button passes the chosen file to upload_pdf and writes
# its return value into the status textbox.
+ with gr.Tab("Upload PDF"):
166
+ upload_pdf_file = gr.File(label="PDF-Datei hochladen")
167
+ upload_status = gr.Textbox(label="Status")
168
+ upload_button = gr.Button("Upload")
169
+ upload_button.click(upload_pdf, inputs=upload_pdf_file, outputs=upload_status)
170
+
171
# Tab 2: the "Anzeigen" button passes the dropdown selection to display_pdf,
# which fills the image and the status textbox. `query` and `relevant_text`
# are created here but not wired to any event in the visible code.
+ with gr.Tab("PDF Auswahl und Anzeige"):
172
+ pdf_dropdown = gr.Dropdown(label="Wählen Sie eine PDF-Datei", choices=list_pdfs())
173
+ query = gr.Textbox(label="Suchanfrage", type="text")
174
+ display_status = gr.Textbox(label="Status")
175
+ display_button = gr.Button("Anzeigen")
176
+
177
+ with gr.Row():
178
+ pdf_image = gr.Image(label="PDF-Seite als Bild", type="filepath")
179
+ relevant_text = gr.Textbox(label="Relevanter Text", lines=10)
180
+
181
+ display_button.click(display_pdf, inputs=[pdf_dropdown], outputs=[pdf_image, display_status])
182
+
183
+
184
# Tab 3: the "Suchen" button passes the query to search_and_update, whose two
# return values fill the gallery and the text box.
+ with gr.Tab("Suche"):
185
+ search_query = gr.Textbox(label="Suchanfrage")
186
+ search_button = gr.Button("Suchen")
187
+
188
+ with gr.Row():
189
+ search_results = gr.Gallery(label="Relevante PDFs", type="filepath")
190
+ search_text = gr.Textbox(label="Relevanter Text", lines=10)
191
+
192
+ search_button.click(search_and_update, inputs=search_query, outputs=[search_results, search_text])
193
+
194
# The German comment below reads: "Automatic refresh of the dropdown list
# after uploading a PDF file". Both refresh handlers are commented out, so
# the dropdown choices are only computed once, when the UI is built.
+ # Automatische Aktualisierung der Dropdown-Liste nach dem Hochladen einer PDF-Datei
195
+ #upload_button.click(update_dropdown, inputs=None, outputs=pdf_dropdown)
196
+ #upload_button.click(lambda: pdf_dropdown.update(choices=list_pdfs()), outputs=pdf_dropdown)
197
+
198
# Start the app; share=True presumably requests a public share link - confirm
# against the Gradio version in use.
+ demo.launch(share=True)
200
+
201
+
202
+
203
+
204
+ """
205
+ import gradio as gr
206
+ import os
207
+ from huggingface_hub import HfApi
208
+ import time
209
+
210
+ # Zugriff auf das Secret als Umgebungsvariable
211
+ HF_TOKEN = os.getenv("HF_WRITE")
212
+
213
+ # Überprüfen, ob das Secret geladen wurde
214
+ if HF_TOKEN is None:
215
+ raise ValueError("HF_TOKEN environment variable not set. Please set the secret in your Hugging Face Space.")
216
+
217
+ # Repository-Name und Typ
218
+ repo_id = "alexkueck/kkg_suche"
219
+ repo_type = "space"
220
+
221
+ # HfApi-Instanz erstellen
222
+ api = HfApi()
223
+
224
+ def upload_and_display_pdf(file):
225
+ if file is None:
226
+ return None, "Keine Datei hochgeladen."
227
+
228
+ # Extrahieren des Dateinamens aus dem vollen Pfad
229
+ filename = os.path.basename(file.name)
230
+
231
+ # Datei zum Hugging Face Space hochladen
232
+ upload_path = f"kkg_dokumente/{filename}"
233
+ api.upload_file(
234
+ path_or_fileobj=file.name,
235
+ path_in_repo=upload_path,
236
+ repo_id=repo_id,
237
+ repo_type=repo_type,
238
+ token=HF_TOKEN
239
+ )
240
+
241
+ # Kurze Verzögerung, um sicherzustellen, dass die Datei verfügbar ist
242
+ time.sleep(2)
243
+
244
+ # URL zur hochgeladenen PDF-Datei erstellen
245
+ pdf_url = f"https://huggingface.co/spaces/{repo_id}/resolve/main/{upload_path}"
246
+
247
+ # HTML mit eingebettetem PDF erstellen
248
+ html_content = f
249
+ <div style="width:100%; height:600px;">
250
+ <object data="{pdf_url}" type="application/pdf" width="100%" height="100%">
251
+ <p>Es sieht so aus, als ob Ihr Browser keine eingebetteten PDFs unterstützt.
252
+ Sie können stattdessen <a href="{pdf_url}">hier klicken, um die PDF-Datei herunterzuladen</a>.</p>
253
+ </object>
254
+ </div>
255
+
256
+
257
+ return html_content, f"Datei '{filename}' erfolgreich hochgeladen und im Space gespeichert."
258
+
259
+ # Gradio Interface erstellen
260
+ iface = gr.Interface(
261
+ fn=upload_and_display_pdf,
262
+ inputs=gr.File(label="PDF-Datei hochladen"),
263
+ outputs=[
264
+ gr.HTML(label="PDF-Anzeige"),
265
+ gr.Textbox(label="Status")
266
+ ],
267
+ title="PDF Upload und Anzeige",
268
+ description="Laden Sie eine PDF-Datei hoch. Sie wird im 'kkg_dokumente' Ordner des Spaces gespeichert und hier angezeigt."
269
+ )
270
+
271
+ # App starten
272
+ iface.launch()
273
+ """
274
+
275
+
276
+
277
+ #funktionierenden upload
278
+ """
279
+ import gradio as gr
280
+ import os
281
+ import fitz # PyMuPDF
282
+ import tempfile
283
+ from huggingface_hub import HfApi
284
+ import shutil
285
+
286
+ # Zugriff auf das Secret als Umgebungsvariable
287
+ HF_TOKEN = os.getenv("HF_WRITE")
288
+
289
+ # Überprüfen, ob das Secret geladen wurde
290
+ if HF_TOKEN is None:
291
+ raise ValueError("HF_TOKEN environment variable not set. Please set the secret in your Hugging Face Space.")
292
+
293
+ # Repository-Name
294
+ repo_id = "alexkueck/kkg_suche"
295
+ repo_type = "space"
296
+
297
+ # HfApi-Instanz erstellen
298
+ api = HfApi()
299
+
300
+
301
+
302
+ def upload_and_display_pdf(file):
303
+ if file is None:
304
+ return None, "Keine Datei hochgeladen."
305
+
306
+ # Extrahieren des Dateinamens aus dem vollen Pfad
307
+ filename = os.path.basename(file.name)
308
+
309
+ # Datei zum Hugging Face Space hochladen
310
+ upload_path = f"kkg_dokumente/{filename}"
311
+ api.upload_file(
312
+ path_or_fileobj=file.name,
313
+ path_in_repo=upload_path,
314
+ repo_id=repo_id,
315
+ repo_type=repo_type,
316
+ token=HF_TOKEN
317
+ )
318
+
319
+ # PDF in HTML umwandeln
320
+ doc = fitz.open(file.name)
321
+ html_content = ""
322
+ for page in doc:
323
+ html_content += page.get_text("html")
324
+ doc.close()
325
+
326
+ # Temporäre HTML-Datei erstellen
327
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".html", mode="w", encoding="utf-8") as temp_file:
328
+ temp_file.write(html_content)
329
+ temp_html_path = temp_file.name
330
+
331
+ return temp_html_path, f"Datei '{filename}' erfolgreich hochgeladen und im Repository gespeichert."
332
+
333
+ # Gradio Interface erstellen
334
+ iface = gr.Interface(
335
+ fn=upload_and_display_pdf,
336
+ inputs=gr.File(label="PDF-Datei hochladen"),
337
+ outputs=[
338
+ gr.HTML(label="PDF-Inhalt"),
339
+ gr.Textbox(label="Status")
340
+ ],
341
+ title="PDF Upload und Anzeige",
342
+ description="Laden Sie eine PDF-Datei hoch. Sie wird im 'kkg_dokumente' Ordner des Repositories gespeichert und hier angezeigt."
343
+ )
344
+
345
+ # App starten
346
+ iface.launch()
347
+ """
348
+
349
+
350
+
351
+
352
+
353
+
354
+
355
+
356
+ """
357
+ # Zugriff auf das Secret als Umgebungsvariable
358
+ HF_TOKEN = os.getenv("HF_WRITE")
359
+
360
+ # Überprüfen, ob das Secret geladen wurde
361
+ if HF_TOKEN is None:
362
+ raise ValueError("HF_TOKEN environment variable not set. Please set the secret in your Hugging Face Space.")
363
+
364
+ # Repository-Name
365
+ repo_id = "alexkueck/kkg_suche"
366
+
367
+ # Absoluter Pfad zum Verzeichnis mit den Dokumenten
368
+ DOCS_DIR = "kkg_dokumente"
369
+
370
+ # Funktion zum Extrahieren des Textes aus einer PDF-Datei
371
+ def extract_text_from_pdf(pdf_path):
372
+ doc = fitz.open(pdf_path)
373
+ text = []
374
+ for page in doc:
375
+ text.append(page.get_text())
376
+ return text
377
+
378
+ # Dynamische Erstellung der Dokumentenliste und Extraktion der Texte
379
+ documents = []
380
+ for file_name in os.listdir(DOCS_DIR):
381
+ if file_name.endswith(".pdf"):
382
+ pdf_path = os.path.join(DOCS_DIR, file_name)
383
+ pages_text = extract_text_from_pdf(pdf_path)
384
+ documents.append({"file": file_name, "pages": pages_text})
385
+
386
+ # TF-IDF Vectorizer vorbereiten
387
+ vectorizer = TfidfVectorizer()
388
+ tfidf_matrix = vectorizer.fit_transform([page for doc in documents for page in doc['pages']])
389
+
390
+ def display_document(doc_name):
391
+ if isinstance(doc_name, list):
392
+ doc_name = doc_name[0] # Nehmen Sie das erste Element, falls eine Liste übergeben wurde
393
+
394
+ file_path = os.path.join(DOCS_DIR, doc_name)
395
+
396
+ if not os.path.exists(file_path):
397
+ return f"<p>Fehler: Datei nicht gefunden - {file_path}</p>"
398
+
399
+ # Generieren Sie die URL für das PDF
400
+ file_url = f"file://{file_path}"
401
+
402
+ return f'<iframe src="{file_url}" width="100%" height="600px"></iframe>'
403
+
404
+ def search_documents(query):
405
+ if not query:
406
+ return [doc['file'] for doc in documents], "", []
407
+
408
+ query_vector = vectorizer.transform([query])
409
+ cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
410
+ related_docs_indices = cosine_similarities.argsort()[::-1]
411
+
412
+ results = []
413
+ relevant_text = ""
414
+ relevant_pdfs = []
415
+ num_pages_per_doc = [len(doc['pages']) for doc in documents]
416
+ cumulative_pages = [sum(num_pages_per_doc[:i+1]) for i in range(len(num_pages_per_doc))]
417
+
418
+ for i in related_docs_indices:
419
+ if cosine_similarities[i] > 0:
420
+ doc_index = next(idx for idx, cumulative in enumerate(cumulative_pages) if i < cumulative)
421
+ page_index = i if doc_index == 0 else i - cumulative_pages[doc_index-1]
422
+ doc = documents[doc_index]
423
+ results.append(doc['file'])
424
+ page_content = doc['pages'][page_index]
425
+ index = page_content.lower().find(query.lower())
426
+ if index != -1:
427
+ start = max(0, index - 100)
428
+ end = min(len(page_content), index + 100)
429
+ relevant_text += f"Aus {doc['file']} (Seite {page_index + 1}):\n...{page_content[start:end]}...\n\n"
430
+ relevant_pdfs.append((doc['file'], page_index))
431
+
432
+ return results, relevant_text, relevant_pdfs
433
+
434
+ def update_display(doc_name):
435
+ return display_document(doc_name)
436
+
437
+ def search_and_update(query):
438
+ results, rel_text, relevant_pdfs = search_documents(query)
439
+
440
+ pdf_html = ""
441
+ for pdf, page in relevant_pdfs:
442
+ pdf_path = os.path.join(DOCS_DIR, pdf)
443
+
444
+ if not os.path.exists(pdf_path):
445
+ pdf_html += f"<p>Fehler: Datei nicht gefunden - {pdf_path}</p>"
446
+ else:
447
+ file_url = f"file://{pdf_path}"
448
+ pdf_html += f"<h3>{pdf} - Seite {page+1}</h3>"
449
+ pdf_html += f'<iframe src="{file_url}#page={page+1}" width="100%" height="600px"></iframe>'
450
+
451
+ return gr.update(choices=results, value=results[0] if results else None), rel_text, pdf_html
452
+
453
+ def upload_file(file):
454
+ local_file_path = file.name
455
+ target_path_in_space = f"kkg_dokumente/{file.orig_name}"
456
+
457
+ api = HfApi()
458
+ api.upload_file(
459
+ path_or_fileobj=local_file_path,
460
+ path_in_repo=target_path_in_space,
461
+ repo_id=repo_id,
462
+ token=HF_TOKEN,
463
+ repo_type="space"
464
+ )
465
+
466
+ return file.name
467
+
468
+ # Initialisieren der Gradio-Oberfläche
469
+ with gr.Blocks() as demo:
470
+ gr.Markdown("# Dokumentensuche und -anzeige")
471
+
472
+ query_input = gr.Textbox(label="Suchbegriff (leer lassen für alle Dokumente)")
473
+ file_input = gr.File(label="Dokument hochladen", file_types=[".pdf"], type="file")
474
+
475
+ with gr.Row():
476
+ with gr.Column(scale=2):
477
+ doc_dropdown = gr.Dropdown(choices=[doc['file'] for doc in documents], label="Dokumente", allow_custom_value=True)
478
+ doc_display = gr.HTML(label="Dokumentvorschau")
479
+ with gr.Column(scale=1):
480
+ relevant_text = gr.Textbox(label="Relevanter Text", lines=10)
481
+ pdf_display = gr.HTML()
482
+
483
+ query_input.submit(search_and_update, inputs=[query_input], outputs=[doc_dropdown, relevant_text, pdf_display])
484
+ doc_dropdown.change(update_display, inputs=[doc_dropdown], outputs=[doc_display])
485
+ file_input.upload(upload_file, inputs=file_input, outputs=[doc_dropdown])
486
+
487
+ demo.launch()
488
+ """
489
+
490
+
491
+
492
+
493
+ """
494
+
495
+ import gradio as gr
496
+ import os
497
+ import fitz # PyMuPDF
498
+ from sklearn.feature_extraction.text import TfidfVectorizer
499
+ from sklearn.metrics.pairwise import cosine_similarity
500
+
501
+ # Absoluter Pfad zum Verzeichnis mit den Dokumenten
502
+ DOCS_DIR = os.path.abspath("kkg_dokumente")
503
+
504
+ # Funktion zum Extrahieren des Textes aus einer PDF-Datei
505
+ def extract_text_from_pdf(pdf_path):
506
+ doc = fitz.open(pdf_path)
507
+ text = []
508
+ for page in doc:
509
+ text.append(page.get_text())
510
+ return text
511
+
512
+ # Dynamische Erstellung der Dokumentenliste und Extraktion der Texte
513
+ documents = []
514
+ for file_name in os.listdir(DOCS_DIR):
515
+ if file_name.endswith(".pdf"):
516
+ pdf_path = os.path.join(DOCS_DIR, file_name)
517
+ pages_text = extract_text_from_pdf(pdf_path)
518
+ documents.append({"file": file_name, "pages": pages_text})
519
+
520
+ # TF-IDF Vectorizer vorbereiten
521
+ vectorizer = TfidfVectorizer()
522
+ tfidf_matrix = vectorizer.fit_transform([page for doc in documents for page in doc['pages']])
523
+
524
+ def display_document(doc_name):
525
+ if isinstance(doc_name, list):
526
+ doc_name = doc_name[0] # Nehmen Sie das erste Element, falls eine Liste übergeben wurde
527
+
528
+ file_path = os.path.join(DOCS_DIR, doc_name)
529
+
530
+ if not os.path.exists(file_path):
531
+ return f"<p>Fehler: Datei nicht gefunden - {file_path}</p>"
532
+
533
+ # Generieren Sie die URL für das PDF
534
+ file_url = f"file://{file_path}"
535
+
536
+ return f'<iframe src="{file_url}" width="100%" height="600px"></iframe>'
537
+
538
+ def search_documents(query):
539
+ if not query:
540
+ return [doc['file'] for doc in documents], "", []
541
+
542
+ query_vector = vectorizer.transform([query])
543
+ cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
544
+ related_docs_indices = cosine_similarities.argsort()[::-1]
545
+
546
+ results = []
547
+ relevant_text = ""
548
+ relevant_pdfs = []
549
+ num_pages_per_doc = [len(doc['pages']) for doc in documents]
550
+ cumulative_pages = [sum(num_pages_per_doc[:i+1]) for i in range(len(num_pages_per_doc))]
551
+
552
+ for i in related_docs_indices:
553
+ if cosine_similarities[i] > 0:
554
+ doc_index = next(idx for idx, cumulative in enumerate(cumulative_pages) if i < cumulative)
555
+ page_index = i if doc_index == 0 else i - cumulative_pages[doc_index-1]
556
+ doc = documents[doc_index]
557
+ results.append(doc['file'])
558
+ page_content = doc['pages'][page_index]
559
+ index = page_content.lower().find(query.lower())
560
+ if index != -1:
561
+ start = max(0, index - 100)
562
+ end = min(len(page_content), index + 100)
563
+ relevant_text += f"Aus {doc['file']} (Seite {page_index + 1}):\n...{page_content[start:end]}...\n\n"
564
+ relevant_pdfs.append((doc['file'], page_index))
565
+
566
+ return results, relevant_text, relevant_pdfs
567
+
568
+ def update_display(doc_name):
569
+ return display_document(doc_name)
570
+
571
+ def search_and_update(query):
572
+ results, rel_text, relevant_pdfs = search_documents(query)
573
+
574
+ pdf_html = ""
575
+ for pdf, page in relevant_pdfs:
576
+ pdf_path = os.path.join(DOCS_DIR, pdf)
577
+
578
+ if not os.path.exists(pdf_path):
579
+ pdf_html += f"<p>Fehler: Datei nicht gefunden - {pdf_path}</p>"
580
+ else:
581
+ file_url = f"file://{pdf_path}"
582
+ pdf_html += f"<h3>{pdf} - Seite {page+1}</h3>"
583
+ pdf_html += f'<iframe src="{file_url}#page={page+1}" width="100%" height="600px"></iframe>'
584
+
585
+ return gr.update(choices=results, value=results[0] if results else None), rel_text, pdf_html
586
+
587
+ def upload_file(file):
588
+ file_name = "uploaded_file.pdf"
589
+ file_path = os.path.join(DOCS_DIR, file_name)
590
+
591
+ # Debugging-Ausgabe: Überprüfen Sie, ob das Verzeichnis existiert
592
+ if not os.path.exists(DOCS_DIR):
593
+ print(f"Verzeichnis {DOCS_DIR} existiert nicht. Erstelle Verzeichnis.")
594
+ os.makedirs(DOCS_DIR)
595
+
596
+ # Debugging-Ausgabe: Dateiname und Pfad
597
+ print(f"Speichere Datei nach {file_path}")
598
+
599
+ with open(file_path, "wb") as f:
600
+ f.write(file)
601
+
602
+ # Überprüfen, ob die Datei korrekt gespeichert wurde
603
+ if os.path.exists(file_path):
604
+ print(f"Datei erfolgreich gespeichert: {file_path}")
605
+ else:
606
+ print(f"Fehler beim Speichern der Datei: {file_path}")
607
+
608
+ # Aktualisieren Sie die Dokumentenliste und die TF-IDF-Matrix
609
+ pages_text = extract_text_from_pdf(file_path)
610
+ documents.append({"file": file_name, "pages": pages_text})
611
+
612
+ global tfidf_matrix
613
+ tfidf_matrix = vectorizer.fit_transform([page for doc in documents for page in doc['pages']])
614
+
615
+ return gr.update(choices=[doc['file'] for doc in documents], value=file_name)
616
+
617
+ # Initialisieren der Gradio-Oberfläche
618
+ with gr.Blocks() as demo:
619
+ gr.Markdown("# Dokumentensuche und -anzeige")
620
+
621
+ query_input = gr.Textbox(label="Suchbegriff (leer lassen für alle Dokumente)")
622
+ file_input = gr.File(label="Dokument hochladen", file_types=[".pdf"], type="binary")
623
+
624
+ with gr.Row():
625
+ with gr.Column(scale=2):
626
+ doc_dropdown = gr.Dropdown(choices=[doc['file'] for doc in documents], label="Dokumente", allow_custom_value=True)
627
+ doc_display = gr.HTML(label="Dokumentvorschau")
628
+ with gr.Column(scale=1):
629
+ relevant_text = gr.Textbox(label="Relevanter Text", lines=10)
630
+ pdf_display = gr.HTML()
631
+
632
+ query_input.submit(search_and_update, inputs=[query_input], outputs=[doc_dropdown, relevant_text, pdf_display])
633
+ doc_dropdown.change(update_display, inputs=[doc_dropdown], outputs=[doc_display])
634
+ file_input.upload(upload_file, inputs=file_input, outputs=[doc_dropdown])
635
+
636
+ demo.launch()
637
+
638
+ """
639
+
640
+
641
+
642
+
643
+
644
+
645
+ """
646
+ import gradio as gr
647
+ import os
648
+ import fitz # PyMuPDF
649
+ from sklearn.feature_extraction.text import TfidfVectorizer
650
+ from sklearn.metrics.pairwise import cosine_similarity
651
+
652
+ # Absoluter Pfad zum Verzeichnis mit den Dokumenten
653
+ DOCS_DIR = os.path.abspath("kkg_dokumente")
654
+
655
+ # Funktion zum Extrahieren des Textes aus einer PDF-Datei
656
+ def extract_text_from_pdf(pdf_path):
657
+ doc = fitz.open(pdf_path)
658
+ text = []
659
+ for page in doc:
660
+ text.append(page.get_text())
661
+ return text
662
+
663
+ # Dynamische Erstellung der Dokumentenliste und Extraktion der Texte
664
+ documents = []
665
+ for file_name in os.listdir(DOCS_DIR):
666
+ if file_name.endswith(".pdf"):
667
+ pdf_path = os.path.join(DOCS_DIR, file_name)
668
+ pages_text = extract_text_from_pdf(pdf_path)
669
+ documents.append({"file": file_name, "pages": pages_text})
670
+
671
+ # TF-IDF Vectorizer vorbereiten
672
+ vectorizer = TfidfVectorizer()
673
+ tfidf_matrix = vectorizer.fit_transform([page for doc in documents for page in doc['pages']])
674
+
675
+ def display_document(doc_name):
676
+ if isinstance(doc_name, list):
677
+ doc_name = doc_name[0] # Nehmen Sie das erste Element, falls eine Liste übergeben wurde
678
+
679
+ file_path = os.path.join(DOCS_DIR, doc_name)
680
+
681
+ if not os.path.exists(file_path):
682
+ return f"<p>Fehler: Datei nicht gefunden - {file_path}</p>"
683
+
684
+ # Generieren Sie die URL für das PDF
685
+ file_url = f"file://{file_path}"
686
+
687
+ return f'<iframe src="{file_url}" width="100%" height="600px"></iframe>'
688
+
689
+ def search_documents(query):
690
+ if not query:
691
+ return [doc['file'] for doc in documents], "", []
692
+
693
+ query_vector = vectorizer.transform([query])
694
+ cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
695
+ related_docs_indices = cosine_similarities.argsort()[::-1]
696
+
697
+ results = []
698
+ relevant_text = ""
699
+ relevant_pdfs = []
700
+ num_pages_per_doc = [len(doc['pages']) for doc in documents]
701
+ cumulative_pages = [sum(num_pages_per_doc[:i+1]) for i in range(len(num_pages_per_doc))]
702
+
703
+ for i in related_docs_indices:
704
+ if cosine_similarities[i] > 0:
705
+ doc_index = next(idx for idx, cumulative in enumerate(cumulative_pages) if i < cumulative)
706
+ page_index = i if doc_index == 0 else i - cumulative_pages[doc_index-1]
707
+ doc = documents[doc_index]
708
+ results.append(doc['file'])
709
+ page_content = doc['pages'][page_index]
710
+ index = page_content.lower().find(query.lower())
711
+ if index != -1:
712
+ start = max(0, index - 100)
713
+ end = min(len(page_content), index + 100)
714
+ relevant_text += f"Aus {doc['file']} (Seite {page_index + 1}):\n...{page_content[start:end]}...\n\n"
715
+ relevant_pdfs.append((doc['file'], page_index))
716
+
717
+ return results, relevant_text, relevant_pdfs
718
+
719
+ def update_display(doc_name):
720
+ return display_document(doc_name)
721
+
722
+ def search_and_update(query):
723
+ results, rel_text, relevant_pdfs = search_documents(query)
724
+
725
+ pdf_html = ""
726
+ for pdf, page in relevant_pdfs:
727
+ pdf_path = os.path.join(DOCS_DIR, pdf)
728
+
729
+ if not os.path.exists(pdf_path):
730
+ pdf_html += f"<p>Fehler: Datei nicht gefunden - {pdf_path}</p>"
731
+ else:
732
+ file_url = f"file://{pdf_path}"
733
+ pdf_html += f"<h3>{pdf} - Seite {page+1}</h3>"
734
+ pdf_html += f'<iframe src="{file_url}#page={page+1}" width="100%" height="600px"></iframe>'
735
+
736
+ return gr.update(choices=results, value=results[0] if results else None), rel_text, pdf_html
737
+
738
+ def upload_file(file):
739
+ file_path = os.path.join(DOCS_DIR, file.name)
740
+ with open(file_path, "wb") as f:
741
+ f.write(file.read())
742
+
743
+ # Aktualisieren Sie die Dokumentenliste und die TF-IDF-Matrix
744
+ pages_text = extract_text_from_pdf(file_path)
745
+ documents.append({"file": file.name, "pages": pages_text})
746
+
747
+ global tfidf_matrix
748
+ tfidf_matrix = vectorizer.fit_transform([page for doc in documents for page in doc['pages']])
749
+
750
+ return gr.update(choices=[doc['file'] for doc in documents], value=file.name)
751
+
752
+ # Initialisieren der Gradio-Oberfläche
753
+ with gr.Blocks() as demo:
754
+ gr.Markdown("# Dokumentensuche und -anzeige")
755
+
756
+ query_input = gr.Textbox(label="Suchbegriff (leer lassen für alle Dokumente)")
757
+ file_input = gr.File(label="Dokument hochladen", file_types=[".pdf"], type="binary")
758
+
759
+ with gr.Row():
760
+ with gr.Column(scale=2):
761
+ doc_dropdown = gr.Dropdown(choices=[doc['file'] for doc in documents], label="Dokumente", allow_custom_value=True)
762
+ doc_display = gr.HTML(label="Dokumentvorschau")
763
+ with gr.Column(scale=1):
764
+ relevant_text = gr.Textbox(label="Relevanter Text", lines=10)
765
+ pdf_display = gr.HTML()
766
+
767
+ query_input.submit(search_and_update, inputs=[query_input], outputs=[doc_dropdown, relevant_text, pdf_display])
768
+ doc_dropdown.change(update_display, inputs=[doc_dropdown], outputs=[doc_display])
769
+ file_input.upload(upload_file, inputs=file_input, outputs=[doc_dropdown])
770
+
771
+ demo.launch()
772
+ """
773
+
774
+
775
+
776
+
777
+
778
+
779
+
780
+
781
+
782
+
783
+ ###funktioniert......................................
784
+ """
785
+ import gradio as gr
786
+ from sklearn.feature_extraction.text import TfidfVectorizer
787
+ from sklearn.metrics.pairwise import cosine_similarity
788
+
789
+ # Beispiel-Daten mit hartcodierten Texten
790
+ documents = [
791
+ {"file": "document1.pdf", "pages": ["Seite 1 Inhalt von Dokument 1", "Seite 2 Inhalt von Dokument 1"]},
792
+ {"file": "document2.pdf", "pages": ["Seite 1 Inhalt von Dokument 2", "Seite 2 Inhalt von Dokument 2"]}
793
+ ]
794
+
795
+ # TF-IDF Vectorizer vorbereiten
796
+ vectorizer = TfidfVectorizer()
797
+ tfidf_matrix = vectorizer.fit_transform([page for doc in documents for page in doc['pages']])
798
+
799
+ def display_document(doc_name):
800
+ # Hartcodierter HTML-Inhalt zur Anzeige des Dokuments
801
+ hardcoded_html = f
802
+ <h1>{doc_name}</h1>
803
+ <p>Dies ist ein Beispieltext für die Anzeige des Dokuments {doc_name}.</p>
804
+ <iframe src="https://www.example.com" width="100%" height="600px"></iframe>
805
+
806
+ return hardcoded_html
807
+
808
def search_documents(query):
    # Rank every page against `query` by TF-IDF cosine similarity.
    #
    # Returns a 3-tuple:
    #   results       - file names of documents with at least one page whose
    #                   similarity is > 0, best match first, each name once
    #   relevant_text - text snippets around literal (case-insensitive)
    #                   occurrences of the query, one per matching page
    #   relevant_pdfs - (file name, zero-based page index) for each snippet
    # An empty query returns all document names, "" and [].
    from bisect import bisect_right
    from itertools import accumulate

    if not query:
        return [doc['file'] for doc in documents], "", []

    query_vector = vectorizer.transform([query])
    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    ranked_rows = cosine_similarities.argsort()[::-1]  # highest score first

    # page_ends[d] = number of matrix rows that belong to documents 0..d.
    page_ends = list(accumulate(len(doc['pages']) for doc in documents))

    results = []
    seen_files = set()
    snippets = []
    relevant_pdfs = []
    needle = query.lower()

    for row in ranked_rows:
        if not cosine_similarities[row] > 0:
            continue
        row = int(row)  # plain int rather than a NumPy scalar

        # First document whose cumulative page count exceeds the row number.
        doc_index = bisect_right(page_ends, row)
        page_index = row - (page_ends[doc_index - 1] if doc_index else 0)
        doc = documents[doc_index]

        # A document can match on several pages; list it only once.
        if doc['file'] not in seen_files:
            seen_files.add(doc['file'])
            results.append(doc['file'])

        page_content = doc['pages'][page_index]
        index = page_content.lower().find(needle)
        if index != -1:
            start = max(0, index - 100)
            end = min(len(page_content), index + 100)
            snippets.append(
                f"Aus {doc['file']} (Seite {page_index + 1}):\n...{page_content[start:end]}...\n\n"
            )
            relevant_pdfs.append((doc['file'], page_index))

    return results, "".join(snippets), relevant_pdfs
837
+
838
def update_display(doc_name):
    # Dropdown change handler: produce the preview HTML for the chosen document.
    preview_html = display_document(doc_name)
    return preview_html
840
+
841
def search_and_update(query):
    # Search-box submit handler.
    #
    # Runs search_documents(query) and returns three values:
    #   1. an update that replaces the dropdown's choices with the matching
    #      file names (returning the bare list would be taken as the
    #      dropdown's value, not its choices)
    #   2. the snippet text
    #   3. HTML with one heading + iframe per page that produced a snippet
    import html  # local import: only this function needs it

    results, rel_text, relevant_pdfs = search_documents(query)

    html_parts = []
    for pdf, page in relevant_pdfs:
        # Hard-coded HTML content for showing the search hits; the file name
        # is escaped so it cannot break the markup.
        html_parts.append(f"<h3>{html.escape(pdf)} - Seite {page+1}</h3>")
        html_parts.append('<iframe src="https://www.example.com" width="100%" height="600px"></iframe>')

    return gr.update(choices=results), rel_text, "".join(html_parts)
851
+
852
# Set up the Gradio interface.
with gr.Blocks() as demo:
    gr.Markdown("# Dokumentensuche und -anzeige")

    query_input = gr.Textbox(label="Suchbegriff (leer lassen für alle Dokumente)")

    with gr.Row():
        # Wide left column: document picker and its preview.
        with gr.Column(scale=2):
            doc_dropdown = gr.Dropdown(
                choices=[doc['file'] for doc in documents],
                label="Dokumente",
            )
            doc_display = gr.HTML(label="Dokumentvorschau")
        # Narrow right column: snippets and embedded hits of the last search.
        with gr.Column(scale=1):
            relevant_text = gr.Textbox(label="Relevanter Text", lines=10)
            pdf_display = gr.HTML()

    # Submitting the search box refreshes dropdown, snippet box and hit list.
    query_input.submit(
        search_and_update,
        inputs=[query_input],
        outputs=[doc_dropdown, relevant_text, pdf_display],
    )
    # Picking a document renders its preview.
    doc_dropdown.change(
        update_display,
        inputs=[doc_dropdown],
        outputs=[doc_display],
    )

demo.launch()
870
+ """
871
+
872
+
873
+ """
874
+ import gradio as gr
875
+ import os
876
+ import fitz # PyMuPDF
877
+ from sklearn.feature_extraction.text import TfidfVectorizer
878
+ from sklearn.metrics.pairwise import cosine_similarity
879
+
880
# Folder holding the uploaded PDF files in the Hugging Face Space, as an
# absolute path resolved against the current working directory.
DOCS_DIR = os.path.abspath("kkg_dokumente")
882
+
883
# Extract the text of a PDF file, page by page.
def extract_text_from_pdf(pdf_path):
    # pdf_path: path of the PDF file to read.
    # Returns a list with one string per page, in page order.
    # Any error raised by fitz.open for an unreadable file propagates.
    #
    # The document is opened as a context manager so it is closed again even
    # if text extraction fails part-way (the handle was previously leaked).
    with fitz.open(pdf_path) as doc:
        return [page.get_text() for page in doc]
890
+
891
# Build the document list dynamically and extract the text of every PDF.
# os.listdir returns names in arbitrary order, so they are sorted to keep the
# dropdown order and the TF-IDF row order stable between runs. The suffix
# test is case-insensitive so files named "*.PDF" are picked up too.
documents = []
for file_name in sorted(os.listdir(DOCS_DIR)):
    if file_name.lower().endswith(".pdf"):
        pdf_path = os.path.join(DOCS_DIR, file_name)
        pages_text = extract_text_from_pdf(pdf_path)
        documents.append({"file": file_name, "pages": pages_text})

# Fit the TF-IDF model on all pages, flattened in document order, so that
# row k of the matrix corresponds to the k-th page overall.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([page for doc in documents for page in doc['pages']])
902
+
903
def display_document(doc_name):
    # Build the HTML that embeds the selected PDF in an iframe.
    #
    # doc_name: file name inside DOCS_DIR. A falsy value (nothing selected)
    #     yields an empty preview instead of a TypeError from os.path.join.
    # Returns an iframe element, or an error paragraph when the file does
    # not exist.
    import html
    from urllib.parse import quote

    if not doc_name:
        return ""

    file_path = os.path.join(DOCS_DIR, doc_name)

    if not os.path.exists(file_path):
        return f"<p>Fehler: Datei nicht gefunden - {html.escape(file_path)}</p>"

    # A bare filesystem path is not loadable by the browser; the file has to
    # be requested through the Gradio server's file route. The path is
    # URL-quoted (spaces, umlauts) and then escaped for the attribute.
    # NOTE(review): "/file=" is the route of Gradio 3.x/4.x; newer releases
    # use "/gradio_api/file=" - confirm against the installed version.
    file_url = "/file=" + quote(file_path)

    return f'<iframe src="{html.escape(file_url)}" width="100%" height="600px"></iframe>'
913
+
914
def search_documents(query):
    # Rank every page against `query` by TF-IDF cosine similarity.
    #
    # Returns a 3-tuple:
    #   results       - file names of documents with at least one page whose
    #                   similarity is > 0, best match first, each name once
    #   relevant_text - text snippets around literal (case-insensitive)
    #                   occurrences of the query, one per matching page
    #   relevant_pdfs - (file name, zero-based page index) for each snippet
    # An empty query returns all document names, "" and [].
    from bisect import bisect_right
    from itertools import accumulate

    if not query:
        return [doc['file'] for doc in documents], "", []

    query_vector = vectorizer.transform([query])
    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    ranked_rows = cosine_similarities.argsort()[::-1]  # highest score first

    # page_ends[d] = number of matrix rows that belong to documents 0..d.
    page_ends = list(accumulate(len(doc['pages']) for doc in documents))

    results = []
    seen_files = set()
    snippets = []
    relevant_pdfs = []
    needle = query.lower()

    for row in ranked_rows:
        if not cosine_similarities[row] > 0:
            continue
        row = int(row)  # plain int rather than a NumPy scalar

        # First document whose cumulative page count exceeds the row number.
        doc_index = bisect_right(page_ends, row)
        page_index = row - (page_ends[doc_index - 1] if doc_index else 0)
        doc = documents[doc_index]

        # A document can match on several pages; list it only once.
        if doc['file'] not in seen_files:
            seen_files.add(doc['file'])
            results.append(doc['file'])

        page_content = doc['pages'][page_index]
        index = page_content.lower().find(needle)
        if index != -1:
            start = max(0, index - 100)
            end = min(len(page_content), index + 100)
            snippets.append(
                f"Aus {doc['file']} (Seite {page_index + 1}):\n...{page_content[start:end]}...\n\n"
            )
            relevant_pdfs.append((doc['file'], page_index))

    return results, "".join(snippets), relevant_pdfs
943
+
944
def update_display(doc_name):
    # Dropdown change handler: produce the preview HTML for the chosen document.
    preview_html = display_document(doc_name)
    return preview_html
946
+
947
def search_and_update(query):
    # Search-box submit handler.
    #
    # Runs search_documents(query) and returns three values:
    #   1. an update that replaces the dropdown's choices with the matching
    #      file names
    #   2. the snippet text
    #   3. HTML with one heading + iframe (opened at the hit page) per page
    #      that produced a snippet, or an error paragraph for missing files
    import html
    from urllib.parse import quote

    results, rel_text, relevant_pdfs = search_documents(query)

    html_parts = []
    for pdf, page in relevant_pdfs:
        pdf_path = os.path.join(DOCS_DIR, pdf)

        if not os.path.exists(pdf_path):
            html_parts.append(f"<p>Fehler: Datei nicht gefunden - {html.escape(pdf_path)}</p>")
            continue

        # A bare filesystem path is not loadable by the browser; request the
        # file through the Gradio server's file route instead.
        # NOTE(review): "/file=" is the route of Gradio 3.x/4.x; newer
        # releases use "/gradio_api/file=" - confirm against the installed
        # version.
        file_url = html.escape("/file=" + quote(pdf_path))
        html_parts.append(f"<h3>{html.escape(pdf)} - Seite {page+1}</h3>")
        html_parts.append(
            f'<iframe src="{file_url}#page={page+1}" width="100%" height="600px"></iframe>'
        )

    # gr.update is the version-independent spelling; the per-component
    # gr.Dropdown.update classmethod no longer exists in Gradio 4+.
    return gr.update(choices=results), rel_text, "".join(html_parts)
962
+
963
# Set up the Gradio interface.
with gr.Blocks() as demo:
    gr.Markdown("# Dokumentensuche und -anzeige")

    query_input = gr.Textbox(label="Suchbegriff (leer lassen für alle Dokumente)")

    with gr.Row():
        # Wide left column: document picker and its preview.
        with gr.Column(scale=2):
            doc_dropdown = gr.Dropdown(
                choices=[doc['file'] for doc in documents],
                label="Dokumente",
            )
            doc_display = gr.HTML(label="Dokumentvorschau")
        # Narrow right column: snippets and embedded hits of the last search.
        with gr.Column(scale=1):
            relevant_text = gr.Textbox(label="Relevanter Text", lines=10)
            pdf_display = gr.HTML()

    # Submitting the search box refreshes dropdown, snippet box and hit list.
    query_input.submit(
        search_and_update,
        inputs=[query_input],
        outputs=[doc_dropdown, relevant_text, pdf_display],
    )
    # Picking a document renders its preview.
    doc_dropdown.change(
        update_display,
        inputs=[doc_dropdown],
        outputs=[doc_display],
    )

demo.launch()
981
+ """