|
|
|
import os |
|
import PyPDF2 |
|
from docx import Document |
|
from pptx import Presentation |
|
import gradio as gr |
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
|
|
# Directory whose files are indexed at start-up and served by the viewer.
# Placeholder value: point this at the real document folder before running.
DOCS_DIR = (
    "path/to/your/documents"
)
|
|
|
def extract_text(file_path):
    """Return the plain text of a document, or "" for unsupported types.

    The reader is chosen from the file extension (case-insensitive):
    ``.pdf`` through PyPDF2, ``.docx`` through python-docx, ``.pptx``
    through python-pptx, and ``.txt`` / ``.md`` as UTF-8 text.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text as a single string. For PDF, DOCX and PPTX the
        individual pieces (pages, paragraphs, shape texts) are joined with
        single spaces. An empty string is returned when the extension is
        not one of the supported ones.
    """
    # Normalise the extension once instead of lower-casing it per branch.
    extension = os.path.splitext(file_path)[1].lower()

    if extension == '.pdf':
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Pages are read lazily from the open handle, so build the
            # string before leaving the `with`. `or ''` guards against a
            # page that yields no text object, which would break join().
            return ' '.join(page.extract_text() or '' for page in reader.pages)

    if extension == '.docx':
        doc = Document(file_path)
        return ' '.join(paragraph.text for paragraph in doc.paragraphs)

    if extension == '.pptx':
        prs = Presentation(file_path)
        # Only shapes exposing a `text` attribute contribute.
        return ' '.join(
            shape.text
            for slide in prs.slides
            for shape in slide.shapes
            if hasattr(shape, 'text')
        )

    if extension in ('.txt', '.md'):
        # errors='replace' keeps a single badly encoded file from raising
        # UnicodeDecodeError; undecodable bytes become U+FFFD instead.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
            return file.read()

    return ""
|
|
|
|
|
# Build the in-memory corpus: one {'file', 'content'} entry per file in
# DOCS_DIR from which extract_text() returned non-empty text.
documents = []

# sorted() makes the index order (and therefore the dropdown order)
# deterministic; os.listdir() returns entries in arbitrary order.
for file in sorted(os.listdir(DOCS_DIR)):
    file_path = os.path.join(DOCS_DIR, file)

    # Skip sub-directories and other non-regular entries; a directory
    # named e.g. "notes.txt" would otherwise make extract_text() raise.
    if not os.path.isfile(file_path):
        continue

    text = extract_text(file_path)
    if text:
        documents.append({'file': file, 'content': text})

# Fit the TF-IDF model on all document texts; row i of tfidf_matrix
# corresponds to documents[i].
# NOTE(review): fit_transform is expected to raise ValueError when no
# document yielded any text (empty vocabulary) - confirm whether an empty
# DOCS_DIR should be supported.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([doc['content'] for doc in documents])
|
|
|
def search_documents(query):
    """Return the file names of indexed documents that match *query*.

    Args:
        query: Free-text search string. A blank query (None, empty, or
            whitespace only) selects every document.

    Returns:
        A list of file names. For a blank query, all documents in index
        order. Otherwise only documents whose TF-IDF cosine similarity to
        the query is greater than zero, ordered from most to least similar.
    """
    # Whitespace-only input carries no search terms; treat it like an empty
    # query instead of running a search that can only return nothing.
    if not query or not query.strip():
        return [doc['file'] for doc in documents]

    query_vector = vectorizer.transform([query])
    # One similarity score per indexed document, in the order of `documents`.
    scores = cosine_similarity(query_vector, tfidf_matrix).flatten()
    # argsort() is ascending, so reverse it to get the best matches first.
    ranked_indices = scores.argsort()[::-1]

    return [documents[i]['file'] for i in ranked_indices if scores[i] > 0]
|
|
|
def display_document(doc_name):
    """Return the extracted text of the document selected in the UI.

    Args:
        doc_name: File name of a document directly inside DOCS_DIR. May be
            None or empty when nothing is selected in the dropdown.

    Returns:
        The text produced by extract_text() for that file, or "" when no
        name was given or the name contains a directory component.
    """
    # Nothing selected: os.path.join() would raise TypeError on None.
    if not doc_name:
        return ""

    # The value comes from the web client. Only bare file names are served,
    # so something like "../secret.txt" cannot escape DOCS_DIR.
    if os.path.basename(doc_name) != doc_name:
        return ""

    file_path = os.path.join(DOCS_DIR, doc_name)
    return extract_text(file_path)
|
|
|
def interface(query=""):
    """Search handler: rebuild the result dropdown and clear the viewer.

    Args:
        query: Search text forwarded to search_documents().

    Returns:
        A 2-tuple of (dropdown, text): a gr.Dropdown listing the matching
        file names with the first match preselected (no selection when
        there are no matches), and an empty string for the content box.
    """
    hits = search_documents(query)

    if hits:
        preselected = hits[0]
    else:
        preselected = None

    result_dropdown = gr.Dropdown(
        choices=hits,
        value=preselected,
        label="Gefundene Dokumente",
    )
    return result_dropdown, ""
|
|
|
# Assemble the Gradio page: a search row, a document picker row, and a
# text box showing the selected document's content.
with gr.Blocks() as demo:
    gr.Markdown("# Dokumentensuche und -anzeige")

    # Row 1: search field and its trigger button.
    with gr.Row():
        query_input = gr.Textbox(label="Suchbegriff (leer lassen für alle Dokumente)")
        search_button = gr.Button("Suchen")

    # Row 2: document picker (initially every indexed file) and show button.
    with gr.Row():
        doc_dropdown = gr.Dropdown(
            choices=[entry['file'] for entry in documents],
            label="Dokumente",
        )
        display_button = gr.Button("Anzeigen")

    doc_content = gr.Textbox(label="Dokumentinhalt", lines=20)

    # Searching refreshes the dropdown and clears the content box.
    search_button.click(
        fn=interface,
        inputs=[query_input],
        outputs=[doc_dropdown, doc_content],
    )
    # Showing loads the selected document's text into the content box.
    display_button.click(
        fn=display_document,
        inputs=[doc_dropdown],
        outputs=[doc_content],
    )

# Start the web server for the assembled page.
demo.launch()