kkg_suche / app.py
alexkueck's picture
Update app.py
fba9538 verified
raw
history blame
2.84 kB
import os
import PyPDF2
from docx import Document
from pptx import Presentation
import gradio as gr
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Directory whose files are listed, indexed and displayed by this app.
# NOTE(review): the value looks like a placeholder — presumably it has to be
# replaced with a real directory before the app can start; confirm.
DOCS_DIR = "path/to/your/documents"
def extract_text(file_path):
    """Return the plain text of a document, selected by its file extension.

    The extension is matched case-insensitively:

    * ``.pdf``  - text of all pages, joined with single spaces
    * ``.docx`` - text of all paragraphs, joined with single spaces
    * ``.pptx`` - text of every shape exposing ``.text``, joined with spaces
    * ``.txt`` / ``.md`` - the file content decoded as UTF-8

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text as a single string, or ``""`` when the extension
        is not one of the supported types.
    """
    extension = os.path.splitext(file_path)[1].lower()

    if extension == '.pdf':
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Extract while the file handle is still open. `or ""` keeps
            # join() from raising TypeError should a page yield no text.
            return ' '.join(page.extract_text() or "" for page in reader.pages)

    if extension == '.docx':
        doc = Document(file_path)
        return ' '.join(paragraph.text for paragraph in doc.paragraphs)

    if extension == '.pptx':
        prs = Presentation(file_path)
        # Only shapes that expose a `text` attribute contribute.
        return ' '.join(
            shape.text
            for slide in prs.slides
            for shape in slide.shapes
            if hasattr(shape, 'text')
        )

    if extension in ('.txt', '.md'):
        # errors='replace': a file that is not valid UTF-8 (e.g. Latin-1)
        # still yields searchable text instead of raising UnicodeDecodeError.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
            return file.read()

    return ""
# Load the documents and extract their text.
# os.listdir() returns entries in arbitrary order, so sort them to get a
# stable document order in the index and in the dropdown.
documents = []
for file in sorted(os.listdir(DOCS_DIR)):
    file_path = os.path.join(DOCS_DIR, file)
    # Skip sub-directories and other non-regular entries: a directory named
    # e.g. "notes.txt" would otherwise be opened as a file and raise.
    if not os.path.isfile(file_path):
        continue
    text = extract_text(file_path)
    # Entries that produced no text (empty string) are left out of the index.
    if text:
        documents.append({'file': file, 'content': text})

# TF-IDF vectorization for full-text search.
# The matrix is built from `documents` in list order, so row i belongs to
# documents[i].
# NOTE(review): fit_transform() is called even when `documents` is empty —
# presumably that fails at start-up; confirm and handle if required.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([doc['content'] for doc in documents])
def search_documents(query):
    """Return the file names of indexed documents that match *query*.

    An empty query returns every indexed file name in index order.
    Otherwise the query is projected into the fitted TF-IDF space and the
    documents are ranked by cosine similarity, best match first; documents
    whose similarity is not greater than zero are dropped.
    """
    if query:
        scores = cosine_similarity(
            vectorizer.transform([query]), tfidf_matrix
        ).flatten()
        # argsort() is ascending; reverse it to get the best match first.
        ranking = scores.argsort()[::-1]
        hits = []
        for idx in ranking:
            if scores[idx] > 0:
                hits.append(documents[idx]['file'])
        return hits

    return [entry['file'] for entry in documents]
def display_document(doc_name):
    """Return the extracted text of the document called *doc_name*.

    Args:
        doc_name: File name (relative to ``DOCS_DIR``) chosen in the
            dropdown; ``None`` or ``""`` when nothing is selected.

    Returns:
        The document text, or ``""`` when no document is selected or the
        name is not a plain file name.
    """
    # Nothing selected: os.path.join(DOCS_DIR, None) would raise TypeError.
    if not doc_name:
        return ""
    # The name arrives from the web client. Indexed names are plain file
    # names, so anything carrying a path component is rejected to keep reads
    # inside DOCS_DIR.
    if os.path.basename(doc_name) != doc_name:
        return ""
    file_path = os.path.join(DOCS_DIR, doc_name)
    return extract_text(file_path)
def interface(query=""):
    """Run a search and build the values for the two output components.

    Returns a 2-tuple: a dropdown listing the matching documents (the first
    match preselected, or no selection when there are none) and an empty
    string for the document-content textbox.
    """
    hits = search_documents(query)
    if hits:
        preselected = hits[0]
    else:
        preselected = None
    dropdown = gr.Dropdown(
        choices=hits,
        value=preselected,
        label="Gefundene Dokumente",
    )
    return dropdown, ""
# Build the UI: a search row, a document-selection row and a text area that
# shows the content of the chosen document.
with gr.Blocks() as demo:
    gr.Markdown("# Dokumentensuche und -anzeige")

    # Row 1: search term and search trigger.
    with gr.Row():
        query_input = gr.Textbox(label="Suchbegriff (leer lassen für alle Dokumente)")
        search_button = gr.Button("Suchen")

    # Row 2: document picker (initially all indexed files) and display trigger.
    with gr.Row():
        doc_dropdown = gr.Dropdown(
            label="Dokumente",
            choices=[doc['file'] for doc in documents],
        )
        display_button = gr.Button("Anzeigen")

    doc_content = gr.Textbox(lines=20, label="Dokumentinhalt")

    # Searching refreshes the dropdown and clears the content box;
    # displaying fills the content box with the selected document's text.
    search_button.click(
        fn=interface,
        inputs=[query_input],
        outputs=[doc_dropdown, doc_content],
    )
    display_button.click(
        fn=display_document,
        inputs=[doc_dropdown],
        outputs=[doc_content],
    )

demo.launch()