PDF-Summarizer / app.py
Tuana's picture
changing to Gradio
f3a61e0
raw
history blame
1.38 kB
import gradio as gr
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import FARMReader, PreProcessor, PDFToTextConverter, TfidfRetriever
import logging
# Single in-memory store shared by every request handled by this app.
document_store = InMemoryDocumentStore()

# Shared preprocessor: cleans the extracted text, then splits it into
# word-based chunks before the chunks are written to the store.
preprocessor = PreProcessor(
    # Cleaning options.
    clean_whitespace=True,
    clean_empty_lines=True,
    clean_header_footer=True,
    # Splitting options.
    split_by="word",
    split_length=100,
    split_overlap=3,
    split_respect_sentence_boundary=True,
)
def pdf_to_document_store(pdf_files):
    """Replace the contents of the shared document store with the given PDFs.

    The module-level ``document_store`` is emptied first, each PDF is
    converted to text, the results are run through the module-level
    ``preprocessor`` and the resulting chunks are written back to the store.

    Args:
        pdf_files: Iterable of uploaded file objects; each must expose the
            path of the PDF on disk through its ``name`` attribute.

    Returns:
        None. The result is observable only through ``document_store``.
    """
    document_store.delete_documents()
    converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])

    documents = []
    for pdf in pdf_files:
        converted = converter.convert(file_path=pdf.name, meta=None)
        # NOTE(review): Haystack 1.x documents convert() as returning a list
        # of documents, while older releases returned a single document.
        # Flatten list results so the preprocessor always receives a flat
        # list instead of a list of lists.
        if isinstance(converted, list):
            documents.extend(converted)
        else:
            documents.append(converted)

    preprocessed_docs = preprocessor.process(documents)
    document_store.write_documents(preprocessed_docs)
    return None
def summarize(files):
    """Index the uploaded PDFs and report how many documents are stored.

    Args:
        files: Uploaded file objects from the Gradio file input, or ``None``
            when nothing was uploaded.

    Returns:
        The value of ``document_store.get_document_count()`` after any
        indexing has been performed.
    """
    if files is not None:
        # pdf_to_document_store() empties the store itself before writing,
        # so a separate delete_all_documents() call here is redundant.
        # NOTE(review): delete_all_documents() appears to be deprecated in
        # Haystack 1.x in favour of delete_documents() - confirm.
        pdf_to_document_store(files)
    # Always return the count so the text output is populated even when no
    # files were supplied.
    return document_store.get_document_count()
# Heading passed to the Gradio interface.
title = "Summarize one or more PDFs with a Haystack Summariser pipeline"

# A single multi-file upload component feeds summarize(); whatever it
# returns is rendered through the "text" output.
_pdf_upload = gr.inputs.File(
    label="Upload some PDFs",
    type="file",
    file_count="multiple",
)

iface = gr.Interface(
    fn=summarize,
    inputs=[_pdf_upload],
    outputs="text",
    theme="huggingface",
    title=title,
)

# Launch the interface when this script is executed.
iface.launch()