import logging

import gradio as gr
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import FARMReader, PreProcessor, PDFToTextConverter, TfidfRetriever

# Single in-memory store shared by every request; it is emptied and refilled
# on each upload, so it only ever holds the most recent batch of PDFs.
document_store = InMemoryDocumentStore()

# Cleans the extracted text and splits it into ~100-word passages that end on
# sentence boundaries, with a 3-word overlap between neighbouring passages.
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,
    split_overlap=3,
)


def pdf_to_document_store(pdf_files) -> None:
    """Replace the contents of ``document_store`` with the given PDFs.

    The store is cleared first, then every PDF is converted to text,
    run through ``preprocessor`` and written to the store.

    Args:
        pdf_files: Iterable of uploaded file objects; each must expose the
            on-disk path of the PDF through its ``name`` attribute.
    """
    document_store.delete_documents()
    converter = PDFToTextConverter(
        remove_numeric_tables=True,
        valid_languages=["en"],
    )

    documents = []
    for pdf in pdf_files:
        converted = converter.convert(file_path=pdf.name, meta=None)
        # Haystack 1.x returns a list of documents per file; appending that
        # list would hand the preprocessor a list of lists. Flatten it, but
        # still accept a single document object from releases that return one.
        if isinstance(converted, list):
            documents.extend(converted)
        else:
            documents.append(converted)

    preprocessed_docs = preprocessor.process(documents)
    document_store.write_documents(preprocessed_docs)
    return None


def summarize(files):
    """Index the uploaded PDFs and report how many passages were stored.

    Args:
        files: Uploaded file objects from the Gradio file input, or ``None``
            when nothing was uploaded (the store is then left untouched).

    Returns:
        The number of documents currently held in ``document_store``.
    """
    if files is not None:
        # pdf_to_document_store() clears the store itself, so no separate
        # (deprecated) delete_all_documents() call is needed here.
        pdf_to_document_store(files)
    return document_store.get_document_count()


title = "Summarize one or more PDFs with a Haystack Summariser pipeline"

iface = gr.Interface(
    fn=summarize,
    inputs=[
        gr.inputs.File(
            file_count="multiple",
            type="file",
            label="Upload some PDFs",
        )
    ],
    outputs="text",
    title=title,
    theme="huggingface",
)
iface.launch()