Spaces:
Build error
Build error
File size: 2,888 Bytes
28ec4f0 a3fdd99 6e57c67 836e16d a3fdd99 f6cc0cb a3fdd99 9a54394 5fdc2d5 cc0fbf1 9a54394 67f4a7d 9a54394 a3fdd99 3a4a956 a3fdd99 3a4a956 f6cc0cb a7fa548 3a4a956 2d4dc51 67f4a7d d42a71a 67f4a7d 9a54394 9097656 fe7b517 3a4a956 a3fdd99 3a4a956 67f4a7d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
import streamlit as st
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TransformersSummarizer, PreProcessor, PDFToTextConverter
from haystack.schema import Document
import logging
import base64
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None}, allow_output_mutation=True)
def start_haystack():
    """Create the Haystack components the app works with.

    Decorated with ``st.cache``; ``SwigPyObject`` instances are hashed as
    ``None`` and the returned objects are allowed to be mutated.

    Returns:
        A ``(document_store, summarizer, preprocessor)`` tuple.
    """
    store = InMemoryDocumentStore()

    # Cleaning and splitting options: split by word, 200 per chunk, keeping
    # sentence boundaries intact.
    preprocessing_options = {
        "clean_empty_lines": True,
        "clean_whitespace": True,
        "clean_header_footer": True,
        "split_by": "word",
        "split_length": 200,
        "split_respect_sentence_boundary": True,
    }
    text_preprocessor = PreProcessor(**preprocessing_options)

    bart_summarizer = TransformersSummarizer(
        model_name_or_path="facebook/bart-large-cnn"
    )
    return store, bart_summarizer, text_preprocessor
def pdf_to_document_store(pdf_file):
    """Replace the document store's contents with the text of a PDF.

    The raw bytes are written to ``temp-path.pdf`` in the working directory,
    converted to text, preprocessed, and written to the module-level
    ``document_store``. Any documents already in the store are deleted first.

    Args:
        pdf_file: Binary file-like object; ``read()`` must return the PDF bytes.
    """
    document_store.delete_documents()
    converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])

    # Write the bytes as-is (no base64 round trip). Leaving the ``with`` block
    # closes and flushes the file before the converter opens it by path.
    with open("temp-path.pdf", "wb") as temp_file:
        temp_file.write(pdf_file.read())

    doc = converter.convert(file_path="temp-path.pdf", meta=None)
    preprocessed_docs = preprocessor.process(doc)
    document_store.write_documents(preprocessed_docs)
def summarize(file):
    """Load ``file`` into the document store and summarize everything in it.

    Args:
        file: Uploaded PDF, passed straight to ``pdf_to_document_store``.

    Returns:
        Whatever ``summarizer.predict`` returns when asked for a single
        summary over all stored documents.
    """
    pdf_to_document_store(file)
    stored_documents = document_store.get_all_documents()
    return summarizer.predict(
        documents=stored_documents,
        generate_single_summary=True,
    )
def set_state_if_absent(key, value):
    """Store ``value`` under ``key`` in ``st.session_state`` unless the key exists."""
    if key in st.session_state:
        # Already present: keep the existing value.
        return
    st.session_state[key] = value
# --- Page script: runs top to bottom each time Streamlit executes the file. ---

# Make sure the "summaries" key exists before it is read further down.
set_state_if_absent("summaries", None)

document_store, summarizer, preprocessor = start_haystack()

st.markdown(
    """
This Summarization demo uses a [Haystack TransformerSummarizer node](https://haystack.deepset.ai/pipeline_nodes/summarizer). You can upload a PDF file, which will be converted to text with the [Haystack PDFtoTextConverter](https://haystack.deepset.ai/reference/file-converters#pdftotextconverter). In this demo, we produce 1 summary for the whole file you upload. So, the TransformerSummarizer treats the whole thing as one string, which means along with the model limitations, PDFs that have a lot of unneeded text at the beginning produce poor results.
""",
    unsafe_allow_html=True,
)

uploaded_file = st.file_uploader("Choose a PDF file", accept_multiple_files=False)

if uploaded_file is not None:
    if st.button('Summarize Document'):
        with st.spinner("π Please wait while we produce a summary..."):
            try:
                st.session_state.summaries = summarize(uploaded_file)
            except Exception as e:
                # Top-level boundary: keep the app alive, record the traceback,
                # and tell the user instead of failing silently.
                logging.exception(e)
                st.error(
                    "Sorry, something went wrong while summarizing this file. "
                    "Please try again."
                )

# Summaries are kept in session state, so they are shown whenever present.
if st.session_state.summaries:
    st.write('## Summary')
    for summary in st.session_state.summaries:
        st.write(summary.content)
|