"""Streamlit app that summarizes the text of an uploaded PDF file."""

import PyPDF2
import streamlit as st
from dotenv import load_dotenv
from transformers import pipeline

# Passed to the summarizer as ``min_length``. It is also used as the lower
# bound of the length slider so the user can never request a ``max_length``
# that is smaller than the minimum.
MIN_SUMMARY_LENGTH = 8


def retrieve_pdf_text(pdf_file) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: Whatever ``PyPDF2.PdfReader`` accepts as its source; in
            this app it is the object returned by ``st.file_uploader``.

    Returns:
        The extracted text of all pages joined without a separator. The
        result is an empty string when no page yields any text.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    # ``or ""`` guards against a page whose extraction yields nothing
    # (e.g. no text layer), so one such page cannot break the join.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)


def main() -> None:
    """Render the page: upload a PDF, pick a length, show the summary."""
    load_dotenv()
    st.set_page_config(page_title='Document Summarizer', page_icon=':books:')
    st.header("Summarize a PDF")

    # Hugging Face model identifier handed to the summarization pipeline.
    hf_name = "pszemraj/led-base-book-summary"

    pdf_file = st.file_uploader("Upload a PDF file with text", type=["pdf"])
    # The lower bound matches ``min_length`` below; a smaller maximum would
    # give the generator contradictory length constraints.
    length = st.slider('Max summary length', MIN_SUMMARY_LENGTH, 3000, 1000)

    # if a pdf file is uploaded
    if not pdf_file:
        return
    if not st.button("Run"):
        return

    # Streamlit re-executes this script on every widget interaction, so the
    # PDF is parsed only once the user has actually asked for a summary.
    raw_text = retrieve_pdf_text(pdf_file)
    if not raw_text.strip():
        st.warning("No extractable text was found in this PDF.")
        return

    with st.spinner("Summarizing.."):
        # NOTE(review): the pipeline is rebuilt on every run; caching it
        # would likely speed up repeated runs -- confirm which Streamlit
        # caching API the deployed version provides before adding one.
        summarizer = pipeline("summarization", hf_name)
        result = summarizer(
            raw_text,
            min_length=MIN_SUMMARY_LENGTH,
            max_length=length,
            no_repeat_ngram_size=3,
            encoder_no_repeat_ngram_size=3,
            repetition_penalty=3.5,
            num_beams=4,
            do_sample=False,
            early_stopping=True,
        )
    st.write(result[0]["summary_text"])


if __name__ == '__main__':
    main()