Elrmnd's picture
Update app.py
dda7485
raw
history blame
3.69 kB
# https://elrmnd-vocal-pdf-summarizer.hf.space
# Import libraries
import functools
from io import BytesIO

import gradio as gr
import PyPDF2
from gtts import gTTS
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Function to extract text from PDF
# Defines a function to extract raw text from the first page of a PDF file
def extract_text(pdf_file):
    """Return the text of the first page of a PDF.

    Only page 0 is read; the rest of the document is ignored.

    Args:
        pdf_file: Anything ``PyPDF2.PdfReader`` accepts. In this app it is
            the value Gradio hands over from the ``gr.File`` input.

    Returns:
        The extracted text of the first page, or an empty string when the
        PDF contains no pages.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    pages = reader.pages
    # A page-less PDF would otherwise raise IndexError on pages[0]; returning
    # an empty string lets the caller fall through to its "not found" path.
    if len(pages) == 0:
        return ""
    return pages[0].extract_text()
# Function to summarize text
# Defines a function to summarize the extracted text using pszemraj/led-base-book-summary
_SUMMARIZER_CHECKPOINT = "pszemraj/led-base-book-summary"
_ABSTRACT_NOT_FOUND = "Abstract not found in the document."
# How many ". "-separated sentences are kept after the one mentioning "Abstract".
_ABSTRACT_SENTENCE_COUNT = 7


@functools.lru_cache(maxsize=1)
def _load_summarizer():
    """Load the tokenizer and seq2seq model once and reuse them on later calls."""
    tokenizer = AutoTokenizer.from_pretrained(_SUMMARIZER_CHECKPOINT)
    model = AutoModelForSeq2SeqLM.from_pretrained(_SUMMARIZER_CHECKPOINT)
    return tokenizer, model


def _extract_abstract(text):
    """Return the sentences that follow the first one containing "Abstract".

    The text is split on ". "; up to ``_ABSTRACT_SENTENCE_COUNT`` sentences
    after the matching one are re-joined with ". ". Returns an empty string
    when no sentence contains "Abstract" or nothing follows it.
    """
    sentences = text.split(". ")
    for position, sentence in enumerate(sentences):
        if "Abstract" in sentence:
            first = position + 1
            return ". ".join(sentences[first:first + _ABSTRACT_SENTENCE_COUNT])
    return ""


def summarize_text(text):
    """Summarize the abstract found in ``text``.

    The abstract is taken to be the sentences following the first sentence
    that contains the word "Abstract" (case-sensitive). It is summarized
    with the ``pszemraj/led-base-book-summary`` checkpoint and the result is
    cut back to its last full stop so it does not end mid-sentence.

    NOTE(review): the sentence that contains "Abstract" is itself skipped, so
    any abstract text sharing that sentence with the heading is not
    summarized -- confirm this is intended.

    Args:
        text: Raw document text; may be empty or ``None``.

    Returns:
        The generated summary, or "Abstract not found in the document." when
        there is no usable abstract text.
    """
    if not text:
        return _ABSTRACT_NOT_FOUND

    abstract = _extract_abstract(text)
    # Covers both "no heading" and "heading with nothing after it"; there is
    # no point running the model on an empty prompt.
    if not abstract.strip():
        return _ABSTRACT_NOT_FOUND

    tokenizer, model = _load_summarizer()

    # Tokenize abstract
    inputs = tokenizer(abstract,
                       max_length=1024,
                       return_tensors="pt",
                       truncation=True)

    # Generate summary
    summary_ids = model.generate(inputs['input_ids'],
                                 max_length=50,
                                 min_length=30,
                                 no_repeat_ngram_size=3,
                                 encoder_no_repeat_ngram_size=3,
                                 repetition_penalty=3.5,
                                 num_beams=4,
                                 do_sample=True,
                                 early_stopping=False)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Drop any trailing partial sentence left by the max_length cut-off.
    head, stop, _ = summary.rpartition('.')
    if stop:
        summary = head + stop
    return summary
# Function to convert text to audio
# Defines a function to convert text to an audio file using Google Text-to-Speech
def text_to_audio(text):
    """Render ``text`` as English speech with gTTS and return the raw bytes.

    The audio is written to an in-memory buffer rather than to disk, and the
    full buffer contents are returned.
    """
    speech = gTTS(text, lang='en')
    with BytesIO() as stream:
        speech.write_to_fp(stream)
        # getvalue() yields the whole buffer regardless of the write position.
        return stream.getvalue()
### Main function
### The main function that ties everything together:
### extracts text, summarizes, and converts to audio.
def audio_pdf(pdf_file):
    """Run the full pipeline for one uploaded PDF.

    Extracts the text, summarizes it, and converts that summary to audio.

    Returns:
        A ``(summary, audio)`` pair: the summary string and the value
        produced by ``text_to_audio`` for it.
    """
    summary = summarize_text(extract_text(pdf_file))
    return summary, text_to_audio(summary)
# Define Gradio interface
# Gradio web interface with a file input, text output to display the summary
# and audio output to play the audio file. # Launches the interface
# Static copy and sample documents shown on the page.
APP_TITLE = "The Vocal PDF Summarizer"
APP_DESCRIPTION = "I will summarize PDFs that have an abstract and transform them into audio. If an abstract is not present in the document, a message will be displayed."
EXAMPLE_PDFS = [
    "Article 11 Hidden Technical Debt in Machine Learning Systems.pdf",
    "Article 6 BloombergGPT_ A Large Language Model for Finance.pdf",
    "Article 5 A Comprehensive Survey on Applications of Transformers for Deep Learning Tasks.pdf",
    "Article 8 Llama 2_ Open Foundation and Fine-Tuned Chat Models.pdf",
]

# One uploaded file goes in; the summary text and its audio come out.
inputs = gr.File()
summary_text = gr.Text()
audio_summary = gr.Audio()

iface = gr.Interface(
    fn=audio_pdf,
    inputs=inputs,
    outputs=[summary_text, audio_summary],
    title=APP_TITLE,
    description=APP_DESCRIPTION,
    examples=EXAMPLE_PDFS,
)

# Launch the interface
iface.launch()