testing / app.py
mrsk1883's picture
Update app.py
c870fc7
import gradio as gr
from PyPDF2 import PdfReader
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from gtts import gTTS
from io import BytesIO
import re
import os
# Checkpoint identifier passed to both from_pretrained() calls below.
model_name = "ArtifactAI/led_large_16384_arxiv_summarization"
# Model and tokenizer are loaded once at import time; the summarization
# function further down reads these module-level names on every call.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
def extract_first_sentence(text: str) -> str:
    """Return the first sentence of *text*.

    A sentence boundary is a whitespace character that directly follows
    "." or "?", unless the preceding characters look like a dotted
    abbreviation (such as "e.g.") or a short capitalised title (such as
    "Dr."). When no boundary is found the whole text is returned
    unchanged.

    Args:
        text: The text to take the first sentence from.

    Returns:
        The text up to (and including) the first sentence terminator, or
        all of *text* if it contains no sentence boundary.
    """
    # re.split always yields at least one element, so no empty-result
    # fallback is needed; maxsplit=1 stops work after the first boundary.
    return re.split(
        r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text, maxsplit=1
    )[0]
def _abstract_from_page(page_text):
    """Return the abstract contained in *page_text*, or None if the page has no "Abstract" marker."""
    heading = re.search(r'\bAbstract\b', page_text, re.IGNORECASE)
    if heading is None:
        return None
    body = page_text[heading.end():]
    # The abstract runs up to the next recognised section heading, if any;
    # otherwise it is taken to be the remainder of the page.
    next_section = re.search(r'\b(?:Introduction|Methodology|Conclusion)\b', body)
    return body[:next_section.start()] if next_section else body


def extract_abstract_and_summarize(pdf_file):
    """Extract a PDF's abstract, summarize it in one sentence and voice it.

    Pages are scanned in order; the first page containing the word
    "Abstract" supplies the abstract text. That text is summarized with
    the module-level model/tokenizer, reduced to its first sentence, and
    converted to speech with gTTS.

    Args:
        pdf_file: Path to the PDF, or an uploaded-file object exposing the
            path as ``.name`` (assumed to be how some Gradio versions pass
            uploads -- verify against the installed version).

    Returns:
        A tuple ``(summary_sentence, audio_bytes, abstract_text)`` where
        ``audio_bytes`` is the raw gTTS output and ``abstract_text`` is the
        whitespace-stripped abstract.

    Raises:
        Exception: Any failure (unreadable PDF, no abstract, model or TTS
            error) is re-raised as a plain Exception carrying the original
            message, chained to the original error.
    """
    try:
        # Accept both a plain path and a wrapper object carrying the path.
        if isinstance(pdf_file, (str, bytes, os.PathLike)):
            pdf_path = pdf_file
        else:
            pdf_path = getattr(pdf_file, 'name', pdf_file)

        abstract_text = ''
        with open(pdf_path, 'rb') as file:
            for page in PdfReader(file).pages:
                # Guard against pages for which no text could be extracted.
                found = _abstract_from_page(page.extract_text() or '')
                if found is not None:
                    abstract_text = found
                    break  # Exit loop once abstract is found

        # Fail clearly instead of summarizing an empty string.
        if not abstract_text.strip():
            raise ValueError("No abstract found in the PDF.")

        # Summarize the extracted abstract
        inputs = tokenizer(abstract_text, return_tensors="pt")
        outputs = model.generate(**inputs)
        # Drop special tokens so markers such as <s>/</s> do not end up in
        # the displayed summary or get read aloud.
        summary = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

        # Extract only the first sentence
        summary_sentence = extract_first_sentence(summary)

        # Generate audio
        speech = gTTS(text=summary_sentence, lang="en")
        speech_bytes = BytesIO()
        speech.write_to_fp(speech_bytes)

        # Return individual output values
        return summary_sentence, speech_bytes.getvalue(), abstract_text.strip()
    except Exception as e:
        # Keep the plain-Exception contract but preserve the original traceback.
        raise Exception(str(e)) from e
# Build the Gradio UI: a single PDF upload in; the one-sentence summary, its
# spoken version and the extracted abstract out.
interface = gr.Interface(
    fn=extract_abstract_and_summarize,
    inputs=[gr.File(label="Upload PDF")],
    # One component per value returned by the function:
    # (summary sentence, audio bytes, abstract text).
    outputs=[
        gr.Textbox(label="Summary"),
        gr.Audio(),
        gr.Textbox(label="Abstract"),
    ],
    title="PDF Summarization & Audio Tool",
    description="""PDF Summarization App. This app extracts the abstract from a PDF, summarizes it in one sentence, and generates an audio of it. Only upload PDFs with abstracts.
Please read the README.MD for information about the app and sample PDFs.""",
    # The sample PDF is resolved relative to this file's directory.
    examples=[os.path.join(os.path.dirname(__file__), "Article 11 Hidden Technical Debt in Machine Learning Systems.pdf")],
    cache_examples=True,
)
interface.launch(share=True)