import os
import re
from io import BytesIO

import gradio as gr
from gtts import gTTS
from PyPDF2 import PdfReader
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer


# Hugging Face checkpoint used for summarization; the same identifier is used
# for both the model weights and the matching tokenizer.
model_name = "ArtifactAI/led_large_16384_arxiv_summarization"

# Loaded once at import time so every request reuses the same instances.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


# Whitespace that follows a sentence terminator (".", "?" or "!"), unless the
# terminator closes a dotted abbreviation such as "e.g." (first lookbehind) or
# a capitalised two-letter title such as "Dr." (second lookbehind).
_SENTENCE_BOUNDARY = re.compile(
    r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s'
)


def extract_first_sentence(text):
    """Return the first sentence of *text*.

    The text is cut at the first whitespace character that follows ".", "?"
    or "!" and is not part of a recognised abbreviation. If no such boundary
    exists, the whole text is returned unchanged.

    Args:
        text: The string to take the first sentence from.

    Returns:
        The first sentence including its terminating punctuation, or an
        empty string when *text* is empty.
    """
    if not text:
        return ""
    # Only the first piece is needed, so stop after a single split.
    return _SENTENCE_BOUNDARY.split(text, maxsplit=1)[0]


# Heading that opens the abstract (matched case-insensitively).
_ABSTRACT_HEADING = re.compile(r'\bAbstract\b', re.IGNORECASE)
# Headings treated as the end of the abstract (matched case-sensitively).
_NEXT_SECTION_HEADING = re.compile(r'\b(?:Introduction|Methodology|Conclusion)\b')


def _find_abstract(page_text):
    """Return the text after the "Abstract" heading on a page, or None if absent."""
    heading = _ABSTRACT_HEADING.search(page_text)
    if heading is None:
        return None
    remainder = page_text[heading.end():]
    next_section = _NEXT_SECTION_HEADING.search(remainder)
    # Without a following section heading, the abstract runs to the page end.
    return remainder[:next_section.start()] if next_section else remainder


def extract_abstract_and_summarize(pdf_file):
    """Extract a PDF's abstract, summarize it in one sentence and voice it.

    Pages are scanned in order; the first page containing the word
    "Abstract" supplies the abstract, which ends at the next
    Introduction/Methodology/Conclusion heading on that page (or at the end
    of the page). The abstract is summarized with the module-level model,
    reduced to its first sentence, and converted to speech with gTTS.

    Args:
        pdf_file: Path to the PDF, or an object whose ``name`` attribute is
            that path.

    Returns:
        A tuple ``(summary_sentence, audio_bytes, abstract_text)`` where
        ``audio_bytes`` is the raw audio data written by gTTS and
        ``abstract_text`` is the stripped abstract.

    Raises:
        Exception: For any failure, including a missing file or a PDF in
            which no abstract could be located. The original error is
            attached as ``__cause__``.
    """
    try:
        if pdf_file is None:
            raise ValueError("No PDF file was provided.")

        # NOTE(review): depending on the Gradio version, the upload may arrive
        # as a path string or as a file-like wrapper exposing the path via
        # ``.name`` - confirm against the deployed version.
        if isinstance(pdf_file, (str, os.PathLike)):
            pdf_path = pdf_file
        else:
            pdf_path = getattr(pdf_file, "name", pdf_file)

        abstract_text = ''
        # Keep the file open only while reading pages, not during inference.
        with open(pdf_path, 'rb') as file:
            for page in PdfReader(file).pages:
                # Guard against pages that yield no text at all.
                found = _find_abstract(page.extract_text() or '')
                if found is not None:
                    abstract_text = found.strip()
                    break

        if not abstract_text:
            # Fail clearly instead of summarizing and voicing an empty string.
            raise ValueError("No abstract could be found in the uploaded PDF.")

        inputs = tokenizer(abstract_text, return_tensors="pt")
        outputs = model.generate(**inputs)
        # Drop special-token markers so they are neither shown nor spoken.
        summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

        summary_sentence = extract_first_sentence(summary)

        speech = gTTS(text=summary_sentence, lang="en")
        speech_bytes = BytesIO()
        speech.write_to_fp(speech_bytes)

        return summary_sentence, speech_bytes.getvalue(), abstract_text

    except Exception as e:
        # Same exception type and message as before, but keep the cause chained.
        raise Exception(str(e)) from e


# Web UI: one PDF upload in; summary sentence, spoken summary and the
# extracted abstract out.
interface = gr.Interface(
    fn=extract_abstract_and_summarize,
    inputs=[gr.File(label="Upload PDF")],
    # One output component per value returned by fn, in the same order:
    # summary sentence, audio data, abstract text.
    outputs=[
        gr.Textbox(label="Summary"),
        gr.Audio(),
        gr.Textbox(label="Abstract"),
    ],
    title="PDF Summarization & Audio Tool",
    description="""PDF Summarization App. This app extracts the abstract from a PDF, summarizes it in one sentence, and generates an audio of it. Only upload PDFs with abstracts.
Please read the README.MD for information about the app and sample PDFs.""",
    # The sample PDF is expected to sit next to this script.
    examples=[os.path.join(os.path.dirname(__file__), "Article 11 Hidden Technical Debt in Machine Learning Systems.pdf")],
    cache_examples=True,
)

# Start the app; share=True requests a public share link from Gradio.
interface.launch(share=True)