Spaces:

mrsk1883
/

testing

Sleeping

File size: 2,893 Bytes

9ba52bb
 
 
 
 
 
c870fc7
9ba52bb
 
 
 
 
 
 
 
 
 
 
 
c870fc7
9ba52bb
c870fc7
 
 
9ba52bb
c870fc7
 
 
9ba52bb
c870fc7
9ba52bb
c870fc7
 
9ba52bb
c870fc7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9ba52bb
 
 
 
 
c870fc7
9ba52bb
 
c870fc7
 
 
 
 
9ba52bb
 
c870fc7

import gradio as gr
from PyPDF2 import PdfReader
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from gtts import gTTS
from io import BytesIO
import re
import os

# Checkpoint identifier handed to both from_pretrained() calls below.
# NOTE(review): the name suggests an LED model fine-tuned for arXiv
# summarization with a 16384-token window — confirm against the model card.
model_name = "ArtifactAI/led_large_16384_arxiv_summarization"
# The model and tokenizer are created once, at import time, and are read as
# module-level globals by extract_abstract_and_summarize().
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Sentence boundary: a whitespace character that follows '.', '?' or '!',
# unless the text just before it looks like an abbreviation such as "e.g."
# (letter-dot-letter-char) or "Dr." (capital, lowercase, dot).
_SENTENCE_BOUNDARY = re.compile(
    r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s'
)


def extract_first_sentence(text):
    """Return the first sentence of *text*.

    A sentence ends at the first whitespace that follows '.', '?' or '!'
    and is not part of a recognised abbreviation. When no such boundary
    exists (including for the empty string) the whole of *text* is
    returned unchanged.

    Args:
        text: The string to take the first sentence from.

    Returns:
        The first sentence, including its terminating punctuation.
    """
    # split() always yields at least one element, so indexing [0] is safe;
    # maxsplit=1 stops scanning once the first boundary has been found.
    return _SENTENCE_BOUNDARY.split(text, maxsplit=1)[0]

# Heading that marks the start of the abstract (any capitalisation).
_ABSTRACT_HEADING = re.compile(r'\bAbstract\b', re.IGNORECASE)
# Headings treated as the end of the abstract (case-sensitive).
_NEXT_SECTION = re.compile(r'\b(?:Introduction|Methodology|Conclusion)\b')


def _abstract_from_page(page_text):
    """Return the abstract on one page, or None if it has no 'Abstract' heading."""
    heading = _ABSTRACT_HEADING.search(page_text)
    if heading is None:
        return None
    body = page_text[heading.end():]
    next_section = _NEXT_SECTION.search(body)
    # Without a following section heading the abstract runs to the page end.
    return body[:next_section.start()] if next_section else body


def _read_abstract(pdf_path):
    """Return the abstract from the first page with an 'Abstract' heading, or ''."""
    with open(pdf_path, 'rb') as file:
        for page in PdfReader(file).pages:
            # extract_text() may yield nothing for a page (e.g. a scanned
            # image); treat that as empty text instead of crashing in re.
            abstract = _abstract_from_page(page.extract_text() or '')
            if abstract is not None:
                return abstract
    return ''


def extract_abstract_and_summarize(pdf_file):
    """Extract a PDF's abstract, summarize it in one sentence and voice it.

    Args:
        pdf_file: The uploaded PDF, either as a filesystem path or as an
            object exposing the path through a ``name`` attribute.

    Returns:
        A tuple ``(summary_sentence, audio_bytes, abstract_text)``: the first
        sentence of the model's summary, the gTTS audio for that sentence as
        raw bytes, and the stripped abstract text.

    Raises:
        gr.Error: If no file was given, no abstract could be found, the
            summary is empty, or any processing step fails. ``gr.Error`` is
            an ``Exception`` subclass, and the original failure is chained
            as its cause.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF file.")

    # Depending on the Gradio version, gr.File passes a path string or a
    # temp-file wrapper whose .name holds the path; accept both.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    try:
        abstract_text = _read_abstract(pdf_path)
        if not abstract_text.strip():
            # Fail early rather than summarizing and voicing an empty string.
            raise gr.Error("No abstract found in the uploaded PDF.")

        # Summarize the extracted abstract
        inputs = tokenizer(abstract_text, return_tensors="pt")
        outputs = model.generate(**inputs)
        # skip_special_tokens keeps markers such as <s>/</s> out of the
        # displayed summary and the generated speech.
        summary = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

        # Extract only the first sentence
        summary_sentence = extract_first_sentence(summary)
        if not summary_sentence:
            raise gr.Error("The model produced an empty summary.")

        # Generate audio
        speech = gTTS(text=summary_sentence, lang="en")
        speech_bytes = BytesIO()
        speech.write_to_fp(speech_bytes)

        # Return individual output values
        return summary_sentence, speech_bytes.getvalue(), abstract_text.strip()

    except gr.Error:
        # Already a user-facing error; do not re-wrap it.
        raise
    except Exception as e:
        # Surface the message in the UI while keeping the original traceback.
        raise gr.Error(str(e)) from e

# Build the web UI. The output components line up, in order, with the
# (summary, audio bytes, abstract) tuple returned by
# extract_abstract_and_summarize; the third Textbox makes the extracted
# abstract visible instead of leaving the handler's third value unmatched.
interface = gr.Interface(
    fn=extract_abstract_and_summarize,
    inputs=[gr.File(label="Upload PDF")],
    outputs=[
        gr.Textbox(label="Summary"),
        gr.Audio(),
        gr.Textbox(label="Abstract"),
    ],
    title="PDF Summarization & Audio Tool",
    description="""PDF Summarization App. This app extracts the abstract from a PDF, summarizes it in one sentence, and generates an audio of it. Only upload PDFs with abstracts. 
    Please read the README.MD for information about the app and sample PDFs.""",
    # The sample PDF is resolved relative to this file's directory.
    examples=[os.path.join(os.path.dirname(__file__), "Article 11 Hidden Technical Debt in Machine Learning Systems.pdf")],
    # Example outputs are computed ahead of time and cached.
    cache_examples=True,
)

# Start the app; share=True requests a public share link.
interface.launch(share=True)