File size: 2,893 Bytes
9ba52bb
 
 
 
 
 
c870fc7
9ba52bb
 
 
 
 
 
 
 
 
 
 
 
c870fc7
9ba52bb
c870fc7
 
 
9ba52bb
c870fc7
 
 
9ba52bb
c870fc7
9ba52bb
c870fc7
 
9ba52bb
c870fc7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9ba52bb
 
 
 
 
c870fc7
9ba52bb
 
c870fc7
 
 
 
 
9ba52bb
 
c870fc7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import gradio as gr
from PyPDF2 import PdfReader
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from gtts import gTTS
from io import BytesIO
import re
import os

# Checkpoint identifier handed to both `from_pretrained` calls below, so the
# model and its tokenizer always come from the same checkpoint.
model_name = "ArtifactAI/led_large_16384_arxiv_summarization"
# Both objects are created once at import time and reused as module-level
# globals by extract_abstract_and_summarize on every request.
# NOTE(review): presumably this fetches weights from the Hugging Face Hub (or a
# local cache) on first start-up — confirm for the deployment environment.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def extract_first_sentence(text):
    """Return the first sentence of *text*.

    A sentence boundary is a whitespace character that directly follows a
    ``.`` or ``?``, except when the preceding characters look like a dotted
    abbreviation (such as ``e.g.``) or a capitalised two-letter abbreviation
    (such as ``Dr.``).

    Args:
        text: The string to scan.

    Returns:
        Everything before the first sentence boundary, including the
        terminating punctuation. When no boundary exists (including for the
        empty string) *text* is returned unchanged.
    """
    # re.split always yields at least one element, so indexing [0] is safe and
    # no fallback branch is needed; maxsplit=1 stops scanning for further
    # boundaries once the first one has been found.
    return re.split(
        r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text, maxsplit=1
    )[0]

def _abstract_from_page(text):
    """Return the abstract portion of one page's text, or None if absent.

    The abstract starts right after the first whole-word, case-insensitive
    occurrence of "Abstract" and runs up to the next "Introduction",
    "Methodology" or "Conclusion" heading word (case-sensitive), or to the end
    of the page text when none of those follows.
    """
    heading = re.search(r'\bAbstract\b', text, re.IGNORECASE)
    if heading is None:
        return None

    remainder = text[heading.end():]
    # Check for the next heading or section marker
    next_section = re.search(r'\b(?:Introduction|Methodology|Conclusion)\b', remainder)
    if next_section:
        return remainder[:next_section.start()]
    return remainder


def extract_abstract_and_summarize(pdf_file):
    """Extract a PDF's abstract, summarize it in one sentence and voice it.

    Args:
        pdf_file: The uploaded PDF, either as a filesystem path or as an
            object exposing the path through a ``name`` attribute.
            NOTE(review): which of the two Gradio passes depends on the
            installed version / ``gr.File`` ``type`` setting — confirm.

    Returns:
        A 3-tuple ``(summary_sentence, audio_bytes, abstract_text)``: the first
        sentence of the generated summary, the gTTS audio of that sentence as
        raw bytes, and the stripped abstract text.

    Raises:
        Exception: Any failure (unreadable file, no abstract found, model or
            text-to-speech error) is re-raised as a plain ``Exception``
            carrying the original message, chained to the original error.
    """
    try:
        # Accept real paths as-is; otherwise fall back to the wrapper's .name.
        # (pathlib.Path also has a .name, hence the explicit PathLike check.)
        if isinstance(pdf_file, (str, bytes, os.PathLike)):
            pdf_path = pdf_file
        else:
            pdf_path = getattr(pdf_file, "name", pdf_file)

        abstract_text = ''
        # Keep the file open only while reading pages, not during inference.
        with open(pdf_path, 'rb') as file:
            for page in PdfReader(file).pages:
                # Guard against a page yielding no text at all.
                found = _abstract_from_page(page.extract_text() or '')
                if found is not None:
                    abstract_text = found
                    break  # Exit loop once abstract is found

        # Fail with a clear message instead of summarizing an empty string.
        if not abstract_text.strip():
            raise ValueError("No abstract text could be found in the uploaded PDF.")

        # Summarize the extracted abstract
        inputs = tokenizer(abstract_text, return_tensors="pt")
        outputs = model.generate(**inputs)
        # Drop special tokens so they are neither displayed nor read aloud.
        summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract only the first sentence
        summary_sentence = extract_first_sentence(summary)

        # Generate audio
        speech = gTTS(text=summary_sentence, lang="en")
        speech_bytes = BytesIO()
        speech.write_to_fp(speech_bytes)

        # Return individual output values
        return summary_sentence, speech_bytes.getvalue(), abstract_text.strip()

    except Exception as e:
        # Same exception type and message as before, but keep the original
        # error chained so its traceback is not lost.
        raise Exception(str(e)) from e

# Gradio UI wiring: a single PDF upload is passed to
# extract_abstract_and_summarize, whose three return values (summary sentence,
# audio bytes, abstract text) are shown in the three output components below.
interface = gr.Interface(
    fn=extract_abstract_and_summarize,
    inputs=[gr.File(label="Upload PDF")],
    # One output component per value returned by fn, in the same order.
    outputs=[gr.Textbox(label="Summary"), gr.Audio(), gr.Textbox(label="Abstract")],
    title="PDF Summarization & Audio Tool",
    description="""PDF Summarization App. This app extracts the abstract from a PDF, summarizes it in one sentence, and generates an audio of it. Only upload PDFs with abstracts. 
    Please read the README.MD for information about the app and sample PDFs.""",
    # The sample PDF is resolved relative to this file's directory.
    examples=[os.path.join(os.path.dirname(__file__), "Article 11 Hidden Technical Debt in Machine Learning Systems.pdf")],
    cache_examples=True,
)

# Start the app at import/run time and request a public share link.
interface.launch(share=True)