mrsk1883 committed on
Commit
e12a777
·
1 Parent(s): b404b25

Upload 2 files

Browse files
Files changed (2) hide show
  1. app (3).py +67 -0
  2. requirements (3).txt +14 -0
app (3).py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from PyPDF2 import PdfReader
3
+ from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
4
+ from gtts import gTTS
5
+ from io import BytesIO
6
+ import re
7
+ import os
8
+
9
# Hugging Face checkpoint identifier of the LED-large summarization model.
model_name = "pszemraj/led-large-book-summary"

# Build the summarization pipeline once, at import time, so every request
# reuses it; the model and tokenizer are loaded from the same checkpoint.
summarizer = pipeline(
    task="summarization",
    model=model_name,
    tokenizer=model_name,
)
12
+
13
def extract_abstract_and_summarize(pdf_file):
    """Extract a PDF's abstract, condense it to one sentence, and voice it.

    The first page whose text contains the word "Abstract" is used. The
    abstract is taken as the text following that word, up to the next
    occurrence of "Introduction" on the same page (or to the end of the page
    when no "Introduction" follows).

    Args:
        pdf_file: Path to the PDF, or an upload wrapper object exposing the
            file's path through a ``name`` attribute.

    Returns:
        A tuple ``(first_sentence, audio_bytes, abstract_text)``: the first
        sentence of the generated summary, the gTTS audio for that sentence
        as raw bytes, and the extracted abstract with surrounding whitespace
        removed.

    Raises:
        Exception: On any failure (missing file argument, unreadable PDF, no
            abstract found, summarization or speech error). The message is
            the original error's message and the original error is attached
            as ``__cause__``.
    """
    try:
        if pdf_file is None:
            raise ValueError("PDF file is not provided.")

        # Plain paths are used as-is; any other object (e.g. a temp-file
        # wrapper handed over by the upload widget) is expected to carry its
        # on-disk location in `.name`. Path-likes are checked first because
        # pathlib's `.name` is only the final path component.
        if isinstance(pdf_file, (str, bytes, os.PathLike)):
            pdf_path = pdf_file
        else:
            pdf_path = getattr(pdf_file, "name", pdf_file)

        abstract_text = ""
        with open(pdf_path, "rb") as file:
            pdf_reader = PdfReader(file)
            for page in pdf_reader.pages:
                # Guard against pages that yield no text at all.
                text = page.extract_text() or ""
                abstract_match = re.search(r"\bAbstract\b", text, re.IGNORECASE)
                if abstract_match is None:
                    continue

                start_index = abstract_match.end()
                introduction_match = re.search(
                    r"\bIntroduction\b", text[start_index:], re.IGNORECASE
                )
                if introduction_match:
                    # The match offset is relative to the sliced text.
                    end_index = start_index + introduction_match.start()
                else:
                    end_index = None
                abstract_text = text[start_index:end_index]
                break

        # Fail early with a clear message instead of feeding an empty string
        # to the summarization model.
        if not abstract_text.strip():
            raise ValueError("No abstract found in the PDF.")

        # Summarize the extracted abstract with a capped output length.
        result = summarizer(abstract_text, max_length=81)

        # Keep only the first sentence of the summary.
        if isinstance(result, list) and result:
            summary = result[0].get('summary_text', 'Summary not available.')
            first_sentence = summary.split('.')[0] + '.'
        else:
            first_sentence = "Summary not available."

        # Synthesize speech for the sentence into an in-memory buffer.
        speech = gTTS(text=first_sentence, lang="en")
        speech_bytes = BytesIO()
        speech.write_to_fp(speech_bytes)

        return first_sentence, speech_bytes.getvalue(), abstract_text.strip()

    except Exception as e:
        # Callers see a plain Exception carrying the message, as before, but
        # the original error (type and traceback) stays reachable via
        # __cause__ for debugging.
        raise Exception(str(e)) from e
56
+
57
# Gradio UI: one PDF upload in; summary sentence, spoken audio, and the
# extracted abstract out. The callback returns three values, so three output
# components are declared (with only two, the abstract would never be shown).
interface = gr.Interface(
    fn=extract_abstract_and_summarize,
    inputs=[gr.File(label="Upload PDF")],
    outputs=[
        gr.Textbox(label="Summary"),
        gr.Audio(),
        gr.Textbox(label="Extracted Abstract"),
    ],
    title="PDF Summarization & Audio Generation Tool",
    description="""PDF Summarization App. This app extracts the abstract from a PDF, summarizes it using the 'pszemraj/led-large-book-summary' model into one sentence summary, and generates an audio of it. Only upload PDFs with abstracts. Example
PDF's are given below, and please click on them to see the summarized text and audio generated. Please read the README.MD for more information about the app.""",
    # Example PDFs are resolved relative to this script's directory.
    examples=[
        [
            os.path.join(
                os.path.dirname(__file__),
                "Article 11 Hidden Technical Debt in Machine Learning Systems.pdf",
            )
        ],
        [
            os.path.join(
                os.path.dirname(__file__),
                "Article 4 Experimental Evidence on the Productivity Effects of Generative Artificial Intelligence.pdf",
            )
        ],
    ],
    cache_examples=True,
)

interface.launch()
requirements (3).txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ transformers
3
+ PyPDF2
4
+ gtts
5
+ torch
6
+ numpy
7
+ pytest
8
+ sphinx
9
+ huggingface-hub
10
+ IPython
11
+ torchvision
12
+ torchaudio
13
+ tensorflow
14
+ flax