micknikolic committed
Commit 87f89c1
1 Parent(s): 1bee26e

Create app.py

Files changed (1): app.py +172 -0
app.py ADDED
@@ -0,0 +1,172 @@
# https://huggingface.co/spaces/micknikolic/pdf-abstract-summarizer

# Here are the imports

import pdfplumber
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from bert_score import score as bert_score
from io import BytesIO
from scipy.io.wavfile import write as write_wav
import gradio as gr
import numpy as np
from gtts import gTTS

# Here is the code

## Instantiating model and tokenizer.

pegasus_research_model = AutoModelForSeq2SeqLM.from_pretrained("UNIST-Eunchan/Research-Paper-Summarization-Pegasus-x-ArXiv")
pegasus_research_model = pegasus_research_model.to("cuda")
pegasus_research_tokenizer = AutoTokenizer.from_pretrained("UNIST-Eunchan/Research-Paper-Summarization-Pegasus-x-ArXiv")
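
# Note: moving the model to "cuda" assumes the Space runs on GPU hardware;
# on a CPU-only instance the .to("cuda") calls would need to target "cpu" instead.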

## Defining functions.

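# extract_abstract pulls the abstract out of the uploaded PDF with a simple heuristic:
# on the first page whose text contains "abstract", it keeps everything from that word
# up to the following "introduction" (or to the end of the page if none is found).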
def extract_abstract(uploaded_file):
    with pdfplumber.open(uploaded_file) as pdf:
        abstract = ""
        for page in pdf.pages:
            text = page.extract_text(x_tolerance=1, y_tolerance=1)
            if text:
                text_lower = text.lower()
                if "abstract" in text_lower:
                    start_index = text_lower.find("abstract")
                    end_index = text_lower.find("introduction", start_index)
                    if end_index == -1:
                        end_index = len(text)
                    abstract = text[start_index:end_index]
                    break
    return abstract

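# text_chunker splits a long text into pieces that fit the model's input window by
# encoding it to token ids and slicing those ids into groups of at most max_tokens.
# For example, a 1,700-token text with max_tokens=800 yields three chunks
# (800 + 800 + 100 tokens), each decoded back to plain text.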
def text_chunker(text, tokenizer, max_tokens):
    tokens = tokenizer.encode(text)
    num_chunks = len(tokens) // max_tokens + (len(tokens) % max_tokens > 0)
    chunked_tokens = [
        tokens[i * max_tokens : (i + 1) * max_tokens] for i in range(num_chunks)
    ]
    chunked_text = [tokenizer.decode(chunk, skip_special_tokens=True) for chunk in chunked_tokens]
    return chunked_text

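# pegasus_research_summarize runs one beam-search generation (4 beams, 40-150 tokens)
# over a single chunk. The "summarize: " prefix is a T5-style convention; the Pegasus
# model does not require a task prefix, so here it is simply part of the input text.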
def pegasus_research_summarize(text):
    inputs = pegasus_research_tokenizer.encode("summarize: " + text,
                                               return_tensors="pt",
                                               max_length=800,
                                               truncation=True)
    summary_ids = pegasus_research_model.generate(inputs.to("cuda"),
                                                  max_length=150,
                                                  min_length=40,
                                                  length_penalty=0.5,
                                                  num_beams=4,
                                                  early_stopping=True
                                                  )
    summary = pegasus_research_tokenizer.decode(summary_ids[0],
                                                skip_special_tokens=True)
    return summary

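# select_best_sentence scores every sentence of a summary against the reference text
# with BERTScore (rescaled with its baseline) and returns the best sentence together
# with a labelled F1 value. It is kept as a utility and is not called in the Gradio
# pipeline below, which relies on pr_recursive_summarize instead.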
def select_best_sentence(summary, reference_text):
    sentences = summary.split('.')
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]

    if not sentences:
        return "", "0.00 (Very Low Similarity)"

    _, _, f1_scores = bert_score(sentences, [reference_text] * len(sentences), lang="en", rescale_with_baseline=True)
    best_sentence_index = np.argmax(f1_scores)
    best_sentence = sentences[best_sentence_index]
    best_f1_score = round(f1_scores[best_sentence_index].item(), 2)

    score_label = ""
    if best_f1_score <= 0.20:
        score_label = " (Very Low Similarity)"
    elif best_f1_score <= 0.40:
        score_label = " (Low Similarity)"
    elif best_f1_score <= 0.60:
        score_label = " (Moderate Similarity)"
    elif best_f1_score <= 0.80:
        score_label = " (High Similarity)"
    else:
        score_label = " (Very High Similarity)"

    best_f1_score_with_label = f"{best_f1_score}{score_label}"

    return best_sentence, best_f1_score_with_label

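# convert_to_audio synthesizes speech with gTTS and returns the MP3 bytes from an
# in-memory buffer. Depending on the Gradio version, the Audio output component may
# expect a file path or a (sample_rate, numpy array) tuple rather than raw bytes.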
def convert_to_audio(text):
    tts = gTTS(text, lang='en')
    buffer = BytesIO()
    tts.write_to_fp(buffer)
    buffer.seek(0)
    audio_bytes = buffer.read()

    return audio_bytes

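# pr_recursive_summarize chunks the input, summarizes each chunk, and concatenates the
# partial summaries. If the result is still longer than 50 tokens (and recursion depth
# is at most 10) it recurses on the concatenation; otherwise it picks the sentence with
# the highest raw BERTScore F1 against the reference text and returns it with that score.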
def pr_recursive_summarize(text, reference_text, recursion_l=0):
    recursion_level = recursion_l + 1
    print(f"Pegasus Research level: {recursion_level}\n")
    tokens = pegasus_research_tokenizer.tokenize(text)
    expectedCountOfChunks = max(len(tokens) / 150, 1)
    max_length = int(len(tokens) / expectedCountOfChunks) + 2

    chunks = text_chunker(text, pegasus_research_tokenizer, max_tokens=800)
    print(f"Number of chunks: {len(chunks)}")

    summaries = []
    for i, chunk in enumerate(chunks, 1):
        print(f"Chunk no.{i}:")
        print(chunk, "\n")
        summary = pegasus_research_summarize(chunk)
        print("Summary:", summary)
        summaries.append(summary)
        print("_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_")
        torch.cuda.empty_cache()

    concatenated_summary = ' '.join(summaries)
    tokens = pegasus_research_tokenizer.tokenize(concatenated_summary)

    if len(tokens) > 50 and recursion_level <= 10:
        print("Recursive")
        return pr_recursive_summarize(concatenated_summary, reference_text, recursion_level)
    else:
        final_summary = concatenated_summary
        if len(chunks) > 1:
            final_summary = pegasus_research_summarize(concatenated_summary)

        sentences = final_summary.split(".")
        sentences = [s.strip() for s in sentences if s.strip()]

        if not sentences:
            return None, 0.0

        p, r, f1_scores = bert_score(sentences, [reference_text]*len(sentences), lang="en")
        best_sentence_index = np.argmax(f1_scores)
        best_sentence = sentences[best_sentence_index]
        best_f1_score = f1_scores[best_sentence_index].item()

        return best_sentence, best_f1_score

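# summarize_and_convert_to_audio ties the pipeline together: extract the abstract,
# summarize it recursively (using the abstract itself as the BERTScore reference),
# and synthesize the best sentence as audio. Note that the "(Very High Similarity)"
# label in the returned score string is fixed text here, unlike the threshold-based
# labels produced by select_best_sentence.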
def summarize_and_convert_to_audio(pdf_file):
    abstract_text = extract_abstract(pdf_file)
    if not abstract_text:
        return None, "No 'abstract' section found in the uploaded PDF. Please upload a different PDF.", "0.00 (Very Low Similarity)"

    best_sentence, best_f1_score = pr_recursive_summarize(abstract_text, abstract_text)
    audio_bytes = convert_to_audio(best_sentence)

    return audio_bytes, best_sentence, f"{best_f1_score:.2f} (Very High Similarity)"

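# The Gradio UI below exposes three outputs; their order (audio, summary sentence,
# score) must match the return order of summarize_and_convert_to_audio.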
## Building the Gradio UI.

iface = gr.Interface(
    fn=summarize_and_convert_to_audio,
    inputs=gr.File(label="Upload PDF"),
    outputs=[
        gr.Audio(label="Audio"),
        gr.Textbox(label="Summary sentence"),
        gr.Textbox(label="Bert F1-Score")
    ],
    title="PDF Abstract Summarizer and Audio Converter",
    description="Upload a PDF file to extract and summarize its 'abstract' section. The best summary sentence based on its Bert F1-score will be converted into speech and the score's interpretation will be displayed."
)

iface.launch()
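
# A requirements.txt listing pdfplumber, torch, transformers, bert-score, scipy,
# gradio, gTTS and numpy is assumed so the Space can install these dependencies.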