Spaces:
Sleeping
Sleeping
micknikolic
committed on
Commit
•
87f89c1
1
Parent(s):
1bee26e
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# https://huggingface.co/spaces/micknikolic/pdf-abstract-summarizer
|
2 |
+
|
3 |
+
# Here are the imports
|
4 |
+
|
5 |
+
import pdfplumber
|
6 |
+
import torch
|
7 |
+
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
8 |
+
from bert_score import score as bert_score
|
9 |
+
from io import BytesIO
|
10 |
+
from scipy.io.wavfile import write as write_wav
|
11 |
+
import gradio as gr
|
12 |
+
import numpy as np
|
13 |
+
from gtts import gTTS
|
14 |
+
|
15 |
+
# Here is the code
|
16 |
+
|
17 |
+
## Instantiate the model and tokenizer.
|
18 |
+
|
19 |
+
# Hugging Face checkpoint shared by the model and its tokenizer.
_PEGASUS_CHECKPOINT = "UNIST-Eunchan/Research-Paper-Summarization-Pegasus-x-ArXiv"

# Use the GPU when one is present; otherwise stay on the CPU so that importing
# this module does not fail on hosts without CUDA.
_PEGASUS_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

pegasus_research_model = AutoModelForSeq2SeqLM.from_pretrained(_PEGASUS_CHECKPOINT)
pegasus_research_model = pegasus_research_model.to(_PEGASUS_DEVICE)
pegasus_research_tokenizer = AutoTokenizer.from_pretrained(_PEGASUS_CHECKPOINT)
|
22 |
+
|
23 |
+
## Define the helper functions.
|
24 |
+
|
25 |
+
def extract_abstract(uploaded_file):
    """Return the abstract text found in a PDF, or an empty string.

    Pages are scanned in order. On the first page whose extracted text
    contains the word "abstract" (case-insensitive), the function returns the
    slice that starts at that word and stops just before the next occurrence
    of "introduction" on the same page, or at the end of that page's text when
    "introduction" does not follow.

    Args:
        uploaded_file: The PDF, in any form ``pdfplumber.open`` accepts.

    Returns:
        The matched slice of page text (it begins with the word "abstract"
        itself), or ``""`` when no page contains that word.
    """
    with pdfplumber.open(uploaded_file) as document:
        for page in document.pages:
            page_text = page.extract_text(x_tolerance=1, y_tolerance=1)
            if not page_text:
                # Nothing extractable on this page; try the next one.
                continue

            lowered = page_text.lower()
            begin = lowered.find("abstract")
            if begin == -1:
                continue

            # Only look for the end marker after the abstract heading.
            finish = lowered.find("introduction", begin)
            if finish == -1:
                finish = len(page_text)
            return page_text[begin:finish]

    return ""
|
40 |
+
|
41 |
+
def text_chunker(text, tokenizer, max_tokens):
    """Split ``text`` into pieces of at most ``max_tokens`` tokens each.

    The text is encoded once with ``tokenizer.encode``; the resulting token
    ids are cut into consecutive windows of ``max_tokens`` ids (the last
    window may be shorter), and each window is decoded back to a string with
    special tokens skipped.

    Args:
        text: The string to split.
        tokenizer: Object providing ``encode(text)`` and
            ``decode(ids, skip_special_tokens=True)``.
        max_tokens: Window size, in tokens.

    Returns:
        The decoded chunk strings in order; an empty list when the text
        encodes to no tokens.
    """
    token_ids = tokenizer.encode(text)
    # Ceiling division: a trailing partial window still counts as a chunk.
    window_count = -(-len(token_ids) // max_tokens)

    pieces = []
    for window in range(window_count):
        lo = window * max_tokens
        piece_ids = token_ids[lo : lo + max_tokens]
        pieces.append(tokenizer.decode(piece_ids, skip_special_tokens=True))
    return pieces
|
49 |
+
|
50 |
+
def pegasus_research_summarize(text):
    """Summarize ``text`` with the module-level Pegasus model and tokenizer.

    The input is prefixed with "summarize: ", encoded with truncation at 800
    tokens, and passed to ``generate`` (4 beams, 40-150 tokens, length penalty
    0.5, early stopping). The first generated sequence is decoded with special
    tokens skipped.

    Args:
        text: The passage to summarize.

    Returns:
        The decoded summary string.
    """
    input_ids = pegasus_research_tokenizer.encode(
        "summarize: " + text,
        return_tensors="pt",
        max_length=800,
        truncation=True,
    )
    # Follow the model's actual device instead of assuming "cuda", so the
    # inputs and the weights can never end up on different devices.
    input_ids = input_ids.to(pegasus_research_model.device)

    summary_ids = pegasus_research_model.generate(
        input_ids,
        max_length=150,
        min_length=40,
        length_penalty=0.5,
        num_beams=4,
        early_stopping=True,
    )
    return pegasus_research_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
|
65 |
+
|
66 |
+
def select_best_sentence(summary, reference_text):
    """Pick the summary sentence that scores highest against a reference.

    ``summary`` is split on "." into stripped, non-empty sentences. Each one
    is scored against ``reference_text`` with BERTScore (English, rescaled
    with baseline), and the sentence with the highest F1 is selected.

    Args:
        summary: Text whose sentences are the candidates.
        reference_text: Text every candidate is compared with.

    Returns:
        A ``(best_sentence, score_text)`` tuple. ``score_text`` is the best F1
        rounded to two decimals followed by a similarity label, e.g.
        ``"0.50 (Moderate Similarity)"``. When ``summary`` contains no
        sentence the result is ``("", "0.00 (Very Low Similarity)")``.
    """
    sentences = [part.strip() for part in summary.split('.') if part.strip()]
    if not sentences:
        return "", "0.00 (Very Low Similarity)"

    _, _, f1_scores = bert_score(
        sentences,
        [reference_text] * len(sentences),
        lang="en",
        rescale_with_baseline=True,
    )
    best_index = int(np.argmax(f1_scores))
    best_sentence = sentences[best_index]
    # Adding 0.0 turns a rounded -0.0 into 0.0 so it never prints as "-0.00".
    best_f1_score = round(f1_scores[best_index].item(), 2) + 0.0

    # Inclusive upper bound of each band; anything above the last is "Very High".
    bands = (
        (0.20, " (Very Low Similarity)"),
        (0.40, " (Low Similarity)"),
        (0.60, " (Moderate Similarity)"),
        (0.80, " (High Similarity)"),
    )
    score_label = next(
        (label for upper, label in bands if best_f1_score <= upper),
        " (Very High Similarity)",
    )

    # Always two decimals, matching the "0.00 ..." text of the empty case
    # (plain str() formatting would print 0.5 as "0.5").
    return best_sentence, f"{best_f1_score:.2f}{score_label}"
|
93 |
+
|
94 |
+
|
95 |
+
def convert_to_audio(text):
    """Synthesize English speech for ``text`` and return it as raw bytes.

    The audio is written by gTTS into an in-memory buffer, so no file is
    created on disk.

    Args:
        text: The text to speak.

    Returns:
        Everything gTTS wrote to the buffer, as a ``bytes`` object.
    """
    speech = gTTS(text, lang='en')
    with BytesIO() as sink:
        speech.write_to_fp(sink)
        # getvalue() returns the whole buffer regardless of the cursor position.
        return sink.getvalue()
|
103 |
+
|
104 |
+
def pr_recursive_summarize(text, reference_text, recursion_l=0):
    """Summarize ``text`` level by level and return its best-scoring sentence.

    At each level the text is split into chunks of at most 800 tokens, every
    chunk is summarized with ``pegasus_research_summarize``, and the chunk
    summaries are joined with spaces. While the joined summary is longer than
    50 tokens and no more than 10 levels have run, the function recurses on
    the joined summary. Otherwise the joined summary (summarized once more if
    this level had several chunks) is split into sentences, each sentence is
    scored against ``reference_text`` with BERTScore, and the sentence with
    the highest F1 is returned.

    Progress is printed to stdout at every level.

    Args:
        text: The text to summarize at this level.
        reference_text: Text the final sentences are scored against.
        recursion_l: Number of levels already completed (0 for the first call).

    Returns:
        ``(best_sentence, best_f1_score)`` where the score is a float, or
        ``(None, 0.0)`` when the final summary contains no sentence.
    """
    recursion_level = recursion_l + 1
    print(f"Pegasus Research level: {recursion_level}\n")

    chunks = text_chunker(text, pegasus_research_tokenizer, max_tokens=800)
    print(f"Number of chunks: {len(chunks)}")

    summaries = []
    for i, chunk in enumerate(chunks, 1):
        print(f"Chunk no.{i}:")
        print(chunk, "\n")
        summary = pegasus_research_summarize(chunk)
        print("Summary:", summary)
        summaries.append(summary)
        print("_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_")
        # Release cached GPU memory between chunks.
        torch.cuda.empty_cache()

    concatenated_summary = ' '.join(summaries)
    summary_tokens = pegasus_research_tokenizer.tokenize(concatenated_summary)

    # NOTE(review): pegasus_research_summarize generates at least 40 tokens,
    # so the 50-token threshold may be exceeded on most levels and the depth
    # cap is then what ends the recursion -- confirm this is intended.
    if len(summary_tokens) > 50 and recursion_level <= 10:
        print("Recursive")
        return pr_recursive_summarize(concatenated_summary, reference_text, recursion_level)

    final_summary = concatenated_summary
    if len(chunks) > 1:
        # Several chunk summaries were joined; condense them into one summary.
        final_summary = pegasus_research_summarize(concatenated_summary)

    sentences = [part.strip() for part in final_summary.split(".") if part.strip()]
    if not sentences:
        return None, 0.0

    _, _, f1_scores = bert_score(sentences, [reference_text] * len(sentences), lang="en")
    best_index = int(np.argmax(f1_scores))
    return sentences[best_index], f1_scores[best_index].item()
|
147 |
+
|
148 |
+
def _similarity_label(f1_score):
    """Return the similarity label (with leading space) for a BERTScore F1."""
    # Same bands as select_best_sentence: compare the two-decimal value.
    # NOTE(review): pr_recursive_summarize calls bert_score without
    # rescale_with_baseline, whereas select_best_sentence applies these bands
    # to rescaled scores -- confirm the bands suit the unrescaled values.
    rounded = round(f1_score, 2)
    if rounded <= 0.20:
        return " (Very Low Similarity)"
    if rounded <= 0.40:
        return " (Low Similarity)"
    if rounded <= 0.60:
        return " (Moderate Similarity)"
    if rounded <= 0.80:
        return " (High Similarity)"
    return " (Very High Similarity)"


def summarize_and_convert_to_audio(pdf_file):
    """Gradio callback: summarize a PDF's abstract and speak the best sentence.

    The abstract is extracted from ``pdf_file``, summarized with
    ``pr_recursive_summarize`` (scored against the abstract itself), and the
    best sentence is converted to speech.

    Args:
        pdf_file: The uploaded PDF as delivered by the ``gr.File`` input.

    Returns:
        ``(audio, text, score_text)`` in the order of the interface outputs
        (Audio, "Summary sentence", "Bert F1-Score"). ``audio`` is ``None``
        and ``text`` carries an explanatory message when no abstract is found
        or no summary sentence is produced.
    """
    abstract_text = extract_abstract(pdf_file)
    if not abstract_text:
        # The message belongs in the text slot; the audio slot must be None.
        return (
            None,
            "No 'abstract' section found in the uploaded PDF. Please upload a different PDF.",
            "0.00 (Very Low Similarity)",
        )

    best_sentence, best_f1_score = pr_recursive_summarize(abstract_text, abstract_text)
    if not best_sentence:
        # pr_recursive_summarize returns (None, 0.0) when nothing could be
        # extracted; there is no text to hand to the speech synthesizer.
        return (
            None,
            "No summary sentence could be produced from the abstract.",
            "0.00 (Very Low Similarity)",
        )

    audio_bytes = convert_to_audio(best_sentence)
    # Derive the label from the score instead of always reporting "Very High".
    score_text = f"{best_f1_score:.2f}{_similarity_label(best_f1_score)}"
    return audio_bytes, best_sentence, score_text
|
157 |
+
|
158 |
+
## Build the Gradio UI.
|
159 |
+
|
160 |
+
# Web front end: one PDF upload in; an audio clip and two text boxes out.
iface = gr.Interface(
    title="PDF Abstract Summarizer and Audio Converter",
    description="Upload a PDF file to extract and summarize its 'abstract' section. The best summary sentence based on its Bert F1-score will be converted into speech and the score's interpretation will be displayed.",
    fn=summarize_and_convert_to_audio,
    inputs=gr.File(label="Upload PDF"),
    outputs=[
        gr.Audio(label="Audio"),
        gr.Textbox(label="Summary sentence"),
        gr.Textbox(label="Bert F1-Score"),
    ],
)

# Start serving the interface as soon as the module is executed.
iface.launch()
|