# Hugging Face Spaces status: Sleeping
# Source: https://huggingface.co/spaces/micknikolic/pdf-abstract-summarizer
# Imports.
import pdfplumber | |
import torch | |
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer | |
from bert_score import score as bert_score | |
from io import BytesIO | |
from scipy.io.wavfile import write as write_wav | |
import gradio as gr | |
import numpy as np | |
from gtts import gTTS | |
# Application code.
## Instantiating the model and tokenizer.
# One checkpoint id shared by the model and its tokenizer so the two cannot
# drift apart.
PEGASUS_RESEARCH_CHECKPOINT = "UNIST-Eunchan/Research-Paper-Summarization-Pegasus-x-ArXiv"

# Prefer the GPU, but fall back to the CPU so the app still starts on
# hardware without CUDA instead of failing while the module is loading.
PEGASUS_RESEARCH_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

pegasus_research_model = AutoModelForSeq2SeqLM.from_pretrained(PEGASUS_RESEARCH_CHECKPOINT)
# Code that feeds tensors to the model should move them to
# `pegasus_research_model.device` so it follows this placement.
pegasus_research_model = pegasus_research_model.to(PEGASUS_RESEARCH_DEVICE)
pegasus_research_tokenizer = AutoTokenizer.from_pretrained(PEGASUS_RESEARCH_CHECKPOINT)
## Defining functions.
def extract_abstract(uploaded_file):
    """Return the abstract section of a PDF, or "" when none is found.

    Pages are scanned in order. On the first page whose extracted text
    contains the word "abstract" (case-insensitive), the text from that word
    up to the next "introduction" on the same page is returned; when the page
    has no "introduction" after it, everything to the end of the page is
    returned. The result starts with the matched "abstract" word itself.

    Args:
        uploaded_file: The PDF to read; passed straight to
            ``pdfplumber.open``.

    Returns:
        The extracted text, or an empty string if no page contains
        "abstract".
    """
    # Function-scope import so this block does not depend on the file's
    # import section.
    import re

    # Search the original text case-insensitively instead of searching a
    # lower-cased copy: str.lower() can change a string's length (for example
    # "İ" lowers to two code points), which would make indices found in the
    # copy point at the wrong place in the original. re.ASCII restricts the
    # case folding to ASCII letters, which matches these two ASCII keywords
    # exactly as the lower-cased search did.
    flags = re.IGNORECASE | re.ASCII
    abstract_pattern = re.compile("abstract", flags)
    introduction_pattern = re.compile("introduction", flags)

    with pdfplumber.open(uploaded_file) as pdf:
        for page in pdf.pages:
            text = page.extract_text(x_tolerance=1, y_tolerance=1)
            if not text:
                # Pages without extractable text cannot hold the abstract.
                continue
            start_match = abstract_pattern.search(text)
            if start_match is None:
                continue
            start_index = start_match.start()
            end_match = introduction_pattern.search(text, start_index)
            end_index = end_match.start() if end_match else len(text)
            # Only the first page mentioning "abstract" is used.
            return text[start_index:end_index]
    return ""
def text_chunker(text, tokenizer, max_tokens):
    """Split *text* into pieces of at most *max_tokens* tokens each.

    The text is tokenized once, the token sequence is cut into consecutive
    windows of ``max_tokens`` tokens (the last window may be shorter), and
    every window is decoded back to a string.

    Args:
        text: The text to split.
        tokenizer: Object providing ``encode(text, add_special_tokens=...)``
            and ``decode(ids, skip_special_tokens=...)``, as Hugging Face
            tokenizers do.
        max_tokens: Maximum number of tokens per chunk; must be at least 1.

    Returns:
        A list of decoded chunk strings, in order. Empty when *text*
        produces no tokens.

    Raises:
        ValueError: If *max_tokens* is smaller than 1.
    """
    if max_tokens < 1:
        raise ValueError("max_tokens must be a positive integer")

    # Encode without special tokens. Otherwise a trailing end-of-sequence
    # marker counts as a token, and a text that exactly fills its windows
    # gains one extra chunk that decodes to an empty string.
    tokens = tokenizer.encode(text, add_special_tokens=False)

    return [
        tokenizer.decode(tokens[start:start + max_tokens], skip_special_tokens=True)
        for start in range(0, len(tokens), max_tokens)
    ]
def pegasus_research_summarize(text):
    """Summarize *text* with the module-level Pegasus model and tokenizer.

    The input is prefixed with "summarize: ", tokenized and truncated to at
    most 800 tokens, and a summary of 40 to 150 tokens is generated with
    4-beam search.

    Args:
        text: The text to summarize.

    Returns:
        The decoded summary string, with special tokens removed.
    """
    input_ids = pegasus_research_tokenizer.encode(
        "summarize: " + text,
        return_tensors="pt",
        max_length=800,
        truncation=True,
    )
    # Follow the model to whatever device it lives on rather than assuming
    # "cuda", so the function keeps working if the model is placed elsewhere.
    input_ids = input_ids.to(pegasus_research_model.device)

    summary_ids = pegasus_research_model.generate(
        input_ids,
        max_length=150,
        min_length=40,
        length_penalty=0.5,
        num_beams=4,
        early_stopping=True,
    )
    # generate() returns a batch; there is exactly one input sequence.
    return pegasus_research_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
def select_best_sentence(summary, reference_text):
    """Pick the summary sentence that best matches *reference_text*.

    The summary is split on "." into stripped, non-empty sentences. Each one
    is scored against the reference with BERTScore (baseline-rescaled F1),
    and the highest-scoring sentence is returned together with its score,
    rounded to two decimals and followed by a similarity label.

    Args:
        summary: Summary text to split into candidate sentences.
        reference_text: Text every candidate is compared against.

    Returns:
        A ``(sentence, "score (label)")`` tuple. When the summary holds no
        sentence, ``("", "0.00 (Very Low Similarity)")``.
    """
    candidates = [part.strip() for part in summary.split('.') if part.strip()]
    if not candidates:
        return "", "0.00 (Very Low Similarity)"

    references = [reference_text] * len(candidates)
    _, _, f1_scores = bert_score(candidates, references, lang="en", rescale_with_baseline=True)

    winner = np.argmax(f1_scores)
    top_score = round(f1_scores[winner].item(), 2)

    # (inclusive upper bound, label) pairs, checked from lowest to highest;
    # anything above the last bound falls through to the default label.
    bands = (
        (0.20, " (Very Low Similarity)"),
        (0.40, " (Low Similarity)"),
        (0.60, " (Moderate Similarity)"),
        (0.80, " (High Similarity)"),
    )
    label = next(
        (name for bound, name in bands if top_score <= bound),
        " (Very High Similarity)",
    )
    return candidates[winner], f"{top_score}{label}"
def convert_to_audio(text):
    """Synthesize English speech for *text* with gTTS.

    Args:
        text: The text to speak.

    Returns:
        The bytes gTTS wrote for the synthesized audio.
    """
    speech = gTTS(text, lang='en')
    with BytesIO() as stream:
        speech.write_to_fp(stream)
        # getvalue() yields the whole buffer regardless of the stream position.
        return stream.getvalue()
def pr_recursive_summarize(text, reference_text, recursion_l=0):
    """Summarize *text* chunk by chunk, recursing while the result is long.

    The text is split into chunks of at most 800 tokens, each chunk is
    summarized, and the summaries are joined with spaces. While the joined
    summary is longer than 50 tokens and the recursion level is at most 10,
    the function calls itself on the joined summary. Otherwise the joined
    summary (re-summarized once if more than one chunk was involved) is split
    into sentences and the sentence with the highest BERTScore F1 against
    *reference_text* is selected.

    Progress is printed to stdout at every level.

    Args:
        text: The text to summarize.
        reference_text: Text the candidate sentences are scored against.
        recursion_l: Level of the calling invocation; 0 for the initial call.

    Returns:
        A ``(best_sentence, best_f1_score)`` tuple, or ``(None, 0.0)`` when
        the final summary contains no sentence.
    """
    recursion_level = recursion_l + 1
    print(f"Pegasus Research level: {recursion_level}\n")

    chunks = text_chunker(text, pegasus_research_tokenizer, max_tokens=800)
    print(f"Number of chunks: {len(chunks)}")

    summaries = []
    for i, chunk in enumerate(chunks, 1):
        print(f"Chunk no.{i}:")
        print(chunk, "\n")
        summary = pegasus_research_summarize(chunk)
        print("Summary:", summary)
        summaries.append(summary)
        print("_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_")
        # Release cached GPU memory before the next chunk is processed.
        torch.cuda.empty_cache()

    concatenated_summary = ' '.join(summaries)
    summary_tokens = pegasus_research_tokenizer.tokenize(concatenated_summary)

    # NOTE(review): this test can keep recursing on a single short chunk until
    # the level cap is hit, re-summarizing a summary each time - confirm that
    # is intended.
    if len(summary_tokens) > 50 and recursion_level <= 10:
        print("Recursive")
        return pr_recursive_summarize(concatenated_summary, reference_text, recursion_level)

    final_summary = concatenated_summary
    if len(chunks) > 1:
        # Several chunk summaries were stitched together; condense them once more.
        final_summary = pegasus_research_summarize(concatenated_summary)

    sentences = [s.strip() for s in final_summary.split(".") if s.strip()]
    if not sentences:
        return None, 0.0

    _, _, f1_scores = bert_score(sentences, [reference_text] * len(sentences), lang="en")
    best_sentence_index = int(np.argmax(f1_scores))
    return sentences[best_sentence_index], f1_scores[best_sentence_index].item()
def _similarity_label(f1_score):
    """Map a BERTScore F1 value to one of the five similarity band names."""
    if f1_score <= 0.20:
        return "Very Low Similarity"
    if f1_score <= 0.40:
        return "Low Similarity"
    if f1_score <= 0.60:
        return "Moderate Similarity"
    if f1_score <= 0.80:
        return "High Similarity"
    return "Very High Similarity"


def summarize_and_convert_to_audio(pdf_file):
    """Summarize a PDF's abstract and turn the best sentence into speech.

    Every return value follows the order of the interface outputs: audio,
    summary sentence text, score text.

    Args:
        pdf_file: The uploaded PDF, as handed over by the Gradio file input.

    Returns:
        An ``(audio, text, score)`` tuple. On success: the synthesized audio
        bytes, the selected sentence, and the F1 score formatted to two
        decimals with its similarity label. When no abstract is found or no
        sentence can be produced: ``None`` for the audio, an explanatory
        message as the text, and ``"0.00 (Very Low Similarity)"``.
    """
    abstract_text = extract_abstract(pdf_file)
    if not abstract_text:
        # The message belongs in the text slot; the audio slot gets None.
        return (
            None,
            "No 'abstract' section found in the uploaded PDF. Please upload a different PDF.",
            "0.00 (Very Low Similarity)",
        )

    best_sentence, best_f1_score = pr_recursive_summarize(abstract_text, abstract_text)
    if not best_sentence:
        # The summarizer reports "no sentence" as None; there is nothing to speak.
        return (
            None,
            "No summary sentence could be produced from the abstract.",
            "0.00 (Very Low Similarity)",
        )

    audio_bytes = convert_to_audio(best_sentence)
    # Derive the label from the score instead of always claiming the top band.
    # NOTE(review): the score from pr_recursive_summarize is computed without
    # baseline rescaling, so it may sit high in these bands - confirm whether
    # rescaling is wanted.
    score_text = f"{best_f1_score:.2f} ({_similarity_label(best_f1_score)})"
    return audio_bytes, best_sentence, score_text
## Building the Gradio UI.
# Single-function Gradio app: one PDF upload in; an audio player and two text
# boxes (summary sentence, score) out.
iface = gr.Interface(
    title="PDF Abstract Summarizer and Audio Converter",
    description=(
        "Upload a PDF file to extract and summarize its 'abstract' section. "
        "The best summary sentence based on its Bert F1-score will be converted "
        "into speech and the score's interpretation will be displayed. "
        "(A PDF file needs to contain the Abstract section.)"
    ),
    fn=summarize_and_convert_to_audio,
    inputs=gr.File(label="Upload PDF"),
    outputs=[
        gr.Audio(label="Audio"),
        gr.Textbox(label="Summary sentence"),
        gr.Textbox(label="Bert F1-Score"),
    ],
)

# Start serving the interface.
iface.launch()