AryehRotberg's picture
Modified label in input textbox.
44355f6
import re
from bs4 import BeautifulSoup
import gradio as gr
import nltk
import spaces
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.text_rank import TextRankSummarizer
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
# Fetch the NLTK 'punkt_tab' resource when the module is imported.
# NOTE(review): presumably needed by sumy's Tokenizer('english') for sentence
# splitting in extractive_summarization() — confirm.
nltk.download('punkt_tab')
class ModelLoader:
    """Load a fine-tuned Pegasus checkpoint and generate summaries with it.

    The defaults reproduce the previously hard-coded configuration (the
    privacy-policy checkpoint placed on the ``'cuda'`` device). Both values
    can be overridden, e.g. ``ModelLoader(device='cpu')``.
    """

    DEFAULT_CHECKPOINT = 'AryehRotberg/Pegasus-Large-Privacy-Policy-Summarization-V2'

    def __init__(self, model_checkpoint=DEFAULT_CHECKPOINT, device='cuda'):
        """Load the model and tokenizer for *model_checkpoint*.

        Args:
            model_checkpoint: Identifier passed to ``from_pretrained`` for both
                the model and the tokenizer.
            device: Device the model is moved to; also used for the tokenized
                inputs in :meth:`predict` so the two always agree.
        """
        self.model_checkpoint = model_checkpoint
        self.device = device
        self.model = PegasusForConditionalGeneration.from_pretrained(self.model_checkpoint).to(self.device)
        self.tokenizer = PegasusTokenizer.from_pretrained(self.model_checkpoint)

    def predict(self, text, max_length=1024):
        """Return the generated summary for *text*.

        Args:
            text: Document to summarize; it is embedded in the instruction
                prompt before tokenization.
            max_length: Value forwarded to the tokenizer as ``max_length``
                (used together with ``padding="max_length"`` and
                ``truncation=True``).

        Returns:
            The first generated sequence, decoded with special tokens skipped.
        """
        inputs = self.tokenizer(
            f'Summarize the following document: {text}\nSummary: ',
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        ).to(self.device)  # inputs must live on the same device as the model
        # No generation arguments are passed besides the encoded inputs.
        outputs = self.model.generate(**inputs)
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
# Single module-level instance, created at import time: ModelLoader.__init__
# loads the checkpoint and tokenizer, and summarize() reuses this object for
# every request instead of reloading the model.
model_loader = ModelLoader()
def extractive_summarization(text: str) -> str:
    """Select sentences from *text* with sumy's TextRank summarizer.

    The text is parsed as plain text using the English tokenizer, and the
    summarizer is asked for 30 sentences.

    Returns:
        The selected sentences, each converted to ``str`` and joined with
        newline characters.
    """
    document = PlaintextParser.from_string(text, Tokenizer('english')).document
    text_rank = TextRankSummarizer()
    selected = text_rank(document, sentences_count=30)
    return '\n'.join(map(str, selected))
def clean_text(text):
    """Strip markup from *text*, shorten it extractively, and flatten whitespace.

    Steps, in order:
      1. Parse with BeautifulSoup's ``html.parser`` and keep only the text.
      2. Reduce the text with ``extractive_summarization``.
      3. Replace every run of two or more whitespace characters with a single
         space, trim both ends, then turn remaining newlines into spaces.

    Returns:
        The cleaned string.
    """
    plain = BeautifulSoup(text, 'html.parser').get_text()
    condensed = extractive_summarization(plain)
    collapsed = re.sub(r'\s{2,}', ' ', condensed)
    trimmed = collapsed.strip()
    # Single newlines survive the regex above (it only matches runs of 2+).
    return trimmed.replace('\n', ' ')
@spaces.GPU
def summarize(document):
    """Summarize a privacy policy: clean it, then run the Pegasus model on it.

    NOTE(review): ``spaces.GPU`` presumably requests GPU hardware for the
    duration of the call on Hugging Face ZeroGPU Spaces — confirm.

    Args:
        document: Raw text (optionally containing HTML) entered by the user.

    Returns:
        The generated summary, or an empty string when there is nothing to
        summarize (missing/blank input, or input that cleans to nothing).
    """
    # Blank input would otherwise be sent to the model as a bare prompt,
    # producing a made-up "summary" of a document that does not exist.
    if not document or not document.strip():
        return ''

    cleaned = clean_text(document)
    # Markup-only input can clean to an empty string; same reasoning applies.
    if not cleaned:
        return ''

    return model_loader.predict(cleaned)
# Widgets for the single-input / single-output interface.
document_box = gr.Textbox(
    label='Document',
    placeholder='Enter privacy policy text here...',
    lines=10,
    scale=False,
)
summary_box = gr.Textbox(
    label='Summarized Privacy Policy',
    lines=10,
    scale=False,
)

# Wire summarize() between the two text boxes and start the app.
gui = gr.Interface(
    fn=summarize,
    inputs=document_box,
    outputs=summary_box,
    title='Privacy Policy Summarization using Fine-Tuned Pegasus Large Model',
    description='This tool summarizes privacy policies using extractive and abstractive summarization techniques.',
)

gui.launch()