Spaces:
Running
on
Zero
Running
on
Zero
import re | |
from bs4 import BeautifulSoup | |
import gradio as gr | |
import nltk | |
import spaces | |
from sumy.nlp.tokenizers import Tokenizer | |
from sumy.parsers.plaintext import PlaintextParser | |
from sumy.summarizers.text_rank import TextRankSummarizer | |
from transformers import PegasusForConditionalGeneration, PegasusTokenizer | |
# Fetch the NLTK 'punkt_tab' resource when this module is imported.
# NOTE(review): presumably required by sumy's Tokenizer('english') used in
# extractive_summarization() — confirm against the sumy/NLTK versions in use.
nltk.download('punkt_tab')
class ModelLoader:
    """Load a fine-tuned Pegasus checkpoint and run abstractive summarization.

    Attributes:
        model_checkpoint: Hub id (or path) passed to ``from_pretrained``.
        device: Device string the model and the tokenized inputs are moved to.
        max_input_tokens: Truncation limit applied to the tokenized prompt.
        model: The loaded ``PegasusForConditionalGeneration`` instance.
        tokenizer: The matching ``PegasusTokenizer``.
    """

    def __init__(
        self,
        model_checkpoint='AryehRotberg/Pegasus-Large-Privacy-Policy-Summarization-V2',
        device='cuda',
        max_input_tokens=1024,
    ):
        """Load model and tokenizer; the defaults reproduce the original setup.

        Args:
            model_checkpoint: Checkpoint to load for both model and tokenizer.
            device: Target device for the model weights and the inputs.
            max_input_tokens: Maximum prompt length in tokens before truncation.
        """
        self.model_checkpoint = model_checkpoint
        self.device = device
        self.max_input_tokens = max_input_tokens
        self.model = PegasusForConditionalGeneration.from_pretrained(
            self.model_checkpoint
        ).to(self.device)
        self.tokenizer = PegasusTokenizer.from_pretrained(self.model_checkpoint)

    def predict(self, text):
        """Return the generated summary for ``text``.

        Args:
            text: Document text to embed in the summarization prompt.

        Returns:
            The first generated sequence, decoded with special tokens removed.
        """
        # Only one sequence is encoded per call, so no padding is requested:
        # padding a lone input up to the truncation limit would only add
        # masked positions for the model to process.
        inputs = self.tokenizer(
            f'Summarize the following document: {text}\nSummary: ',
            truncation=True,
            max_length=self.max_input_tokens,
            return_tensors="pt",
        ).to(self.device)

        outputs = self.model.generate(**inputs)
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
# Single module-level ModelLoader, constructed once at import time and reused
# for every summarization request instead of reloading the checkpoint per call.
model_loader = ModelLoader()
def extractive_summarization(text: str, sentences_count: int = 30) -> str:
    """Reduce ``text`` to its top-ranked sentences using sumy's TextRank.

    Args:
        text: Plain-text document to condense.
        sentences_count: Maximum number of sentences to keep. Defaults to 30,
            the value that was previously hard-coded.

    Returns:
        The selected sentences joined with newlines (an empty string when the
        summarizer selects nothing).
    """
    parser = PlaintextParser.from_string(text, Tokenizer('english'))
    summarizer = TextRankSummarizer()
    selected = summarizer(parser.document, sentences_count=sentences_count)
    return '\n'.join(str(sentence) for sentence in selected)
def clean_text(text):
    """Strip markup from ``text``, condense it extractively, and flatten whitespace.

    Steps, in order:
      1. Parse with BeautifulSoup's ``html.parser`` and keep only the text.
      2. Pass the result through ``extractive_summarization``.
      3. Collapse every run of two or more whitespace characters into a single
         space, trim both ends, and turn any remaining newline into a space.

    Returns:
        The cleaned, single-line string.
    """
    plain = BeautifulSoup(text, 'html.parser').get_text()
    condensed = extractive_summarization(plain)
    collapsed = re.sub(r'\s{2,}', ' ', condensed)
    trimmed = collapsed.strip()
    return trimmed.replace('\n', ' ')
@spaces.GPU
def _abstractive_summary(text):
    """Run Pegasus generation for already-cleaned ``text``.

    Decorated with ``spaces.GPU`` so that, on a ZeroGPU Space, a GPU is
    requested for the duration of this call only.
    """
    return model_loader.predict(text)


def summarize(document):
    """Summarize a privacy policy: clean it, condense it, then run the model.

    Args:
        document: Raw policy text (may contain HTML) from the UI.

    Returns:
        The generated summary, or an empty string when there is nothing to
        summarize (blank input, or input that is empty after cleaning).
    """
    # Blank input: answer immediately without touching the model.
    if not document or not document.strip():
        return ''

    # CPU-only preprocessing stays outside the GPU-decorated helper.
    cleaned = clean_text(document)
    if not cleaned:
        return ''

    return _abstractive_summary(cleaned)
# --- Gradio UI -------------------------------------------------------------
# One text box for the policy text, one for the generated summary.
document_box = gr.Textbox(
    lines=10,
    scale=False,
    placeholder='Enter privacy policy text here...',
    label='Document',
)
summary_box = gr.Textbox(
    lines=10,
    scale=False,
    label='Summarized Privacy Policy',
)

# Wire the boxes to summarize() and start serving as soon as the module runs.
gui = gr.Interface(
    fn=summarize,
    inputs=document_box,
    outputs=summary_box,
    title='Privacy Policy Summarization using Fine-Tuned Pegasus Large Model',
    description='This tool summarizes privacy policies using extractive and abstractive summarization techniques.',
)
gui.launch()