"""Gradio app that summarizes privacy policies in two stages:

1. Extractive pass (sumy TextRank) to shrink long documents to ~30 sentences.
2. Abstractive pass with a fine-tuned Pegasus-Large model on GPU.
"""

import re

from bs4 import BeautifulSoup
import gradio as gr
import nltk
import spaces
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.text_rank import TextRankSummarizer
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# Sentence tokenizer data required by sumy's English Tokenizer.
nltk.download('punkt_tab')


class ModelLoader:
    """Holds the fine-tuned Pegasus model and tokenizer on the GPU."""

    def __init__(self) -> None:
        self.model_checkpoint = 'AryehRotberg/Pegasus-Large-Privacy-Policy-Summarization-V2'
        # NOTE(review): hard-codes 'cuda' — assumes a GPU is always present
        # (plausible under HF Spaces @spaces.GPU, but confirm for local runs).
        self.model = PegasusForConditionalGeneration.from_pretrained(self.model_checkpoint).to('cuda')
        self.tokenizer = PegasusTokenizer.from_pretrained(self.model_checkpoint)

    def predict(self, text: str) -> str:
        """Generate an abstractive summary of ``text``.

        The input is wrapped in the instruction prompt the model was
        fine-tuned with, truncated/padded to 1024 tokens, and decoded
        with special tokens stripped.
        """
        inputs = self.tokenizer(
            f'Summarize the following document: {text}\nSummary: ',
            padding="max_length",
            truncation=True,
            max_length=1024,
            return_tensors="pt",
        ).to('cuda')
        # Uses the model's default generation settings (greedy/beam per config).
        outputs = self.model.generate(**inputs)
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)


# Instantiated at import so the model is downloaded/loaded once, not per request.
model_loader = ModelLoader()


def extractive_summarization(text: str) -> str:
    """Reduce ``text`` to its 30 highest-ranked sentences via TextRank,
    joined with newlines."""
    parser = PlaintextParser.from_string(text, Tokenizer('english'))
    summarizer = TextRankSummarizer()
    summary = summarizer(parser.document, sentences_count=30)
    return '\n'.join(str(sentence) for sentence in summary)


def clean_text(text: str) -> str:
    """Strip HTML, extractively pre-summarize, and normalize whitespace
    so the result fits the abstractive model's input budget."""
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = extractive_summarization(text)
    # Collapse runs of whitespace, then flatten the newline-joined sentences.
    return re.sub(r'\s{2,}', ' ', text).strip().replace('\n', ' ')


@spaces.GPU
def summarize(document: str) -> str:
    """Full pipeline: clean/pre-summarize ``document``, then run Pegasus."""
    return model_loader.predict(clean_text(document))


gui = gr.Interface(
    fn=summarize,
    # NOTE(review): gr.Textbox's `scale` expects an int; `scale=False` is
    # kept as-is to preserve behavior, but likely should be removed.
    inputs=gr.Textbox(lines=10, scale=False, placeholder='Enter privacy policy text here...', label='Document'),
    outputs=gr.Textbox(lines=10, scale=False, label='Summarized Privacy Policy'),
    title='Privacy Policy Summarization using Fine-Tuned Pegasus Large Model',
    description='This tool summarizes privacy policies using extractive and abstractive summarization techniques.'
)

# Guarded so importing this module (e.g. for tests) does not start the server.
if __name__ == '__main__':
    gui.launch()