# summarizer / app.py
# arithescientist's picture
# Update app.py
# 9455659
# raw
# history blame
# 2.65 kB
import gradio as gr
import numpy as np
import pytesseract as pt
import pdf2image
from fpdf import FPDF
import re
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import os
import pdfkit
import yake
from summarizer import Summarizer,TransformerSummarizer
from transformers import pipelines
nltk.download('punkt')
from transformers import AutoTokenizer, AutoModelForPreTraining, AutoConfig, AutoModel
# Checkpoint used for extractive summarization. Alternatives that were tried:
#   'distilbert-base-uncased', 'laxya007/gpt2_legal', 'facebook/bart-large-cnn'
model_name = 'nlpaueb/legal-bert-base-uncased'

# Load the Hugging Face configuration with hidden states exposed, then build
# the tokenizer and model from the same checkpoint.
custom_config = AutoConfig.from_pretrained(model_name, output_hidden_states=True)
custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
custom_model = AutoModel.from_pretrained(model_name, config=custom_config)

# Wrap the model/tokenizer pair in the summarizer used by the app.
bert_legal_model = Summarizer(
    custom_model=custom_model,
    custom_tokenizer=custom_tokenizer,
)
print('Using model {}\n'.format(model_name))
def lincoln(content=""):
    """Summarize ``content`` with the module-level ``bert_legal_model``.

    The text is split into paragraphs on blank lines. Every paragraph that
    contains more than one purely alphabetic token is reduced to those tokens
    and summarized on its own (ratio 0.01). The whole document is summarized
    as well (ratio 0.1) and placed first in the result, followed by the
    per-paragraph summaries under the "Larger Summary" heading.

    Args:
        content: Raw input text; paragraphs are separated by blank lines.

    Returns:
        The overall summary followed by the per-paragraph summaries, or an
        empty string when ``content`` is empty or whitespace-only.
    """
    # Nothing to summarize: avoid calling the model on empty input.
    if not content or not content.strip():
        return ""

    paragraph_summaries = []
    for paragraph in content.split("\n\n"):
        # Turn newlines into spaces, drop tabs and collapse repeated whitespace.
        paragraph = paragraph.replace('\n', ' ').replace('\t', '')
        paragraph = ' '.join(paragraph.split())

        # Keep only purely alphabetic tokens; skip paragraphs that have at
        # most one such token (empty lines, stray numbers, punctuation).
        tokens = [word for word in word_tokenize(paragraph) if word.isalpha()]
        if len(tokens) <= 1:
            continue

        # Rebuild the paragraph from the alphabetic tokens only.
        text = ' '.join(tokens)
        print("\nParagraph:")
        print(text + "\n")

        # Summarize this paragraph (not the whole document) so each entry of
        # the larger summary reflects its own paragraph.
        summary = bert_legal_model(text, ratio=0.01)
        paragraph_summaries.append(str(summary) + "\n\n")
        print("Summary:")
        print(summary)

    # Overall summary of the complete, unmodified input.
    overall_summary = bert_legal_model(content, ratio=0.1)
    return (
        str(overall_summary) + "\n\n\n"
        + "-------- The Larger Summary --------\n"
        + "".join(paragraph_summaries)
    )
# Gradio front end: a single text box in, the summary text out.
iface = gr.Interface(fn=lincoln, inputs="text", outputs="text")

# Start the web UI only when run as a script; no public share link is created.
if __name__ == "__main__":
    iface.launch(share=False)