# NOTE(review): the lines below were Hugging Face Spaces page residue
# ("Spaces:" / "Runtime error" status banners) captured by scraping, not
# Python — commented out so the file parses.
# Spaces: Runtime error / Runtime error
# Standard library.
import os
import re

# Third-party: UI, OCR/PDF handling, NLP tokenization, keyword extraction,
# and transformer-based extractive summarization.
import gradio as gr
import numpy as np
import pytesseract as pt
import pdf2image
import pdfkit
import yake
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from fpdf import FPDF
from summarizer import Summarizer, TransformerSummarizer
from transformers import pipelines
from transformers import AutoTokenizer, AutoModelForPreTraining, AutoConfig, AutoModel

# The NLTK tokenizers used below require the 'punkt' model at runtime.
nltk.download('punkt')
# Candidate summarization backbones tried during development; the
# legal-domain BERT is the active choice.
# model_name = 'distilbert-base-uncased'
model_name = 'nlpaueb/legal-bert-base-uncased'
# model_name = 'laxya007/gpt2_legal'
# model_name = 'facebook/bart-large-cnn'

# Download/configure the model from huggingface.co. The extractive
# Summarizer needs hidden states to embed and rank sentences.
custom_config = AutoConfig.from_pretrained(model_name)
custom_config.output_hidden_states = True
custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
custom_model = AutoModel.from_pretrained(model_name, config=custom_config)

# Extractive BERT summarizer built on the custom model/tokenizer pair.
bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
print('Using model {}\n'.format(model_name))
def lincoln(content):
    """Summarize legal text with the BERT extractive summarizer.

    Returns a short summary of the whole document (ratio=0.1) followed by
    a longer section built from per-paragraph summaries (ratio=0.01).

    Args:
        content: Raw input text; paragraphs are separated by blank lines.

    Returns:
        The combined summary string shown in the Gradio output box.
    """
    # BUG FIX: the original signature was `content = input_text`, where
    # `input_text` is undefined — the module crashed with NameError at
    # definition time. The default served no purpose and is removed.
    summary_text = ""
    for paragraph in content.split("\n\n"):
        # Collapse newlines/tabs and extra whitespace within the paragraph.
        paragraph = paragraph.replace('\n', ' ').replace('\t', '')
        paragraph = ' '.join(paragraph.split())

        # Keep only alphabetic tokens; skip empty/one-word paragraphs.
        tokens = [word for word in word_tokenize(paragraph) if word.isalpha()]
        if len(tokens) <= 1:
            continue

        # Rebuild the paragraph from the cleaned tokens.
        text = ' '.join(tokens)

        # BUG FIX: the original called bert_legal_model(content, ...) here,
        # re-summarizing the ENTIRE document once per paragraph and never
        # using the cleaned `text`. Summarize the current paragraph instead.
        summary = bert_legal_model(text, ratio=0.01)
        summary_text += str(summary) + "\n\n"

    # Short summary of the whole document, prepended to the longer one.
    summary = bert_legal_model(content, ratio=0.1)
    all_text = str(summary) + "\n\n\n" \
        + "-------- The Larger Summary --------\n" + str(summary_text)
    # BUG FIX: the original assigned `output_text` and then returned None,
    # so the Gradio output box was always empty.
    return all_text
# Wire the summarizer into a simple text-in / text-out Gradio UI.
iface = gr.Interface(
    lincoln,
    "text",
    "text",
)

if __name__ == "__main__":
    # share=False: serve locally only, no public Gradio tunnel.
    iface.launch(share=False)