Spaces:

arithescientist
/

summarizer

Runtime error

App Files Files Community

summarizer / app.py

arithescientist

Update app.py

9455659 over 2 years ago

raw

history blame

2.65 kB

	import gradio as gr
	import numpy as np
	import pytesseract as pt
	import pdf2image
	from fpdf import FPDF
	import re
	import nltk
	from nltk.tokenize import sent_tokenize
	from nltk.tokenize import word_tokenize
	import os
	import pdfkit
	import yake
	from summarizer import Summarizer,TransformerSummarizer
	from transformers import pipelines
	nltk.download('punkt')
	from transformers import AutoTokenizer, AutoModelForPreTraining, AutoConfig, AutoModel
	# Checkpoint used for summarization. Alternatives tried previously:
	#   'distilbert-base-uncased', 'laxya007/gpt2_legal', 'facebook/bart-large-cnn'
	model_name = 'nlpaueb/legal-bert-base-uncased'

	# Load the checkpoint's configuration with hidden-state output switched on,
	# then build the tokenizer and model from the same checkpoint name.
	custom_config = AutoConfig.from_pretrained(model_name, output_hidden_states=True)
	custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
	custom_model = AutoModel.from_pretrained(model_name, config=custom_config)

	# Wrap the model and tokenizer in the summarizer that `lincoln` calls.
	bert_legal_model = Summarizer(
	    custom_model=custom_model,
	    custom_tokenizer=custom_tokenizer,
	)
	print(f'Using model {model_name}\n')



	def lincoln(content: str = "") -> str:
	    """Summarize ``content`` as a whole and paragraph by paragraph.

	    The text is split into paragraphs on blank lines. Each paragraph is
	    whitespace-normalized and reduced to its purely alphabetic tokens;
	    paragraphs with one token or fewer are skipped. Every remaining
	    paragraph is summarized on its own with the module-level
	    ``bert_legal_model`` (``ratio=0.01``), and the full text is summarized
	    once more with ``ratio=0.1``.

	    Args:
	        content: Raw text to summarize. Paragraphs are separated by a
	            blank line (``"\\n\\n"``).

	    Returns:
	        The whole-document summary, followed by a
	        ``-------- The Larger Summary --------`` header and the
	        per-paragraph summaries (each terminated by a blank line).
	        An empty string is returned for empty or whitespace-only input.
	    """
	    # Nothing to summarize: avoid running the model at all.
	    if not content or not content.strip():
	        return ""

	    paragraph_summaries = []
	    for paragraph in content.split("\n\n"):
	        # Flatten line breaks, drop tabs and collapse repeated whitespace.
	        paragraph = paragraph.replace('\n', ' ').replace('\t', '')
	        paragraph = ' '.join(paragraph.split())

	        # Keep only purely alphabetic tokens (drops numbers and punctuation).
	        tokens = [word for word in word_tokenize(paragraph) if word.isalpha()]
	        # Skip empty paragraphs and one-word paragraphs.
	        if len(tokens) <= 1:
	            continue

	        # Rebuild the paragraph from the retained tokens only.
	        text = ' '.join(tokens)

	        print("\nParagraph:")
	        print(text + "\n")

	        # Summarize this paragraph, not the whole document: the original
	        # passed `content` here, which repeated one identical summary for
	        # every paragraph and re-ran the model on the full text each time.
	        summary = bert_legal_model(text, ratio=0.01)
	        paragraph_summaries.append(str(summary) + "\n\n")
	        print("Summary:")
	        print(summary)

	    # One summary over the complete, unmodified input.
	    overall_summary = bert_legal_model(content, ratio=0.1)

	    # Return the assembled text so the Gradio output box receives it
	    # (the original built the string and then returned None).
	    return (
	        str(overall_summary) + "\n\n\n"
	        + "-------- The Larger Summary --------\n"
	        + "".join(paragraph_summaries)
	    )


	# Plain text-in / text-out web UI around the `lincoln` summarizer.
	iface = gr.Interface(fn=lincoln, inputs="text", outputs="text")

	# Start the web server only when this file is executed as a script.
	if __name__ == "__main__":
	    iface.launch(share=False)