# **************** IMPORT PACKAGES ********************
import os

import gradio as gr
import nltk
import pdf2image
import pytesseract as pt
from nltk.tokenize import word_tokenize
from summarizer import Summarizer
from transformers import AutoConfig, AutoModel, AutoTokenizer

nltk.download('punkt')
print("Let's go")


def summarize_pdf(file):
    # Candidate models tried during development:
    # model_name = 'laxya007/gpt2_legal'
    # model_name = 'facebook/bart-large-cnn'
    model_name = 'nlpaueb/legal-bert-base-uncased'

    # Hugging Face setup: expose hidden states so the extractive
    # summarizer can cluster sentence embeddings.
    custom_config = AutoConfig.from_pretrained(model_name)
    custom_config.output_hidden_states = True
    custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
    custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
    bert_legal_model = Summarizer(custom_model=custom_model,
                                  custom_tokenizer=custom_tokenizer)

    # Gradio's "file" input passes a tempfile wrapper in some versions
    # and a plain path string in others; handle both.
    path = file if isinstance(file, str) else file.name

    pages = pdf2image.convert_from_path(pdf_path=path, dpi=400, size=(1654, 2340))

    # Create the per-document image folder if it doesn't exist yet.
    dir_name = os.path.join('images', os.path.basename(path))
    os.makedirs(dir_name, exist_ok=True)

    # Save and OCR every page (the original range(len(pages) - 1)
    # silently skipped the last page).
    content = ""
    for i, page in enumerate(pages):
        page.save(os.path.join(dir_name, str(i) + '.jpg'))
        # OCR the image using Google's tesseract.
        content += pt.image_to_string(page)

    # Summarize each paragraph separately.
    summary_text = ""
    for paragraph in content.split("\n\n"):
        paragraph = paragraph.replace('\n', ' ').replace('\t', '')
        paragraph = ' '.join(paragraph.split())

        # Keep only real words; skip paragraphs with at most one of them.
        tokens = [word for word in word_tokenize(paragraph) if word.isalpha()]
        if len(tokens) <= 1:
            continue
        paragraph = ' '.join(tokens)

        print("\nParagraph:")
        print(paragraph + "\n")

        # A T5 model would need a 'summarize:' prefix here; legal-bert does not.
        summary = bert_legal_model(paragraph, min_length=8, ratio=0.05)
        summary_text += str(summary) + "\n\n"
        print("Summary:")
        print(summary)

    # Also produce one short summary of the whole document.
    content2 = content.replace('\n', ' ').replace('\t', '')
    summary = bert_legal_model(content2, min_length=8, num_sentences=25)

    # Collect both summaries and clean up OCR punctuation artifacts.
    all_text = ("The Summary-- " + str(summary) + "\n\n\n"
                + "The Larger Summary-- " + str(summary_text))
    all_text2 = all_text.encode('latin-1', 'replace').decode('latin-1')
    all_text2 = all_text2.replace('?', '.')
    all_text2 = all_text2.replace('\n', ' ')
    all_text2 = all_text2.replace('..', '.')
    all_text2 = all_text2.replace(',.', ',')
    all_text2 = all_text2.replace('-- ', '\n\n\n')

    return all_text2


iface = gr.Interface(summarize_pdf, "file", "text")

if __name__ == "__main__":
    iface.launch(share=True)
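
# Usage sketch (the sample path below is hypothetical): because summarize_pdf
# accepts a plain path string, it can also be called directly, without the
# Gradio UI, e.g. for batch processing or testing:
#
#     text = summarize_pdf("sample_contract.pdf")
#     print(text)
#
# Note that share=True publishes the interface through a public Gradio link;
# drop the argument to serve on localhost only.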