#**************** IMPORT PACKAGES ********************
import os
import re
import tempfile
from zipfile import ZipFile

import gradio as gr
import nltk
import numpy as np
import pdf2image
import pdfkit
import pytesseract as pt
import yake
from fpdf import FPDF
from gtts import gTTS
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from pdfminer.high_level import extract_text
from summarizer import Summarizer, TransformerSummarizer
from transformers import AutoTokenizer, AutoModelForPreTraining, AutoModel, AutoConfig
from transformers import pipelines

nltk.download('punkt')

#**************** MODEL SETUP ********************
model_name = 'nlpaueb/legal-bert-base-uncased'

# The setup of huggingface.co
custom_config = AutoConfig.from_pretrained(model_name)
custom_config.output_hidden_states = True
custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)


#**************** HELPERS ********************
def _tidy_summary(summary):
    """Return *summary* on a single line with doubled punctuation removed."""
    tidy = summary.replace('\n', ' ')
    # Turning newlines into spaces leaves runs of spaces behind; collapse them
    # so the PDF and the spoken text do not contain irregular gaps.
    tidy = re.sub(r' {2,}', ' ', tidy)
    tidy = tidy.replace(',.', ',')
    tidy = tidy.replace('..', '.')
    return tidy


def _latin1_safe(text):
    """Return *text* with every character outside Latin-1 replaced by '?'."""
    return text.encode('latin-1', 'replace').decode('latin-1')


#**************** SUMMARISATION ********************
def pdf_to_text(file_obj):
    """Summarise an uploaded PDF and export the summary as audio, text and PDF.

    Args:
        file_obj: Uploaded file object; its ``name`` attribute is handed to
            the PDF text extractor as the path of the document to read.

    Returns:
        A tuple ``('legal.wav', summary_text, 'legal.pdf')``: the path of the
        spoken summary, the summary itself, and the path of the summary PDF.
        Both files are written to the current working directory.

    Raises:
        ValueError: If no summary text could be produced from the document.
    """
    text = extract_text(file_obj.name)
    output_text = _tidy_summary(bert_legal_model(text, min_length=8, ratio=0.05))

    # Fail early with a clear message instead of writing an empty PDF and then
    # failing inside the text-to-speech step.
    if not output_text.strip():
        raise ValueError("No summary could be produced from the uploaded PDF.")

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Times", size=12)
    # FPDF's built-in core fonts (such as "Times") only cover Latin-1, so any
    # other character (curly quotes, dashes, ...) is replaced before rendering.
    # The returned text and the audio keep the original characters.
    pdf.multi_cell(190, 10, txt=_latin1_safe(output_text), align='C')
    pdf.output("legal.pdf")

    # NOTE(review): gTTS documents its output as MP3 data; the ".wav" name is
    # kept so existing consumers of this path keep working -- confirm.
    myobj = gTTS(text=output_text, lang='en', slow=False)
    myobj.save("legal.wav")

    return 'legal.wav', output_text, "legal.pdf"


#**************** GRADIO APP ********************
iface = gr.Interface(
    fn=pdf_to_text,
    inputs="file",
    outputs=["audio", "text", "file"],
)

if __name__ == "__main__":
    iface.launch(share=True)