#**************** IMPORT PACKAGES ********************
import gradio as gr
import nltk
from fpdf import FPDF
from gtts import gTTS
from pdfminer.high_level import extract_text
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

nltk.download('punkt')

# Legal-domain checkpoint kept for reference; the summarizer below uses BART.
model_name = 'nlpaueb/legal-bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")


def pdf_to_text(text, PDF):
    """Summarize pasted text (or an uploaded PDF when no text is given)
    and return the summary as audio, text, and a PDF file."""
    Min = 20

    # If no text was typed in, extract it from the uploaded PDF.
    if text == "":
        text = extract_text(PDF.name)

    # Tokenize (truncating to BART's 1024-token input limit) and generate the summary.
    inputs = tokenizer([text], max_length=1024, truncation=True, return_tensors="pt")
    summary_ids = model.generate(inputs["input_ids"], num_beams=2,
                                 min_length=Min, max_length=Min + 1000)
    output_text = tokenizer.batch_decode(summary_ids, skip_special_tokens=True,
                                         clean_up_tokenization_spaces=False)[0]

    # Write the summary to a PDF. FPDF's core fonts are Latin-1 only,
    # so characters outside that range are replaced to avoid encoding errors.
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Times", size=12)
    safe_text = output_text.encode('latin-1', 'replace').decode('latin-1')
    pdf.multi_cell(190, 10, txt=safe_text, align='C')
    pdf.output("legal.pdf")

    # Convert the summary to speech; gTTS produces MP3 audio.
    myobj = gTTS(text=output_text, lang='en', slow=False)
    myobj.save("legal.mp3")

    return "legal.mp3", output_text, "legal.pdf"


iface = gr.Interface(fn=pdf_to_text,
                     inputs=["text", "file"],
                     outputs=["audio", "text", "file"])

if __name__ == "__main__":
    iface.launch()  # Removed 'share=True'