Spaces:
Sleeping
Sleeping
File size: 2,213 Bytes
0751294 17e34a5 0751294 91cbd61 fd55519 91cbd61 0751294 3813c2d 0751294 13b92eb 0751294 e9536eb 3810c93 3813c2d e9536eb aad84a4 c763d6c aad84a4 c763d6c aad84a4 13b92eb aad84a4 13b92eb aad84a4 4bdda66 e91cc11 13b92eb 31e79df 13b92eb 0751294 342a4a2 13b92eb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
#**************** IMPORT PACKAGES ********************
import gradio as gr
import numpy as np
import pytesseract as pt
import pdf2image
import os
import tempfile
from fpdf import FPDF
import re
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import pdfkit
import yake
from zipfile import ZipFile
from gtts import gTTS
from transformers import AutoTokenizer, AutoModelForPreTraining, AutoModel, AutoConfig
from summarizer import Summarizer, TransformerSummarizer
from transformers import pipelines
from pdfminer.high_level import extract_text
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Fetch the NLTK 'punkt' tokenizer data at import time.
nltk.download('punkt')

# NOTE(review): model_name is not referenced by any code visible in this file;
# the tokenizer and model below are loaded from a different checkpoint.
model_name = 'nlpaueb/legal-bert-base-uncased'

# Tokenizer and seq2seq model are loaded from the same checkpoint.
_summarizer_checkpoint = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(_summarizer_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(_summarizer_checkpoint)
def pdf_to_text(text, PDF):
    """Summarize text (or an uploaded PDF) and export the summary as audio and PDF.

    Args:
        text: Text to summarize. When it is empty or whitespace-only, the
            text is extracted from ``PDF`` instead.
        PDF: Uploaded file, read only when ``text`` is empty. Either an
            object exposing the file path as ``.name`` or the path itself.

    Returns:
        A ``(audio_path, summary, pdf_path)`` tuple: ``"legal.wav"``, the
        generated summary string, and ``"legal.pdf"``.

    Raises:
        ValueError: If ``text`` is empty and no PDF was supplied.
    """
    min_length = 20

    if not (text or "").strip():
        if PDF is None:
            raise ValueError(
                "Provide either some text or a PDF file to summarize."
            )
        # Accept both an upload wrapper with a `.name` path and a bare path.
        pdf_path = getattr(PDF, "name", PDF)
        text = extract_text(pdf_path)

    # Ask for truncation explicitly so over-long inputs are cut to the
    # 1024-token limit instead of relying on the tokenizer's implicit fallback.
    inputs = tokenizer(
        [text], max_length=1024, truncation=True, return_tensors="pt"
    )

    # Generate Summary
    summary_ids = model.generate(
        inputs["input_ids"],
        num_beams=2,
        min_length=min_length,
        max_length=min_length + 1000,
    )
    output_text = tokenizer.batch_decode(
        summary_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]

    # The built-in "Times" font only covers latin-1; replace anything else so
    # writing the PDF cannot fail on characters such as curly quotes.
    printable_text = output_text.encode("latin-1", "replace").decode("latin-1")
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Times", size=12)
    pdf.multi_cell(190, 10, txt=printable_text, align="C")
    pdf.output("legal.pdf")

    # NOTE(review): gTTS is documented to write MP3 data; the ".wav" file name
    # is kept unchanged for compatibility -- confirm the audio output accepts it.
    speech = gTTS(text=output_text, lang="en", slow=False)
    speech.save("legal.wav")

    return "legal.wav", output_text, "legal.pdf"
# Gradio UI wiring: a text box and a file upload feed pdf_to_text, whose three
# return values are shown as audio, text and a downloadable file.
iface = gr.Interface(
    fn=pdf_to_text,
    inputs=["text", "file"],
    outputs=["audio", "text", "file"],
)

if __name__ == "__main__":
    # Launched with default arguments ('share=True' was removed).
    iface.launch()
|