#**************** IMPORT PACKAGES ********************
import gradio as gr
import nltk
from nltk.tokenize import sent_tokenize
from fpdf import FPDF
from gtts import gTTS
from pdfminer.high_level import extract_text
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Punkt sentence tokenizer data (used by sent_tokenize)
nltk.download('punkt')

# facebook/bart-large-cnn is a BART model fine-tuned for summarization on
# CNN/DailyMail; its encoder accepts at most 1024 input tokens.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

def pdf_to_text(text, PDF):
    # Prefer pasted text; otherwise pull the text layer out of the uploaded
    # PDF (scanned PDFs with no text layer will come back empty).
    if text == "":
        text = extract_text(PDF.name)

    # Tokenize (truncating to BART's 1024-token encoder limit) and summarize.
    inputs = tokenizer([text], max_length=1024, truncation=True, return_tensors="pt")
    summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=20, max_length=1020)
    output_text = tokenizer.batch_decode(summary_ids, skip_special_tokens=True,
                                         clean_up_tokenization_spaces=False)[0]

    # Write the summary to a PDF (FPDF's built-in Times font covers Latin-1 only).
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Times", size=12)
    pdf.multi_cell(190, 10, txt=output_text, align='C')
    pdf.output("legal.pdf")

    # Convert the summary to speech; gTTS always produces MP3 audio,
    # so the output file is named accordingly.
    speech = gTTS(text=output_text, lang='en', slow=False)
    speech.save("legal.mp3")

    return "legal.mp3", output_text, "legal.pdf"
    
iface = gr.Interface(fn=pdf_to_text, inputs=["text", "file"], outputs=["audio", "text", "file"])

if __name__ == "__main__":
    iface.launch()  # pass share=True for a temporary public link
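
# Running locally (assuming this file is saved as app.py):
#   python app.py
# then open Gradio's default local address, http://127.0.0.1:7860.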