arithescientist committed on
Commit
3813c2d
1 Parent(s): f5ab699

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -24
app.py CHANGED
@@ -13,40 +13,36 @@ from nltk.tokenize import word_tokenize
13
  import os
14
  import pdfkit
15
  import yake
 
 
16
  from transformers import AutoTokenizer, AutoModelForPreTraining, AutoModel, AutoConfig
17
  from summarizer import Summarizer,TransformerSummarizer
18
  from transformers import pipelines
19
- nltk.download('punkt')
20
-
21
-
22
- model_name = 'nlpaueb/legal-bert-base-uncased'
23
- # The setup of huggingface.co
24
- custom_config = AutoConfig.from_pretrained(model_name)
25
- custom_config.output_hidden_states=True
26
- custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
27
- custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
28
- bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
29
- from zipfile import ZipFile
30
-
31
- from gtts import gTTS
32
  from pdfminer.high_level import extract_text
 
33
 
34
  def pdf_to_text(file_obj):
35
- text = extract_text(file_obj.name)
 
 
 
 
 
 
36
 
 
 
37
  output_text= bert_legal_model(text, min_length = 8, ratio = 0.05)
38
  output_text = output_text.replace(' ',' ')
39
  output_text = output_text.replace(',.',',')
40
  output_text = output_text.replace('\n',' ')
41
  output_text = output_text.replace('..','.')
42
 
 
 
43
  pdf = FPDF()
44
-
45
- # Add a page
46
  pdf.add_page()
47
-
48
  pdf.set_font("Times", size = 12)
49
-
50
  # open the text file in read mode
51
  f = output_text
52
  # insert the texts in pdf
@@ -55,19 +51,17 @@ def pdf_to_text(file_obj):
55
  pdf.output("legal.pdf")
56
 
57
 
58
- #myobj = gTTS(text=output_text, lang='en', slow=False)
59
- #myobj.save("legal.wav")
60
 
61
- return output_text, "legal.pdf"
62
-
63
-
64
 
65
  # path = folder_name
66
 
67
  # return path
68
  #pageObject.extractText()
69
  iface = gr.Interface(fn = pdf_to_text,
70
- inputs = "file", outputs=["text", "file"] )
71
 
72
  if __name__ == "__main__":
73
  iface.launch(share=True)
 
13
  import os
14
  import pdfkit
15
  import yake
16
+ from zipfile import ZipFile
17
+ from gtts import gTTS
18
  from transformers import AutoTokenizer, AutoModelForPreTraining, AutoModel, AutoConfig
19
  from summarizer import Summarizer,TransformerSummarizer
20
  from transformers import pipelines
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  from pdfminer.high_level import extract_text
22
+ nltk.download('punkt')
23
 
24
  def pdf_to_text(file_obj):
25
+ model_name = 'nlpaueb/legal-bert-base-uncased'
26
+ # The setup of huggingface.co
27
+ custom_config = AutoConfig.from_pretrained(model_name)
28
+ custom_config.output_hidden_states=True
29
+ custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
30
+ custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
31
+ bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
32
 
33
+
34
+ text = extract_text(file_obj.name)
35
  output_text= bert_legal_model(text, min_length = 8, ratio = 0.05)
36
  output_text = output_text.replace(' ',' ')
37
  output_text = output_text.replace(',.',',')
38
  output_text = output_text.replace('\n',' ')
39
  output_text = output_text.replace('..','.')
40
 
41
+
42
+ output_text = "dbgffsdvdbg"
43
  pdf = FPDF()
 
 
44
  pdf.add_page()
 
45
  pdf.set_font("Times", size = 12)
 
46
  # open the text file in read mode
47
  f = output_text
48
  # insert the texts in pdf
 
51
  pdf.output("legal.pdf")
52
 
53
 
54
+ myobj = gTTS(text=output_text, lang='en', slow=False)
55
+ myobj.save("legal.wav")
56
 
57
+ return "legal.wav", output_text, "legal.pdf"
 
 
58
 
59
  # path = folder_name
60
 
61
  # return path
62
  #pageObject.extractText()
63
  iface = gr.Interface(fn = pdf_to_text,
64
+ inputs ="file", outputs=["text", "file"] )
65
 
66
  if __name__ == "__main__":
67
  iface.launch(share=True)