Spaces:
Sleeping
Sleeping
arithescientist
committed on
Commit
•
3813c2d
1
Parent(s):
f5ab699
Update app.py
Browse files
app.py
CHANGED
@@ -13,40 +13,36 @@ from nltk.tokenize import word_tokenize
|
|
13 |
import os
|
14 |
import pdfkit
|
15 |
import yake
|
|
|
|
|
16 |
from transformers import AutoTokenizer, AutoModelForPreTraining, AutoModel, AutoConfig
|
17 |
from summarizer import Summarizer,TransformerSummarizer
|
18 |
from transformers import pipelines
|
19 |
-
nltk.download('punkt')
|
20 |
-
|
21 |
-
|
22 |
-
model_name = 'nlpaueb/legal-bert-base-uncased'
|
23 |
-
# The setup of huggingface.co
|
24 |
-
custom_config = AutoConfig.from_pretrained(model_name)
|
25 |
-
custom_config.output_hidden_states=True
|
26 |
-
custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
|
27 |
-
custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
|
28 |
-
bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
|
29 |
-
from zipfile import ZipFile
|
30 |
-
|
31 |
-
from gtts import gTTS
|
32 |
from pdfminer.high_level import extract_text
|
|
|
33 |
|
34 |
def pdf_to_text(file_obj):
|
35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
|
|
|
|
|
37 |
output_text= bert_legal_model(text, min_length = 8, ratio = 0.05)
|
38 |
output_text = output_text.replace(' ',' ')
|
39 |
output_text = output_text.replace(',.',',')
|
40 |
output_text = output_text.replace('\n',' ')
|
41 |
output_text = output_text.replace('..','.')
|
42 |
|
|
|
|
|
43 |
pdf = FPDF()
|
44 |
-
|
45 |
-
# Add a page
|
46 |
pdf.add_page()
|
47 |
-
|
48 |
pdf.set_font("Times", size = 12)
|
49 |
-
|
50 |
# open the text file in read mode
|
51 |
f = output_text
|
52 |
# insert the texts in pdf
|
@@ -55,19 +51,17 @@ def pdf_to_text(file_obj):
|
|
55 |
pdf.output("legal.pdf")
|
56 |
|
57 |
|
58 |
-
|
59 |
-
|
60 |
|
61 |
-
return output_text, "legal.pdf"
|
62 |
-
|
63 |
-
|
64 |
|
65 |
# path = folder_name
|
66 |
|
67 |
# return path
|
68 |
#pageObject.extractText()
|
69 |
iface = gr.Interface(fn = pdf_to_text,
|
70 |
-
inputs =
|
71 |
|
72 |
if __name__ == "__main__":
|
73 |
iface.launch(share=True)
|
|
|
13 |
import os
|
14 |
import pdfkit
|
15 |
import yake
|
16 |
+
from zipfile import ZipFile
|
17 |
+
from gtts import gTTS
|
18 |
from transformers import AutoTokenizer, AutoModelForPreTraining, AutoModel, AutoConfig
|
19 |
from summarizer import Summarizer,TransformerSummarizer
|
20 |
from transformers import pipelines
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
from pdfminer.high_level import extract_text
|
22 |
+
nltk.download('punkt')
|
23 |
|
24 |
def pdf_to_text(file_obj):
|
25 |
+
model_name = 'nlpaueb/legal-bert-base-uncased'
|
26 |
+
# The setup of huggingface.co
|
27 |
+
custom_config = AutoConfig.from_pretrained(model_name)
|
28 |
+
custom_config.output_hidden_states=True
|
29 |
+
custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
|
30 |
+
custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
|
31 |
+
bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
|
32 |
|
33 |
+
|
34 |
+
text = extract_text(file_obj.name)
|
35 |
output_text= bert_legal_model(text, min_length = 8, ratio = 0.05)
|
36 |
output_text = output_text.replace(' ',' ')
|
37 |
output_text = output_text.replace(',.',',')
|
38 |
output_text = output_text.replace('\n',' ')
|
39 |
output_text = output_text.replace('..','.')
|
40 |
|
41 |
+
|
42 |
+
output_text = "dbgffsdvdbg"
|
43 |
pdf = FPDF()
|
|
|
|
|
44 |
pdf.add_page()
|
|
|
45 |
pdf.set_font("Times", size = 12)
|
|
|
46 |
# open the text file in read mode
|
47 |
f = output_text
|
48 |
# insert the texts in pdf
|
|
|
51 |
pdf.output("legal.pdf")
|
52 |
|
53 |
|
54 |
+
myobj = gTTS(text=output_text, lang='en', slow=False)
|
55 |
+
myobj.save("legal.wav")
|
56 |
|
57 |
+
return "legal.wav", output_text, "legal.pdf"
|
|
|
|
|
58 |
|
59 |
# path = folder_name
|
60 |
|
61 |
# return path
|
62 |
#pageObject.extractText()
|
63 |
iface = gr.Interface(fn = pdf_to_text,
|
64 |
+
inputs ="file", outputs=["text", "file"] )
|
65 |
|
66 |
if __name__ == "__main__":
|
67 |
iface.launch(share=True)
|