arithescientist committed on
Commit
aad84a4
·
1 Parent(s): a39d635

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -20
app.py CHANGED
@@ -23,25 +23,33 @@ from pdfminer.high_level import extract_text
23
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
24
  nltk.download('punkt')
25
 
26
- def pdf_to_text(PDF, Min):
27
- model_name = 'nlpaueb/legal-bert-base-uncased'
28
- # The setup of huggingface.co
29
- file_obj = PDF
30
- #n = int(Percent.replace('%', ''))
31
- tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
32
-
33
- model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
34
-
35
- text = extract_text(file_obj.name)
36
-
37
- inputs = tokenizer([text], max_length=1024, return_tensors="pt")
38
-
39
- Min = int(Min)
40
- # Generate Summary
41
- summary_ids = model.generate(inputs["input_ids"], num_beams=2,min_length=Min, max_length=Min+1000)
42
- output_text = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
43
 
 
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
  #output_text= bert_legal_model(text, min_length = 8, ratio = 0.05)
47
 
@@ -62,12 +70,10 @@ def pdf_to_text(PDF, Min):
62
  return "legal.wav", output_text, "legal.pdf"
63
 
64
 
65
- # path = folder_name
66
-
67
  # return path
68
  #pageObject.extractText()
69
  iface = gr.Interface(fn = pdf_to_text,
70
- inputs =["file", "text"], outputs=["audio","text", "file"] )
71
 
72
  if __name__ == "__main__":
73
  iface.launch(share=True)
 
23
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
24
  nltk.download('punkt')
25
 
26
+ model_name = 'nlpaueb/legal-bert-base-uncased'
27
+ tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
+ model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
30
 
31
+ def pdf_to_text(text, PDF):
32
+ Min = int(20)
33
+ if text == "":
34
+ # The setup of huggingface.co
35
+ file_obj = PDF
36
+ #n = int(Percent.replace('%', ''))
37
+
38
+ text = extract_text(file_obj.name)
39
+ inputs = tokenizer([text], max_length=1024, return_tensors="pt")
40
+
41
+ Min = int(Min)
42
+ # Generate Summary
43
+ summary_ids = model.generate(inputs["input_ids"], num_beams=2,min_length=Min, max_length=Min+1000)
44
+ output_text = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
45
+
46
+ else:
47
+ inputs = tokenizer([text], max_length=1024, return_tensors="pt")
48
+ # Generate Summary
49
+
50
+ summary_ids = model.generate(inputs["input_ids"], num_beams=2,min_length=Min, max_length=Min+1000)
51
+ output_text = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
52
+
53
 
54
  #output_text= bert_legal_model(text, min_length = 8, ratio = 0.05)
55
 
 
70
  return "legal.wav", output_text, "legal.pdf"
71
 
72
 
 
 
73
  # return path
74
  #pageObject.extractText()
75
  iface = gr.Interface(fn = pdf_to_text,
76
+ inputs =["text", "file", "text"], outputs=["audio","text", "file"] )
77
 
78
  if __name__ == "__main__":
79
  iface.launch(share=True)