arithescientist committed
Commit 90cc1ec · 1 Parent(s): 5957677

Update app.py

Files changed (1):
  1. app.py +22 -69
app.py CHANGED
@@ -16,22 +16,19 @@ from summarizer import Summarizer,TransformerSummarizer
 from transformers import pipelines
 nltk.download('punkt')
 
-print("lets go")
 
-def pdf(file):
-    #model_name = 'laxya007/gpt2_legal'
-    # model_name = 'facebook/bart-large-cnn'
-    model_name = 'nlpaueb/legal-bert-base-uncased'
+model_name = 'nlpaueb/legal-bert-base-uncased'
+# The setup of huggingface.co
+custom_config = AutoConfig.from_pretrained(model_name)
+custom_config.output_hidden_states=True
+custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
+custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
+bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
 
-    # The setup of huggingface.co
-    custom_config = AutoConfig.from_pretrained(model_name)
-    custom_config.output_hidden_states=True
-    custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
-    custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
-    bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
 
-    pages = pdf2image.convert_from_path(pdf_path=file, dpi=400, size=(1654,2340))
+def pdf(file):
 
+    pages = pdf2image.convert_from_path(pdf_path=file, dpi=400, size=(1654,2340))
     content = ""
     dir_name = 'images/' + file + '/'
     os.makedirs(dir_name, exist_ok=True)
@@ -41,74 +38,30 @@ def pdf(file):
         # OCR the image using Google's tesseract
         content += pt.image_to_string(pages[i])
 
-    summary_text = ""
-    for i, paragraph in enumerate(content.split("\n\n")):
-
-        paragraph = paragraph.replace('\n',' ')
-        paragraph = paragraph.replace('\t','')
-        paragraph = ' '.join(paragraph.split())
-        # count words in the paragraph and exclude if less than 4 words
-        tokens = word_tokenize(paragraph)
-        # only do real words
-        tokens = [word for word in tokens if word.isalpha()]
-        # print("\nTokens: {}\n".format(len(tokens)))
-        # only do sentences with more than 1 words excl. alpha crap
-        if len(tokens) <= 1:
-            continue
-        # Perhaps also ignore paragraphs with no sentence?
-        sentences = sent_tokenize(paragraph)
-
-        paragraph = ' '.join(tokens)
-
-        print("\nParagraph:")
-        print(paragraph+"\n")
-        # T5 needs to have 'summarize' in order to work:
-        # text = "summarize:" + paragraph
-        text = paragraph
-
-        summary = bert_legal_model(text, min_length = 8, ratio = 0.05)
-        # summary = tokenizer_t5.decode(summary_ids[0], skip_special_tokens=True)
-        summary_text += str(summary) + "\n\n"
-        print("Summary:")
-        print(summary)
-
-    content2 = content.replace('\n',' ')
-    content2 = content2.replace('\t','')
-    summary = bert_legal_model(content2, min_length = 8, num_sentences=25)
+    output_text = bert_legal_model(content, min_length = 8, ratio = 0.05)
+    output_text = output_text.replace('  ', ' ')
+    output_text = output_text.replace(',.', ',')
+    output_text = output_text.replace('\n', ' ')
+    output_text = output_text.replace('..', '.')
+
+    pdf = FPDF()
 
-
-
-    # write all to file for inspection and storage
-    all_text = "The Summary-- " + str(summary) + "\n\n\n" \
-        + "The Larger Summary-- " + str(summary_text)
-
-
-    all_text2 = all_text.encode('latin-1', 'replace').decode('latin-1')
-    all_text2 = all_text2.replace('?','.')
-    all_text2 = all_text2.replace('\n',' ')
-    all_text2 = all_text2.replace('..','.')
-    all_text2 = all_text2.replace(',.',',')
-    all_text2 = all_text2.replace('-- ','\n\n\n')
-
-    pdf = FPDF()
-
     # Add a page
     pdf.add_page()
-
     pdf.set_font("Times", size = 12)
-
+
     # open the text file in read mode
     f = all_text2
-    return f
-
-
-
+
+
+    return pdf.output("legal.pdf")
+
 
 
 iface = gr.Interface(
     pdf,
     "file",
-    "text"
+    "file"
 )
 
  if __name__ == "__main__":
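
Note on the new version: as committed, pdf() never writes output_text into the FPDF object, and f = all_text2 references a variable this very commit deletes, so the function would raise a NameError at runtime; fpdf's output("legal.pdf") also returns an empty value rather than the file path Gradio's "file" output expects. Below is a minimal sketch of what the new function appears to intend. The page-OCR loop is reconstructed from the elided diff context; the multi_cell call, the latin-1 fallback, the returned path, and the pdf_doc rename (to avoid shadowing the function name) are our additions, not part of the commit.

def pdf(file):
    pages = pdf2image.convert_from_path(pdf_path=file, dpi=400, size=(1654, 2340))

    # OCR each rendered page with tesseract (reconstructed from the diff context)
    content = ""
    for i in range(len(pages)):
        content += pt.image_to_string(pages[i])

    # One extractive summary over the whole document, as in the commit
    output_text = bert_legal_model(content, min_length=8, ratio=0.05)
    output_text = output_text.replace('\n', ' ').replace('..', '.').replace(',.', ',')

    pdf_doc = FPDF()
    pdf_doc.add_page()
    pdf_doc.set_font("Times", size=12)
    # fpdf only handles latin-1, so replace anything it cannot encode
    safe_text = output_text.encode('latin-1', 'replace').decode('latin-1')
    pdf_doc.multi_cell(0, 10, safe_text)
    pdf_doc.output("legal.pdf")  # writes the PDF to disk
    return "legal.pdf"           # Gradio's "file" output expects a path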