arithescientist committed on
Commit
342a4a2
1 Parent(s): 02a288f

Update app.py

Files changed (1)
  app.py (+106, -263)
app.py CHANGED
@@ -17,275 +17,118 @@ import yake
  from transformers import AutoTokenizer, AutoModelForPreTraining, AutoModel, AutoConfig
  from summarizer import Summarizer,TransformerSummarizer
  from transformers import pipelines
- #nltk.download('punkt')
+ nltk.download('punkt')
 
  print("lets go")
 
-
- app = flask.Flask(__name__)
- app.config["DEBUG"] = True
- UPLOAD_FOLDER = './pdfs'
-
- ALLOWED_EXTENSIONS = {'txt', 'pdf'}
- app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
-
- #***************** FLASK *****************************
- CORS(app)
-
-
- def allowed_file(filename):
-     return '.' in filename and \
-            filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
-
-
-
- #model_name = 'laxya007/gpt2_legal'
- #model_name = 'facebook/bart-large-cnn'
- model_name = 'nlpaueb/legal-bert-base-uncased'
-
-
- #The setup of huggingface.co
-
- print("lets go")
-
- custom_config = AutoConfig.from_pretrained(model_name)
- print("lets go")
- custom_config.output_hidden_states=True
- print("lets go")
- custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
- print("lets go")
- custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
- print("lets go")
- bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
- print('Using model {}\n'.format(model_name))
-
-
-
- # main index page route
- @app.route('/')
- @cross_origin()
- def index():
-     return render_template('index.html')
-
- @cross_origin()
- @app.route('/results')
- def results():
-     return render_template('results.html')
-
-
-
- @app.route('/predict', methods=['GET', 'POST'])
- def uploads():
-     if request.method == 'GET':
-         # Get the file from post request
-
-         numsent = int(request.args['number'])
-         text = str(request.args['text'])
-         content = text
-
-
-         summary_text = ""
-         for i, paragraph in enumerate(content.split("\n\n")):
-
-             paragraph = paragraph.replace('\n',' ')
-             paragraph = paragraph.replace('\t','')
-             paragraph = ' '.join(paragraph.split())
-             # count words in the paragraph and exclude if less than 4 words
-             tokens = word_tokenize(paragraph)
-             # only do real words
-             tokens = [word for word in tokens if word.isalpha()]
-             # print("\nTokens: {}\n".format(len(tokens)))
-             # only do sentences with more than 1 words excl. alpha crap
-             if len(tokens) <= 1:
-                 continue
-             # Perhaps also ignore paragraphs with no sentence?
-             sentences = sent_tokenize(paragraph)
-
-             paragraph = ' '.join(tokens)
-
-             print("\nParagraph:")
-             print(paragraph+"\n")
-             # T5 needs to have 'summarize' in order to work:
-             # text = "summarize:" + paragraph
-             text = paragraph
-
-             summary = bert_legal_model(text, min_length = 8, ratio = 0.05)
-             # summary = tokenizer_t5.decode(summary_ids[0], skip_special_tokens=True)
-             summary_text += str(summary) + "\n\n"
-             print("Summary:")
-             print(summary)
-
-         content2 = content.replace('\n',' ')
-         content2 = content2.replace('\t','')
-         summary = bert_legal_model(content2, min_length = 8, num_sentences=25)
-
-
-
-         # write all to file for inspection and storage
-         all_text = "The Summary-- " + str(summary) + "\n\n\n" \
-             + "The Larger Summary-- " + str(summary_text)
-
-
-         all_text2 = all_text.encode('latin-1', 'replace').decode('latin-1')
-         all_text2 = all_text2.replace('?','.')
-         all_text2 = all_text2.replace('\n',' ')
-         all_text2 = all_text2.replace('..','.')
-         all_text2 = all_text2.replace(',.',',')
-         all_text2 = all_text2.replace('-- ','\n\n\n')
-
-         pdf = FPDF()
-
-         # Add a page
-         pdf.add_page()
-
-         pdf.set_font("Times", size = 12)
-
-         # open the text file in read mode
-         f = all_text2
-
-         # insert the texts in pdf
-         pdf.multi_cell(190, 10, txt = f, align = 'C')
-
-
-         # save the pdf with name .pdf
-         pdf.output("./static/legal.pdf")
-         all_text
-
-
-         return render_template('results.html')
-     return None
-
-
-
-
- @app.route('/predictpdf', methods=['GET', 'POST'])
- def uploads2():
-     if request.method == 'POST':
-         # Get the file from post request
-
-         numsent = int(request.args['number'])
-         if 'file' not in request.files:
-             flash('No file part')
-             return redirect(request.url)
-         file = request.files['file']
-         # if user does not select file, browser also
-         # submit an empty part without filename
-         if file.filename == '':
-             flash('No selected file')
-             return redirect(request.url)
-         if file and allowed_file(file.filename):
-             filename = "legal.pdf"
-             file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
-
-         f = request.files['file']
-         f.save(secure_filename(f.filename))
-
-
-         path = os.getcwd()
-         folder_name = 'pdfs'
-         path = os.path.join(path, folder_name)
-
-         list_of_files = []
-         for root, dirs, files in os.walk(path):
-             for file in files:
-                 if(file.endswith(".pdf")):
-                     # print(os.path.join(root,file))
-                     list_of_files.append(os.path.join(root,file))
-
-         print("\nProcessing {} files...\n".format(len(list_of_files)))
-         total_pages = 0
-
-         for filename in list_of_files:
-             print(filename)
-             file = os.path.splitext(os.path.basename(filename))[0]
-             pages = pdf2image.convert_from_path(pdf_path=filename, dpi=400, size=(1654,2340))
-             total_pages += len(pages)
-             print("\nProcessing the next {} pages...\n".format(len(pages)))
-
-             # Then save all pages as images and convert them to text except the last page
-             # TODO: create this as a function
-             content = ""
-             dir_name = 'images/' + file + '/'
-             os.makedirs(dir_name, exist_ok=True)
-             # If folder doesn't exist, then create it.
-             for i in range(len(pages)-1):
-                 pages[i].save(dir_name + str(i) + '.jpg')
-                 # OCR the image using Google's tesseract
-                 content += pt.image_to_string(pages[i])
-
-         summary_text = ""
-         for i, paragraph in enumerate(content.split("\n\n")):
-
-             paragraph = paragraph.replace('\n',' ')
-             paragraph = paragraph.replace('\t','')
-             paragraph = ' '.join(paragraph.split())
-             # count words in the paragraph and exclude if less than 4 words
-             tokens = word_tokenize(paragraph)
-             # only do real words
-             tokens = [word for word in tokens if word.isalpha()]
-             # print("\nTokens: {}\n".format(len(tokens)))
-             # only do sentences with more than 1 words excl. alpha crap
-             if len(tokens) <= 1:
-                 continue
-             # Perhaps also ignore paragraphs with no sentence?
-             sentences = sent_tokenize(paragraph)
-
-             paragraph = ' '.join(tokens)
-
-             print("\nParagraph:")
-             print(paragraph+"\n")
-             # T5 needs to have 'summarize' in order to work:
-             # text = "summarize:" + paragraph
-             text = paragraph
-
-             summary = bert_legal_model(text, min_length = 8, ratio = 0.05)
-             # summary = tokenizer_t5.decode(summary_ids[0], skip_special_tokens=True)
-             summary_text += str(summary) + "\n\n"
-             print("Summary:")
-             print(summary)
-
-         content2 = content.replace('\n',' ')
-         content2 = content2.replace('\t','')
-         summary = bert_legal_model(content2, min_length = 8, num_sentences=25)
-
-
-
-         # write all to file for inspection and storage
-         all_text = "The Summary-- " + str(summary) + "\n\n\n" \
-             + "The Larger Summary-- " + str(summary_text)
-
-
-         all_text2 = all_text.encode('latin-1', 'replace').decode('latin-1')
-         all_text2 = all_text2.replace('?','.')
-         all_text2 = all_text2.replace('\n',' ')
-         all_text2 = all_text2.replace('..','.')
-         all_text2 = all_text2.replace(',.',',')
-         all_text2 = all_text2.replace('-- ','\n\n\n')
-
-         pdf = FPDF()
-
-         # Add a page
-         pdf.add_page()
-
-         pdf.set_font("Times", size = 12)
-
-         # open the text file in read mode
-         f = all_text2
-
-         # insert the texts in pdf
-         pdf.multi_cell(190, 10, txt = f, align = 'C')
-
-
-         # save the pdf with name .pdf
-         pdf.output("./static/legal.pdf")
-         all_text
-
-
-         return render_template('results.html')
-     return None
-
-
- import gradio as gr
-
- iface = gr.Interface(fn=index)
- iface.launch()
+ def pdf(file):
+     #model_name = 'laxya007/gpt2_legal'
+     # model_name = 'facebook/bart-large-cnn'
+     model_name = 'nlpaueb/legal-bert-base-uncased'
+
+     # The setup of huggingface.co
+     custom_config = AutoConfig.from_pretrained(model_name)
+     custom_config.output_hidden_states=True
+     custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
+     custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
+     bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
+     print('Using model {}\n'.format(model_name))
+
+     list_of_files = file
+
+
+     print("\nProcessing {} files...\n".format(len(list_of_files)))
+     total_pages = 0
+
+     for filename in list_of_files:
+         print(filename)
+         file = os.path.splitext(os.path.basename(filename))[0]
+         pages = pdf2image.convert_from_path(pdf_path=filename, dpi=400, size=(1654,2340))
+         total_pages += len(pages)
+         print("\nProcessing the next {} pages...\n".format(len(pages)))
+
+         # Then save all pages as images and convert them to text except the last page
+         # TODO: create this as a function
+         content = ""
+         dir_name = 'images/' + file + '/'
+         os.makedirs(dir_name, exist_ok=True)
+         # If folder doesn't exist, then create it.
+         for i in range(len(pages)-1):
+             pages[i].save(dir_name + str(i) + '.jpg')
+             # OCR the image using Google's tesseract
+             content += pt.image_to_string(pages[i])
+
+     summary_text = ""
+     for i, paragraph in enumerate(content.split("\n\n")):
+
+         paragraph = paragraph.replace('\n',' ')
+         paragraph = paragraph.replace('\t','')
+         paragraph = ' '.join(paragraph.split())
+         # count words in the paragraph and exclude if less than 4 words
+         tokens = word_tokenize(paragraph)
+         # only do real words
+         tokens = [word for word in tokens if word.isalpha()]
+         # print("\nTokens: {}\n".format(len(tokens)))
+         # only do sentences with more than 1 words excl. alpha crap
+         if len(tokens) <= 1:
+             continue
+         # Perhaps also ignore paragraphs with no sentence?
+         sentences = sent_tokenize(paragraph)
+
+         paragraph = ' '.join(tokens)
+
+         print("\nParagraph:")
+         print(paragraph+"\n")
+         # T5 needs to have 'summarize' in order to work:
+         # text = "summarize:" + paragraph
+         text = paragraph
+
+         summary = bert_legal_model(text, min_length = 8, ratio = 0.05)
+         # summary = tokenizer_t5.decode(summary_ids[0], skip_special_tokens=True)
+         summary_text += str(summary) + "\n\n"
+         print("Summary:")
+         print(summary)
+
+     content2 = content.replace('\n',' ')
+     content2 = content2.replace('\t','')
+     summary = bert_legal_model(content2, min_length = 8, num_sentences=25)
+
+
+
+     # write all to file for inspection and storage
+     all_text = "The Summary-- " + str(summary) + "\n\n\n" \
+         + "The Larger Summary-- " + str(summary_text)
+
+
+     all_text2 = all_text.encode('latin-1', 'replace').decode('latin-1')
+     all_text2 = all_text2.replace('?','.')
+     all_text2 = all_text2.replace('\n',' ')
+     all_text2 = all_text2.replace('..','.')
+     all_text2 = all_text2.replace(',.',',')
+     all_text2 = all_text2.replace('-- ','\n\n\n')
+
+     pdf = FPDF()
+
+     # Add a page
+     pdf.add_page()
+
+     pdf.set_font("Times", size = 12)
+
+     # open the text file in read mode
+     f = all_text2
+     return f
+
+ import gradio as gr
+
+
+ iface = gr.Interface(
+     pdf,
+     gr.inputs.Image(shape=(224, 224)),
+     gr.outputs.Label(f),
+     capture_session=True,
+     interpretation="default",
+ )
+
+ if __name__ == "__main__":
+     iface.launch(share=True)
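
Both the removed Flask app and the new pdf() function build the summarizer the same way: a legal-domain BERT checkpoint is loaded with output_hidden_states=True and wrapped in bert-extractive-summarizer's Summarizer. A minimal, self-contained sketch of that pattern, assuming bert-extractive-summarizer, transformers, and torch are installed (the sample text is a placeholder):

# Sketch of the extractive-summarization setup used in app.py.
# Assumes: pip install bert-extractive-summarizer transformers torch
from transformers import AutoConfig, AutoModel, AutoTokenizer
from summarizer import Summarizer

model_name = 'nlpaueb/legal-bert-base-uncased'

# The summarizer clusters sentence embeddings, so the backbone
# must be configured to expose its hidden states.
config = AutoConfig.from_pretrained(model_name)
config.output_hidden_states = True
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, config=config)

bert_legal_model = Summarizer(custom_model=model, custom_tokenizer=tokenizer)

text = "..."  # placeholder: a passage of legal text
short = bert_legal_model(text, min_length=8, ratio=0.05)          # keep ~5% of sentences
longer = bert_legal_model(text, min_length=8, num_sentences=25)   # fixed sentence budget

The two call styles mirror the diff: ratio= for the per-paragraph pass and num_sentences= for the whole-document pass.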
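The per-file loop rasterizes each PDF with pdf2image and OCRs the page images with pytesseract (imported as pt in app.py); note that range(len(pages) - 1) skips the final page, as the diff's own comment says. A standalone sketch of that step, assuming poppler and tesseract are installed on the host (ocr_pdf is a hypothetical helper name):

# Sketch of the PDF -> image -> text step, mirroring app.py's loop.
# Assumes poppler-utils and tesseract-ocr are available on the system.
import os
import pdf2image
import pytesseract as pt

def ocr_pdf(filename: str) -> str:
    stem = os.path.splitext(os.path.basename(filename))[0]
    pages = pdf2image.convert_from_path(pdf_path=filename, dpi=400, size=(1654, 2340))
    dir_name = os.path.join('images', stem)
    os.makedirs(dir_name, exist_ok=True)
    content = ""
    # app.py iterates to len(pages) - 1, so the last page is never OCRed.
    for i in range(len(pages) - 1):
        pages[i].save(os.path.join(dir_name, f'{i}.jpg'))
        content += pt.image_to_string(pages[i])
    return content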
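The latin-1 encode/decode round-trip before building the PDF exists because classic PyFPDF's core fonts only support latin-1, so characters outside that range must be replaced before pdf.output() runs. A compact sketch of the output step (write_pdf is a hypothetical helper name):

# Sketch of the FPDF output step used in both versions of app.py.
from fpdf import FPDF

def write_pdf(text: str, path: str = './static/legal.pdf') -> None:
    # Core fonts are latin-1 only; replace anything unsupported.
    safe = text.encode('latin-1', 'replace').decode('latin-1')
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Times", size=12)
    pdf.multi_cell(190, 10, txt=safe, align='C')
    pdf.output(path)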
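As committed, the Gradio wiring cannot run: gr.outputs.Label(f) references a name f that only exists inside pdf(), and gr.inputs.Image(shape=(224, 224)) hands the function a pixel array even though the body expects an iterable of PDF paths. A hedged sketch of one way the interface could be wired under the Gradio 2.x-era API this commit appears to target; the File input, Textbox output, and the lambda adapter are assumptions, not the author's code:

# Hypothetical repair of the interface wiring, assuming Gradio 2.x.
# pdf() is the function defined in the diff; it returns the summary string.
import gradio as gr

iface = gr.Interface(
    # The 2.x File component passes a tempfile-like object; wrap its
    # path in a one-element list, since pdf() iterates over filenames.
    fn=lambda upload: pdf([upload.name]),
    inputs=gr.inputs.File(label="Legal PDF"),
    outputs=gr.outputs.Textbox(label="Summary"),
)

if __name__ == "__main__":
    iface.launch(share=True)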