cd14 committed on
Commit
fcf4786
·
1 Parent(s): 67c3f04

test hotdog app

Browse files
Files changed (2) hide show
  1. app.py +483 -13
  2. requirements.txt +2 -2
app.py CHANGED
@@ -1,20 +1,490 @@
 
1
  import streamlit as st
2
- from transformers import pipeline
3
- from PIL import Image
 
 
 
 
 
 
4
 
5
- pipeline = pipeline(task="image-classification", model="julien-c/hotdog-not-hotdog")
6
 
7
- st.title("Hot Dog? Or Not?")
8
 
9
- file_name = st.file_uploader("Upload a hot dog candidate image")
 
10
 
11
- if file_name is not None:
12
- col1, col2 = st.columns(2)
 
 
 
 
13
 
14
- image = Image.open(file_name)
15
- col1.image(image, use_column_width=True)
16
- predictions = pipeline(image)
17
 
18
- col2.header("Probabilities")
19
- for p in predictions:
20
- col2.subheader(f"{ p['label'] }: { round(p['score'] * 100, 1)}%")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ast import arg
2
  import streamlit as st
3
+ import pandas as pd
4
+ import PIL
5
+ import re
6
+ from io import StringIO
7
+ import boto3
8
+ from urlextract import URLExtract
9
+ import time
10
+ # from joblib import dump, load
11
 
12
+ import joblib
13
 
14
+ from bokeh.models.widgets import Div
15
 
16
+ import email
17
+ #from ipyfilechooser import FileChooser
18
 
19
+ #from IPython.display import display
20
+ from io import BytesIO
21
+ from bs4 import BeautifulSoup
22
+ import matplotlib.pyplot as plt
23
+ import numpy as np
24
+ import timeit
25
 
 
 
 
26
 
27
def table_data():
    """Build the two-column table of model facts.

    Returns:
        pandas.DataFrame: one row per model attribute, with the attribute
        name in the 'Field' column and its value in the 'Data' column.
    """
    # Each attribute name sits next to its value so the two cannot drift
    # out of step when a row is added or removed.
    rows = [
        ('Data Scientist', 'Chen Song'),
        ('Dataset', 'Internal + Campaign monitor'),
        ('Algorithm', 'Random Forest'),
        ('Framework', 'Sci-kit learn'),
        ('Ensemble', 'Bootstrapping'),
        ('Domain', 'Bootstrapping Aggregation'),
        ('Model Size', '4 KB'),
    ]
    return pd.DataFrame(rows, columns=['Field', 'Data'])
57
+
58
+
59
def url_button(button_name, url):
    """Render a Streamlit button that asks the browser to open *url*.

    Args:
        button_name: Label shown on the button.
        url: Address passed to ``window.open`` when the button is clicked.
    """
    if not st.button(button_name):
        return

    # The JavaScript is carried in the onerror attribute of an <img> tag
    # that has no source, wrapped in a Bokeh Div and rendered as a chart.
    # NOTE(review): url is interpolated unescaped; the visible callers only
    # pass constant strings.
    open_js = f"window.open('{url}')"  # New tab or window
    markup = f'<img src onerror="{open_js}">'
    st.bokeh_chart(Div(text=markup))
65
+
66
+
67
def get_industry_code_dict(training_dataset):
    """Map each industry name to its integer category code.

    Note that *training_dataset* is modified in place: an 'industry_code'
    column is added, and every column of categorical dtype (including the
    new one) is replaced by its integer category codes.

    Args:
        training_dataset: DataFrame with an 'industry' column.

    Returns:
        dict: industry name -> category code.
    """
    training_dataset['industry_code'] = (
        training_dataset['industry'].astype('category'))

    # Replace every categorical column by its codes, one column at a time.
    for column in training_dataset.select_dtypes(['category']).columns:
        training_dataset[column] = training_dataset[column].cat.codes

    names = training_dataset['industry']
    codes = training_dataset['industry_code']
    return dict(zip(names, codes))
76
+
77
def parse_email(uploaded_file):
    """Collect the text of every <body> element found in an uploaded email.

    The upload is parsed as an email message; the payload of each
    non-container part is parsed as HTML and the text of its <body>
    elements is collected.

    Args:
        uploaded_file: The object returned by ``st.file_uploader``. Its
            content is read with ``getvalue()``. An object without that
            method is opened from disk through its ``name`` attribute,
            which is what the previous implementation always did.

    Returns:
        list[str]: the text of each <body> element, in the order found.
    """
    if hasattr(uploaded_file, "getvalue"):
        # Uploads are held in memory, so there is normally no file on disk
        # to open by name; read the buffered content instead.
        raw = uploaded_file.getvalue()
        if isinstance(raw, bytes):
            emailstr = raw.decode("utf-8", errors="replace")
        else:
            emailstr = raw
    else:
        # The context manager closes the handle, which was leaked before.
        with open(uploaded_file.name, 'r') as efile:
            emailstr = efile.read()

    parsed_email = []
    message = email.message_from_string(emailstr)
    for part in message.walk():
        # A multipart container's payload is a list of sub-messages, not
        # text. Its children are visited by walk() anyway, so skip it
        # rather than parsing the list's repr as HTML.
        if part.is_multipart():
            continue
        body = str(part.get_payload())
        # NOTE(review): no parser is named, so BeautifulSoup picks whichever
        # is installed; left unchanged because parsers differ in whether
        # they synthesise a <body> element.
        soup = BeautifulSoup(body)
        parsed_email.extend(tag.text for tag in soup.find_all('body'))
    return parsed_email
93
+
94
+ #def email_upload():
95
+ # print("Please upload your email (In HTML Format)")
96
+ # upload = FileUpload(accept='.html', multiple=True)
97
+ # display(upload)
98
+ # return upload
99
+ # fc = FileChooser()
100
+ # display(fc)
101
+ # return fc
102
+
103
+
104
+ # New - In-Use
105
# New - In-Use
def email_extractor(email_uploaded):
    """Extract the email body and its character and URL counts.

    The body is the text between the markers 'Bright Apps LLC' and
    'To read more'. When the start marker is absent the body begins at the
    start of the text; when the end marker is absent (after the start) the
    body runs to the end of the text. Previously a missing marker made
    ``str.find`` return -1, which was used directly as a slice bound and
    silently produced an empty or truncated body.

    Args:
        email_uploaded: Uploaded file object, passed to ``parse_email``.

    Returns:
        tuple: ``(email_body, character_cnt, url_cnt)`` where
        ``email_body`` is the cleaned body text (URLs still included),
        ``character_cnt`` is the number of non-whitespace characters that
        remain before the first '©' once URLs are removed, and ``url_cnt``
        is the number of URLs found in the body.
    """
    email_text = ''.join(parse_email(email_uploaded)).strip()

    # Locate the body; fall back to the text boundaries for missing markers.
    start = email_text.find('Bright Apps LLC')
    if start == -1:
        start = 0
    end = email_text.find('To read more', start)
    if end == -1:
        end = len(email_text)
    email_body = email_text[start:end].strip()

    # get rid of non-text elements
    for token in ('\n', '\t', '\r', '</b>', '<b>', '\xa0'):
        email_body = email_body.replace(token, '')

    # count URLs, if any
    url_cnt = len(URLExtract().find_urls(email_body))

    # remove URLs, drop everything from the copyright sign on, and count
    # the characters that are not whitespace
    body = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', email_body)
    body = body.split('©', 1)[0]
    character_cnt = sum(not ch.isspace() for ch in body)

    return email_body, character_cnt, url_cnt
135
+
136
+
137
+ # extract email body from parse email
138
def email_body_extractor(email_data):
    """Extract the plain-text body of a raw email with its counts.

    Args:
        email_data: The raw email as UTF-8 encoded bytes.

    Returns:
        tuple: ``(body, character_cnt, url_cnt)`` where ``body`` is the
        text with escape sequences, bold tags and URLs removed and cut at
        the first '©', ``character_cnt`` is its number of non-whitespace
        characters, and ``url_cnt`` is the number of URLs found before
        they were removed.
    """
    message = email.message_from_string(email_data.decode("utf-8"))

    if message.is_multipart():
        # First text/plain part that is not an attachment; empty string
        # when there is none.
        body = next(
            (
                part.get_payload()
                for part in message.walk()
                if part.get_content_type() == 'text/plain'
                and 'attachment' not in str(part.get('Content-Disposition'))
            ),
            "",
        )
    else:
        # not multipart - i.e. plain text, no attachments
        body = message.get_payload()

    # Remove escape sequences and bold tags
    for token in ('\n', '\t', '\r', '</b>', '<b>'):
        body = body.replace(token, '')

    # Count the URLs before stripping them out
    url_cnt = len(URLExtract().find_urls(body))
    without_urls = re.sub(
        r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', without_urls_source := body) \
        if False else re.sub(
        r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', body)

    # Keep only what precedes the copyright sign
    text = without_urls.split('©', 1)[0]
    character_cnt = sum(1 for ch in text if not ch.isspace())

    return text, character_cnt, url_cnt
175
+
176
+
177
def add_bg_from_url():
    """Inject a CSS rule that gives the app a fixed linear-gradient background.

    Despite the name, no URL is involved: the rule targets the ``.stApp``
    selector and sets a two-colour ``linear-gradient`` as background image.
    """
    # Plain (non f-) string, so the CSS braces need no doubling.
    css = """
<style>
.stApp {
background-image: linear-gradient(#45eff5,#1C8D99);
background-attachment: fixed;
background-size: cover

}
</style>
"""
    st.markdown(css, unsafe_allow_html=True)
191
+
192
# Page chrome: background, title and a row of four status captions.
add_bg_from_url()
#linear-gradient(0deg,#010405 0,#061c2c 55%,#0a3144 75%,#0f4d60)

st.markdown("# Character Count: Email Industry")

stats_col1, stats_col2, stats_col3, stats_col4 = st.columns([1, 1, 1, 1])

# One caption per equal-width column, left to right.
_stat_captions = (
    (stats_col1, "Production: Ready"),
    (stats_col2, "Accuracy: 85%"),
    (stats_col3, "Speed: 16.89 ms"),
    (stats_col4, "Industry: Email"),
)
for _column, _caption in _stat_captions:
    with _column:
        st.caption(_caption)
208
+
209
+
210
# A bare `import PIL` does not load the PIL.Image submodule; PIL.Image.open
# only worked when another library had already imported it. Import it
# explicitly so the sidebar does not depend on import order.
import PIL.Image

# Sidebar: model description, model facts table and external links.
with st.sidebar:

    with st.expander('Model Description', expanded=False):
        img = PIL.Image.open("figures/ModelCC.png")
        st.image(img)
        # NOTE(review): this text states 86% accuracy while the page header
        # caption says 85% - confirm which figure is correct.
        st.markdown('Finding the correct length for an email campaign to maximize user engagement can be an ambiguous task. The Loxz Character Count Model allows you to predict the correct length of your emails for a particular industry and a particular type of email. Using these inputs and trained on an extensive proprietary data set from the Loxz family digital archive, the models incorporate real-world and synthetic data to find the optimized character counts. We applied the random forest algorithm in this model. Bootstrapping was also ensembled in the algorithm which effectively prevents overfitting by reducing variance. The model achieves an 86% accuracy on the test set. This inference-based ML model will help the campaign engineers start with an acceptable length and zero in on the best character count, maximizing engagement in their campaign.')

    with st.expander('Model Information', expanded=False):
        # CSS hiding the first header cell and the <th> cells of the table
        # body - per the variable name, meant to hide the row index.
        hide_table_row_index = """
<style>
thead tr th:first-child {display:none}
tbody th {display:none}
</style>
"""
        st.markdown(hide_table_row_index, unsafe_allow_html=True)
        st.table(table_data())

    # NOTE(review): indentation was lost in this view; the link buttons are
    # assumed to belong to the sidebar - confirm.
    url_button('Model Homepage', 'https://www.loxz.com/#/models/CTA')
    # url_button('Full Report','https://resources.loxz.com/reports/realtime-ml-character-count-model')
    url_button('Amazon Market Place', 'https://aws.amazon.com/marketplace')
230
+
231
+
232
# Options offered by the three select boxes. The order matters: the select
# boxes choose their default entry by position.
industry_lists = [
    'Retail', 'Software and Technology', 'Hospitality',
    'Academic and Education', 'Healthcare', 'Energy',
    'Real Estate', 'Entertainment', 'Finance and Banking',
]

campaign_types = [
    'Promotional', 'Transactional', 'Webinar', 'Survey', 'Newsletter',
    'Engagement', 'Usage_and_Consumption', 'Review_Request',
    'Product_Announcement', 'Abandoned_Cart',
]

target_variables = ['conversion_rate', 'click_to_open_rate']
261
+
262
# User inputs: the email to analyse plus industry, campaign type and target.
uploaded_file = st.file_uploader(
    "Please upload your email (In HTML Format)", type=["html"])

# upload_img is only ever set to None here and only when nothing has been
# uploaded; nothing else in the visible script reads it.
if uploaded_file is None:
    upload_img = None

# index= picks the option that is preselected in each box.
industry = st.selectbox('Please select your industry', industry_lists, index=6)
campaign = st.selectbox('Please select your campaign type', campaign_types, index=5)
target = st.selectbox('Please select your target variable', target_variables, index=1)

st.markdown("""---""")
291
+
292
+ #char_reco_preference = st.selectbox(
293
+ # 'Do you want to increase or decrease your character count in the email?',
294
+ # ["Increase", "Decrease"],
295
+ # index=1)
296
+
297
+
298
def get_files_from_aws(bucket, prefix):
    """Download a CSV object from S3 and load it into a DataFrame.

    The S3 client is created with the 'aws_id' and 'aws_key' entries of
    ``st.secrets``; the object is read fully and decoded as UTF-8.

    Args:
        bucket (str): bucket name.
        prefix (str): key of the object inside the bucket.

    Returns:
        pandas.DataFrame: the parsed CSV content.
    """
    credentials = {
        'aws_access_key_id': st.secrets["aws_id"],
        'aws_secret_access_key': st.secrets["aws_key"],
    }
    response = boto3.client('s3', **credentials).get_object(
        Bucket=bucket, Key=prefix)
    csv_text = response['Body'].read().decode('utf-8')
    return pd.read_csv(StringIO(csv_text))
315
+
316
+
317
+ # st.info([industry,campaign,target,char_reco_preference])
318
+
319
+
320
# Training-data column holding the observed rate for each selectable target,
# and the name under which that column is shown in the recommendation text.
_TARGET_COLUMNS = {
    'click_to_open_rate': 'Open_Rate',
    'conversion_rate': 'Click_Through_Rate',
}
_TARGET_LABELS = {
    'Open_Rate': 'Click-to-Open_Rate',
    'Click_Through_Rate': 'Conversion_Rate',
}


def _recommend_character_counts(training_dataset, industry_code, rate_column,
                                output_rate, character_cnt):
    """Return up to three training rows whose rate beats *output_rate*.

    Candidates are rows of the given industry whose character count lies
    between half and one and a half times *character_cnt* (excluding the
    count itself). The best two longer and the best two shorter candidates
    are pooled and the top three by rate are returned.
    """
    df_reco = training_dataset[
        ["industry_code", "character_cnt", "url_cnt", rate_column]]
    # .copy() so the rounded column is written to an independent frame and
    # not to a slice of the training data.
    df_reco = df_reco[df_reco["industry_code"] == industry_code].copy()
    df_reco[rate_column] = df_reco[rate_column].apply(lambda x: round(x, 3))
    df_reco = df_reco.drop_duplicates(subset=rate_column)

    better = df_reco[rate_column] > output_rate
    counts = df_reco["character_cnt"]
    longer = df_reco[better & (counts > character_cnt)
                     & (counts <= (1.5 * character_cnt))]
    shorter = df_reco[better & (counts < character_cnt)
                      & (counts >= (character_cnt / 2))]

    # Per side: keep the best row for each character count, then the top two.
    best_per_side = [
        side.sort_values(by=[rate_column], ascending=False)
            .drop_duplicates(subset=["character_cnt"])
            .nlargest(2, [rate_column])
        for side in (longer, shorter)
    ]
    return pd.concat(best_per_side).nlargest(3, [rate_column])


def _plot_recommendations(chars, sel_var_values):
    """Draw a horizontal bar chart of target rates per character count."""
    fig, ax = plt.subplots(figsize=(10, 4))
    positions = np.arange(len(chars))
    bars = ax.barh(positions, sel_var_values, height=0.175, color='#0F4D60')

    ax.set_yticks(positions)
    ax.set_yticklabels(np.array(chars), fontsize=14)
    ax.set_title('Character Counts vs. Target Variable Rates', fontsize=18)
    ax.set_ylabel('Character Counts', fontsize=16)
    ax.set_xlabel('Target Rates %', fontsize=16)

    # Print each bar's value just to the right of the bar.
    for value, bar in zip(sel_var_values, bars):
        ax.text(bar.get_width() + 0.3, bar.get_y() + bar.get_height()/2, str(round(value, 2)) + '%', ha='left', va='center', fontsize=12, fontweight='bold')

    ax.margins(0.1, 0.05)

    # Highlight the best recommendation.
    bars[np.argmax(sel_var_values)].set_color('#00BF93')
    return fig


def _generate_predictions():
    """Predict the rate for the uploaded email and show recommendations.

    Reads the module-level ``uploaded_file``, ``industry`` and ``target``
    selections, loads the model and the training data, writes the
    prediction and up to three alternative character counts to the page.
    """
    placeholder = st.empty()
    placeholder.text('Loading Data')
    try:
        model = joblib.load('models/models.sav')
        # email_data only provides the industry -> code mapping. The raw
        # training file and the renamed copies built from email_data were
        # never used, so they are no longer downloaded or computed.
        email_data = get_files_from_aws(
            'emailcampaigntrainingdata', 'trainingdata/email_dataset_training.csv')
        training_dataset = get_files_from_aws(
            'emailcampaigntrainingdata', 'modelCC/training.csv')

        industry_code_dict = get_industry_code_dict(email_data)
        selected_industry_code = industry_code_dict.get(industry)
        if selected_industry_code is None:
            # Without a code the model input would contain a missing value.
            st.error('Sorry, the training data has no entries for the industry you selected.')
            return

        _, character_cnt, url_cnt = email_extractor(uploaded_file)

        # Single-row model input, columns in the order used so far.
        df_uploaded_test = pd.DataFrame({
            "industry_code": [selected_industry_code],
            "character_cnt": [character_cnt],
            "url_cnt": [url_cnt],
        })
        # NOTE(review): the same model is used whichever target is
        # selected; only the labels below change - confirm this is intended.
        predicted_rate = model.predict(df_uploaded_test)[0]
        output_rate = round(predicted_rate, 4)

        if output_rate < 0:
            # Shown on the page; print() only reached the server console.
            st.error(
                "Sorry, Current model couldn't provide predictions on the target variable you selected.")
            return

        st.markdown('#### Current Character Count in Your Email is: <span style="color:blue">{}</span>'.format(
            character_cnt), unsafe_allow_html=True)

        if target == 'conversion_rate':
            target_vis = 'Click_Through_Rate'
        else:
            target_vis = 'Open_Rate'

        st.markdown('#### The model predicts that it achieves a <span style="color:blue">{}</span> of <span style="color:blue">{}</span>%'.format(
            target_vis, str(round(output_rate*100, 3))), unsafe_allow_html=True)

        # Column used for filtering vs. the name shown to the user; kept in
        # separate variables so rows can be read by column label.
        rate_column = _TARGET_COLUMNS[target]
        rate_label = _TARGET_LABELS[rate_column]

        df_reco_opt_rank = _recommend_character_counts(
            training_dataset, selected_industry_code, rate_column,
            output_rate, character_cnt)

        st.markdown('#### To get higher, <span style="color:blue">{}</span>, the model recommends the following options:'.format(
            rate_label), unsafe_allow_html=True)
        if len(df_reco_opt_rank) == 0:
            st.markdown('#### You ve already achieved the highest, <span style="color:blue">{}</span>, with the current character count!'.format(
                rate_label), unsafe_allow_html=True)
            return

        chars = []
        sel_var_values = []
        for _, row in df_reco_opt_rank.iterrows():
            # Read by label; integer keys on a label-indexed row are
            # deprecated positional access.
            Character_Count = row["character_cnt"]
            selected_variable_number = row[rate_column]
            chars.append(int(Character_Count))
            sel_var_values.append(round(selected_variable_number, 3)*100)
            st.write(f"·Number of Characters: {int(Character_Count)}, Target Rate: {round(round(selected_variable_number, 3)*100, 3)}", "%")
        st.write("\n")

        if len(chars) > 1:
            fig = _plot_recommendations(chars, sel_var_values)
            # NOTE(review): a matplotlib figure is handed to plotly_chart;
            # st.pyplot may be the intended call - confirm.
            st.plotly_chart(fig, use_container_width=True)
            # Release the figure; pyplot otherwise keeps every figure
            # created across reruns alive.
            plt.close(fig)

        st.write("\n")
    finally:
        # Clear the loading message on every path, including errors.
        placeholder.empty()


if st.button('Generate Predictions'):
    if uploaded_file is None:
        st.error('Please upload a email (HTML format)')
    else:
        _generate_predictions()
requirements.txt CHANGED
@@ -1,5 +1,5 @@
1
- transformers
2
- torch
3
  streamlit
4
  altair<5
5
  pickle5
 
1
+ # transformers
2
+ # torch
3
  streamlit
4
  altair<5
5
  pickle5