cd14 committed on
Commit
7fa4ab9
·
1 Parent(s): 220c030

updated email extractor

Browse files
Files changed (1) hide show
  1. app.py +26 -1
app.py CHANGED
@@ -135,6 +135,31 @@ def email_extractor(email_uploaded):
135
 
136
  return email_body, character_cnt, url_cnt
137
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
  # extract email body from the parsed email
140
  def email_body_extractor(email_data):
@@ -363,7 +388,7 @@ if st.session_state.get('button') == True:
363
  #uploaded_file = FileChooser(uploaded_file)
364
  #bytes_data = uploaded_file.getvalue()
365
 
366
- email_body, character_cnt, url_cnt = email_extractor(uploaded_file)
367
 
368
  # Start the prediction
369
  # Need to solve X test issue
 
135
 
136
  return email_body, character_cnt, url_cnt
137
 
138
+ def email_extractor_general(email_uploaded):
139
+ parse = parse_email(email_uploaded)
140
+ email_text = ''.join(parse).strip()
141
+
142
+ # get rid of non-text elements
143
+ email_text = email_text.replace('\n', '')
144
+ email_text = email_text.replace('\t', '')
145
+ email_text = email_text.replace('\r', '')
146
+ email_text = email_text.replace('</b>', '')
147
+ email_text = email_text.replace('<b>', '')
148
+ email_text = email_text.replace('\xa0', '')
149
+
150
+ # count the URLs, if any
151
+ extractor = URLExtract()
152
+ urls = extractor.find_urls(email_text)
153
+ url_cnt = len(urls)
154
+
155
+ # remove URLs and get character count
156
+ body = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', email_text)
157
+ sep = '©'
158
+ body = body.split(sep, 1)[0]
159
+ character_cnt = sum(not chr.isspace() for chr in body)
160
+
161
+ return email_text, character_cnt, url_cnt
162
+
163
 
164
  # extract email body from the parsed email
165
  def email_body_extractor(email_data):
 
388
  #uploaded_file = FileChooser(uploaded_file)
389
  #bytes_data = uploaded_file.getvalue()
390
 
391
+ email_body, character_cnt, url_cnt = email_extractor_general(uploaded_file)
392
 
393
  # Start the prediction
394
  # Need to solve X test issue