saied committed on
Commit
ad582b6
1 Parent(s): 79fa2a7

Some modifications to preprocessing / URL removal

Browse files
Files changed (1) hide show
  1. src/data_utils.py +4 -9
src/data_utils.py CHANGED
@@ -46,19 +46,14 @@ def clean_url(text):
46
  result = result.replace(' ', '')
47
  result = result.split(':')
48
  for phrase in result:
49
- p = phrase.replace(' ', '')
50
- # text = text.replace(p, "")
51
- if "/ /" or "//" in p:
52
- if ('https :' + p) or ('https:' + p) in text:
53
  text = text.replace('https :' + p, '')
54
- text = text.replace('https:' + p, '')
55
- elif ('http :' + p) or ('http:' + p) in text:
56
  text = text.replace('http :' + p, '')
57
- text = text.replace('http:' + p, '')
58
  elif '@' in p:
59
  if p in text:
60
  text = text.replace(p, '')
61
- else:
62
- text = text.replace(p, "")
63
 
64
  return text
 
46
  result = result.replace(' ', '')
47
  result = result.split(':')
48
  for phrase in result:
49
+ p = phrase
50
+ if '/ /' in p:
51
+ if ('https :' + p) in text:
 
52
  text = text.replace('https :' + p, '')
53
+ elif ('http :' + p) in text:
 
54
  text = text.replace('http :' + p, '')
 
55
  elif '@' in p:
56
  if p in text:
57
  text = text.replace(p, '')
 
 
58
 
59
  return text