Spaces:
Build error
Build error
File size: 3,393 Bytes
f3772cc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
from fastcore.basics import listify
from fastcore.utils import compose
import unicodedata
from string import punctuation
import html
from itertools import groupby
import re
# One or more carriage-return / newline / tab characters (collapsed as a run).
control_char_regex = re.compile(r'[\r\n\t]+')
# Loose URL matcher: optional http(s):// scheme, a host ending in a 2-6 letter
# TLD, then any trailing run of common URL characters.
url_regex = re.compile(
    r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*')
# Twitter-style @handle (1-15 word chars) not preceded by '@' or a word char.
username_regex = re.compile(r'(^|[^@\w])@(\w{1,15})\b')
def fix_html(text):
    """Undo common HTML-entity fragments and escape artefacts.

    Each element of `text` has stray entity remnants ('#39;', 'amp;', ...),
    literal escape sequences ('\\n', '\\"'), tokenizer artefacts ('<unk>',
    ' @.@ ', ' @-@ ') and '...' replaced with their intended characters,
    then `html.unescape` is applied for any remaining entities.

    Always returns a list of cleaned strings (a bare string input is
    wrapped in a one-element list by `listify`).
    """
    # FIX: the '...' replacement previously contained mojibake; the intended
    # target is ' \u2026' (space + horizontal ellipsis).
    return [
        html.unescape(
            e.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'")
             .replace('nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n")
             .replace('quot;', "'").replace('<br />', "\n").replace('\\"', '"')
             .replace('<unk>', ' ').replace(' @.@ ', '.').replace(' @-@ ', '-')
             .replace('...', ' \u2026')
        )
        for e in listify(text)
    ]
def remove_control_char(text):
    """Replace each run of '\\r', '\\n' or '\\t' with a single '.'.

    Returns a list of cleaned strings; a bare string input is wrapped in a
    one-element list by `listify`.
    """
    return [control_char_regex.sub('.', item) for item in listify(text)]
def remove_remaining_control_chars(text):
    """Drop every character in a Unicode 'C*' category (control, format, ...).

    Returns a list of cleaned strings; a bare string input is wrapped in a
    one-element list by `listify`.
    """
    cleaned = []
    for item in listify(text):
        kept = [ch for ch in item
                if not unicodedata.category(ch).startswith('C')]
        cleaned.append(''.join(kept))
    return cleaned
def remove_unicode_symbols(text):
    """Strip characters in Unicode category 'So' (Symbol, other — e.g. emoji
    and pictographs) from each element of `text`.

    Returns a list of cleaned strings; a bare string input is wrapped in a
    one-element list by `listify`.
    """
    tmp_ls = []
    for e in listify(text):
        # BUG FIX: the original compared category(ch)[0] (a single letter
        # such as 'S') against the two-letter code 'So', which can never be
        # equal — so no character was ever removed. Compare the full
        # category code instead.
        tmp_ls.append(
            ''.join(ch for ch in e if unicodedata.category(ch) != 'So'))
    text = tmp_ls
    return text
def standardise_punc(text):
    """Map typographic quotes, the acute accent and the en dash to plain
    ASCII equivalents in each element of `text`.

    Returns a list of translated strings; a bare string input is wrapped in
    a one-element list by `listify`.
    """
    # FIX: the original zip() source string was mojibake-corrupted (one of
    # its "characters" was a two-byte artefact), which mis-paired source and
    # replacement characters. Restore the intended explicit mapping.
    transl_table = {
        0x2018: ord("'"),  # left single quotation mark
        0x2019: ord("'"),  # right single quotation mark
        0x00B4: ord("'"),  # acute accent
        0x201C: ord('"'),  # left double quotation mark
        0x201D: ord('"'),  # right double quotation mark
        0x2013: ord('-'),  # en dash
    }
    return [e.translate(transl_table) for e in listify(text)]
def remove_news_tags(text):
    """Delete SGML-style tags whose name starts with a capital letter
    (e.g. '<Headline>' / '</Headline>') from each element of `text`.

    Returns a list of cleaned strings; a bare string input is wrapped in a
    one-element list by `listify`.
    """
    tag_pattern = r"(<[A-Z].+?>)|(</[A-Z].+?>)"
    stripped = []
    for item in listify(text):
        stripped.append(re.sub(tag_pattern, "", item))
    return stripped
def replace_urls(text):
    """Remove anchor/ref tags and URL-like substrings from each element.

    Anchor opening/closing tags and '<ref...>' tags are deleted first, then
    anything matching the module-level `url_regex` is replaced with the
    empty-string filler. Returns a list of cleaned strings; a bare string
    input is wrapped in a one-element list by `listify`.
    """
    filler = ''
    results = []
    for item in listify(text):
        without_tags = re.sub(r"(<a.+?>)|(</a>)|(<ref.+?>)", "", item)
        results.append(url_regex.sub(filler, without_tags))
    return results
def replace_usernames(text):
    """Remove '@<user>' placeholder tokens and Twitter-style @handles from
    each element of `text`.

    Returns a list of cleaned strings; a bare string input is wrapped in a
    one-element list by `listify`.
    """
    filler, tmp_ls = '', []
    for e in listify(text):
        # str.replace already substitutes every occurrence in one call, so
        # the original per-'@' loop (count('@') iterations) was redundant.
        e = e.replace('@<user>', f'{filler}')
        # replace other user handles by filler
        e = re.sub(username_regex, filler, e)
        tmp_ls.append(e)
    text = tmp_ls
    return text
def remove_duplicate_punctuation(text):
    """Collapse immediately repeated words ('go go go' -> 'go') and runs of
    identical punctuation characters ('!!!' -> '!') in each element.

    Returns a list of cleaned strings; a bare string input is wrapped in a
    one-element list by `listify`.
    """
    # Hoisted out of the loop: the punctuation set is loop-invariant and was
    # previously rebuilt for every element.
    punc = set(punctuation)
    tmp_ls = []
    for e in listify(text):
        # Collapse a word immediately followed by one or more copies of itself.
        e = re.sub(r'\b(\w+)( \1\b)+', r'\1', e)
        newtext = []
        for ch, run in groupby(e):
            if ch in punc:
                newtext.append(ch)      # keep a single char from a punctuation run
            else:
                newtext.extend(run)     # non-punctuation runs pass through intact
        tmp_ls.append(''.join(newtext))
    text = tmp_ls
    return text
def remove_multi_space(text):
    """Collapse every whitespace run to a single space and trim the ends.

    Returns a list of cleaned strings; a bare string input is wrapped in a
    one-element list by `listify`.
    """
    return [' '.join(item.split()) for item in listify(text)]
# Full cleaning pipeline. NOTE(review): fastcore's `compose` applies its
# arguments left-to-right, so fix_html runs first and remove_multi_space
# last — confirm against the fastcore version pinned by this project.
clean_text_funcs = compose(*[fix_html, remove_control_char, remove_remaining_control_chars, remove_unicode_symbols,
                             standardise_punc, remove_news_tags, replace_urls, replace_usernames, remove_duplicate_punctuation, remove_multi_space])
|