import numpy as np

# Characters treated as separators: whitespace, digits, ASCII punctuation,
# and a few common Unicode punctuation marks.
s = " `1234567890-=~!@#$%^&*()_+[;,{:<];.}:>\\'/|\"?\n–№…«»→"
def split(text):
    """Split text into word and punctuation tokens, dropping whitespace."""
    o = []
    t = ""
    for i in text + " ":      # trailing space flushes the final token
        if i in s:
            if t != "":
                o.append(t)   # flush the word accumulated so far
                t = ""
            if i != " ":
                o.append(i)   # punctuation becomes its own token
        else:
            t += i            # keep building the current word
    return o
def tokenize_2str(text: str):
    """Tokenize text into strings, splitting off a trailing "es" as <es>."""
    o = []
    for i in split(text):
        if i[-2:] == "es":    # crude suffix split, e.g. "boxes" -> "box", "<es>"
            o.append(i[:-2])
            o.append("<es>")
        else:
            o.append(i)
    return o
# Vocabulary: index -> token and token -> index, pre-seeded with special tokens.
ind2text = ["<NULL>", "<UNK>", "<es>"]
text2ind = {"<NULL>": 0, "<UNK>": 1, "<es>": 2}
def fit_on_text(text: str):
    """Add every previously unseen token in `text` to the vocabulary."""
    for i in tokenize_2str(text):
        if i not in text2ind:            # dict lookup; same membership as ind2text
            ind2text.append(i)
            text2ind[i] = len(ind2text) - 1
def fit_on_texts(texts):
    """Fit the vocabulary on an iterable of texts."""
    for text in texts:
        fit_on_text(text)
def tokenize(text: str):
    """Convert text into a NumPy array of token indices; unknown tokens map to <UNK>."""
    o = []
    for i in tokenize_2str(text):
        if i in text2ind:                # dict lookup instead of scanning ind2text
            o.append(text2ind[i])
        else:
            o.append(text2ind["<UNK>"])
    return np.array(o)
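
# Illustrative usage (a minimal sketch; the sample sentences below are made up,
# not part of the original file): fit the vocabulary on a small corpus, then
# encode known and unseen text.
if __name__ == "__main__":
    corpus = ["The cat sees two boxes.", "The dog sees one box!"]
    fit_on_texts(corpus)                        # build the vocabulary from the corpus
    print(tokenize("The cat sees the boxes"))   # known tokens -> their indices
    print(tokenize("An unseen word"))           # unknown tokens -> index of <UNK> (1)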