import numpy as np

# Characters treated as token boundaries: space, digits, ASCII punctuation,
# and a few common Unicode symbols. Every separator except the space is
# emitted as its own single-character token.
s = " `1234567890-=~!@#$%^&*()_+[;,{:<];.}:>\\'/|\"?\n–№…«»→"

def split(text):
    """Split text on the separator characters in `s`.

    Separators other than the space are kept as standalone tokens,
    e.g. split("Hello, world!") -> ["Hello", ",", "world", "!"].
    """
    o = []   # output tokens
    t = ""   # current word being accumulated
    for i in text + " ":   # trailing space flushes the last word
        if i in s:
            if t != "":
                o.append(t)
                t = ""
            if i != " ":
                o.append(i)   # keep non-space separators as tokens
        else:
            t += i
    return o

def tokenize_2str(text: str):
    """Split text into string tokens, separating the suffix "es" into its
    own "<es>" token, e.g. "boxes" -> ["box", "<es>"]."""
    tokens = split(text)

    o = []
    for i in tokens:
        # Only strip "es" when something is left of the word afterwards,
        # so the bare token "es" is not turned into an empty string.
        if len(i) > 2 and i[-2:] == "es":
            o.append(i[:-2])
            o.append("<es>")
        else:
            o.append(i)
    return o

# Vocabulary shared by the fit_* and tokenize functions: index -> token and
# token -> index, seeded with the special tokens <NULL>, <UNK> (unknown) and <es>.
ind2text = ["<NULL>", "<UNK>", "<es>"]
text2ind = {"<NULL>": 0, "<UNK>": 1, "<es>": 2}

def fit_on_text(text: str):
    """Add every previously unseen token of `text` to the vocabulary."""
    tokens = tokenize_2str(text)
    for i in tokens:
        if i not in text2ind:   # dict lookup instead of scanning the list
            ind2text.append(i)
            text2ind[i] = len(ind2text) - 1

def fit_on_texts(texts):
    """Fit the vocabulary on an iterable of texts."""
    for text in texts:
        fit_on_text(text)

def tokenize(text: str):
    """Encode text as a NumPy array of vocabulary indices.

    Tokens that were never seen during fitting map to the <UNK> index.
    """
    tokens = tokenize_2str(text)

    o = []
    for i in tokens:
        o.append(text2ind.get(i, text2ind["<UNK>"]))
    return np.array(o)
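
# --- Illustrative usage (added sketch, not part of the original module) ---
# A minimal example of how the pieces fit together: build the vocabulary
# with fit_on_texts, then encode new text with tokenize. The corpus and
# sentences below are made-up for demonstration.
if __name__ == "__main__":
    corpus = [
        "The cat chases the boxes.",
        "Foxes run past wolves.",
    ]
    fit_on_texts(corpus)

    # Known tokens map to their learned indices; unseen tokens map to <UNK> (index 1).
    print(tokenize("The cat chases foxes."))
    print(tokenize("Completely unseen words."))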