import numpy as np

# Characters treated as token boundaries: space, digits, ASCII punctuation,
# and a few common Unicode symbols. Every separator except the space is
# emitted as its own single-character token.
s = " `1234567890-=~!@#$%^&*()_+[;,{:<];.}:>\\'/|\"?\n–№…«»→"

def split(text):
    """Split text on the separator characters in `s`.

    Separators other than the space are kept as standalone tokens,
    e.g. split("Hello, world!") -> ["Hello", ",", "world", "!"].
    """
    o = []   # output tokens
    t = ""   # current word being accumulated
    for i in text + " ":   # trailing space flushes the last word
        if i in s:
            if t != "":
                o.append(t)
                t = ""
            if i != " ":
                o.append(i)   # keep non-space separators as tokens
        else:
            t += i
    return o

def tokenize_2str(text: str):
    """Split text into string tokens, separating the suffix "es" into its
    own "<es>" token, e.g. "boxes" -> ["box", "<es>"]."""
    tokens = split(text)

    o = []
    for i in tokens:
        # Only strip "es" when something is left of the word afterwards,
        # so the bare token "es" is not turned into an empty string.
        if len(i) > 2 and i[-2:] == "es":
            o.append(i[:-2])
            o.append("<es>")
        else:
            o.append(i)
    return o

# Vocabulary shared by the fit_* and tokenize functions: index -> token and
# token -> index, seeded with the special tokens <NULL>, <UNK> (unknown) and <es>.
ind2text = ["<NULL>", "<UNK>", "<es>"]
text2ind = {"<NULL>": 0, "<UNK>": 1, "<es>": 2}

def fit_on_text(text: str):
    """Add every previously unseen token of `text` to the vocabulary."""
    tokens = tokenize_2str(text)
    for i in tokens:
        if i not in text2ind:   # dict lookup instead of scanning the list
            ind2text.append(i)
            text2ind[i] = len(ind2text) - 1

def fit_on_texts(texts):
    """Fit the vocabulary on an iterable of texts."""
    for text in texts:
        fit_on_text(text)

def tokenize(text: str):
    """Encode text as a NumPy array of vocabulary indices.

    Tokens that were never seen during fitting map to the <UNK> index.
    """
    tokens = tokenize_2str(text)

    o = []
    for i in tokens:
        o.append(text2ind.get(i, text2ind["<UNK>"]))
    return np.array(o)
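
# --- Illustrative usage (added sketch, not part of the original module) ---
# A minimal example of how the pieces fit together: build the vocabulary
# with fit_on_texts, then encode new text with tokenize. The corpus and
# sentences below are made-up for demonstration.
if __name__ == "__main__":
    corpus = [
        "The cat chases the boxes.",
        "Foxes run past wolves.",
    ]
    fit_on_texts(corpus)

    # Known tokens map to their learned indices; unseen tokens map to <UNK> (index 1).
    print(tokenize("The cat chases foxes."))
    print(tokenize("Completely unseen words."))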