|
import json
from collections import Counter

import numpy as np
|
|
|
class Tokenizer(object):
    """Convert texts to sequences of integer token indices and back.

    Tokens are either single characters (``char_level=True``) or the pieces
    obtained by splitting each text on a single space (``char_level=False``).
    When no vocabulary is supplied, the pad token gets index 0 and the
    out-of-vocabulary (oov) token gets index 1.
    """

    def __init__(self, char_level, num_tokens=None,
                 pad_token="<PAD>", oov_token="<UNK>",
                 token_to_index=None):
        """Create a tokenizer.

        Args:
            char_level: If truthy, tokenize per character; otherwise split
                texts on a single space.
            num_tokens: Total vocabulary budget *including* the pad and oov
                tokens. ``None`` (or 0) keeps every token seen while fitting.
            pad_token: String used as the padding token.
            oov_token: String substituted for tokens missing from the
                vocabulary.
            token_to_index: Optional existing ``{token: index}`` mapping.
                When empty or ``None`` a fresh mapping holding only the pad
                and oov tokens is created.
        """
        self.char_level = char_level
        self.separator = "" if self.char_level else " "
        if num_tokens:
            # Two slots of the budget are reserved for the pad/oov tokens.
            num_tokens -= 2
        self.num_tokens = num_tokens
        self.pad_token = pad_token
        self.oov_token = oov_token
        if not token_to_index:
            token_to_index = {pad_token: 0, oov_token: 1}
        self.token_to_index = token_to_index
        self.index_to_token = {v: k for k, v in self.token_to_index.items()}

    def __len__(self):
        """Return the number of tokens in the vocabulary."""
        return len(self.token_to_index)

    def __str__(self):
        """Return a short description containing the vocabulary size."""
        return f"<Tokenizer(num_tokens={len(self)})>"

    def fit_on_texts(self, texts):
        """Extend the vocabulary with the most frequent tokens in ``texts``.

        Args:
            texts: Iterable of strings.

        Returns:
            This tokenizer, so calls can be chained.

        Side effects:
            Sets ``self.min_token_freq`` to the frequency of the least
            frequent token among those selected, or 0 when no token was
            selected (empty input or an exhausted ``num_tokens`` budget).
        """
        if not self.char_level:
            texts = [text.split(" ") for text in texts]
        counts = Counter(
            token for text in texts for token in text
        ).most_common(self.num_tokens)
        # ``counts`` is empty for empty input or a non-positive budget;
        # indexing it unconditionally would raise IndexError.
        self.min_token_freq = counts[-1][1] if counts else 0
        for token, _ in counts:
            if token in self.token_to_index:
                # Re-assigning a known token (e.g. the oov token appearing
                # literally in the text) would move it to a new index without
                # growing the vocabulary, so the next new token would collide
                # with it. Keep the existing index instead.
                continue
            index = len(self)
            self.token_to_index[token] = index
            self.index_to_token[index] = token
        return self

    def texts_to_sequences(self, texts):
        """Encode each text as a NumPy array of token indices.

        Tokens missing from the vocabulary are mapped to the index of the oov
        token.

        Args:
            texts: Iterable of strings.

        Returns:
            A list with one ``numpy.ndarray`` per input text.
        """
        sequences = []
        for text in texts:
            tokens = text if self.char_level else text.split(" ")
            sequence = [
                self.token_to_index.get(
                    token, self.token_to_index[self.oov_token])
                for token in tokens
            ]
            sequences.append(np.asarray(sequence))
        return sequences

    def sequences_to_texts(self, sequences):
        """Decode sequences of indices back into strings.

        Indices missing from the vocabulary are rendered as the oov token.

        Args:
            sequences: Iterable of iterables of token indices.

        Returns:
            A list with one string per input sequence, with tokens joined by
            ``self.separator``.
        """
        texts = []
        for sequence in sequences:
            tokens = [
                self.index_to_token.get(index, self.oov_token)
                for index in sequence
            ]
            texts.append(self.separator.join(tokens))
        return texts

    def save(self, fp):
        """Write the tokenizer configuration and vocabulary to a JSON file.

        ``pad_token`` is stored alongside ``oov_token`` so that a tokenizer
        with custom special tokens survives a ``save``/``load`` round trip.

        Args:
            fp: Path of the file to write.
        """
        contents = {
            "char_level": self.char_level,
            "pad_token": self.pad_token,
            "oov_token": self.oov_token,
            "token_to_index": self.token_to_index,
        }
        with open(fp, "w", encoding="utf-8") as file:
            json.dump(contents, file, indent=4, sort_keys=False)

    @classmethod
    def load(cls, fp):
        """Build a tokenizer from a JSON file produced by ``save``.

        Every top-level key of the file is passed to the constructor as a
        keyword argument.

        Args:
            fp: Path of the file to read.

        Returns:
            A new instance of ``cls``.
        """
        with open(fp, "r", encoding="utf-8") as file:
            kwargs = json.load(fp=file)
        return cls(**kwargs)
|
|