# -*- coding: utf-8 -*-
'''
Provides functionality for converting a given list of tokens (words) into
numbers, according to the given vocabulary.
'''
from __future__ import print_function, division, unicode_literals

import numbers
import numpy as np

from torchmoji.create_vocab import extend_vocab, VocabBuilder
from torchmoji.word_generator import WordGenerator
from torchmoji.global_variables import SPECIAL_TOKENS

# import torch

from sklearn.model_selection import train_test_split
from copy import deepcopy


class SentenceTokenizer():
    """ Create numpy array of tokens corresponding to input sentences.
        The vocabulary can include Unicode tokens.
    """
    def __init__(self, vocabulary, fixed_length, custom_wordgen=None,
                 ignore_sentences_with_only_custom=False, masking_value=0,
                 unknown_value=1):
        """ Needs a dictionary as input for the vocabulary.
        """

        if len(vocabulary) > np.iinfo('uint16').max:
            raise ValueError('Dictionary is too big ({} tokens) for the numpy '
                             'datatypes used (max limit={}). Reduce vocabulary'
                             ' or adjust code accordingly!'
                             .format(len(vocabulary), np.iinfo('uint16').max))

        # Shouldn't be able to modify the given vocabulary
        self.vocabulary = deepcopy(vocabulary)
        self.fixed_length = fixed_length
        self.ignore_sentences_with_only_custom = ignore_sentences_with_only_custom
        self.masking_value = masking_value
        self.unknown_value = unknown_value

        # Initialized with an empty stream of sentences that must then be fed
        # to the generator at a later point for reusability.
        # A custom word generator can be used for domain-specific filtering etc.
        if custom_wordgen is not None:
            assert custom_wordgen.stream is None
            self.wordgen = custom_wordgen
            self.uses_custom_wordgen = True
        else:
            self.wordgen = WordGenerator(None, allow_unicode_text=True,
                                         ignore_emojis=False,
                                         remove_variation_selectors=True,
                                         break_replacement=True)
            self.uses_custom_wordgen = False
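
    # Usage sketch (illustrative only; the vocabulary below is hypothetical and
    # not shipped with this module). A vocabulary maps each token string to an
    # integer index, with the special tokens assumed to occupy the lowest
    # indices (0 for masking, 1 for unknown words):
    #
    #   vocab = {'CUSTOM_MASK': 0, 'CUSTOM_UNKNOWN': 1, 'i': 2, 'love': 3, 'pizza': 4}
    #   st = SentenceTokenizer(vocab, fixed_length=5)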

    def tokenize_sentences(self, sentences, reset_stats=True, max_sentences=None):
        """ Converts a given list of sentences into a numpy array according to
            its vocabulary.

        # Arguments:
            sentences: List of sentences to be tokenized.
            reset_stats: Whether the word generator's stats should be reset.
            max_sentences: Maximum number of sentences. Must be set if the
                length cannot be inferred from the input.

        # Returns:
            Numpy array of the tokenized sentences with masking,
            infos,
            stats

        # Raises:
            ValueError: When the maximum length is not set and cannot be inferred.
        """

        if max_sentences is None and not hasattr(sentences, '__len__'):
            raise ValueError('Either you must provide an array with a length '
                             'attribute (e.g. a list) or specify the maximum '
                             'length yourself using `max_sentences`!')
        n_sentences = (max_sentences if max_sentences is not None
                       else len(sentences))

        if self.masking_value == 0:
            tokens = np.zeros((n_sentences, self.fixed_length), dtype='uint16')
        else:
            tokens = (np.ones((n_sentences, self.fixed_length), dtype='uint16')
                      * self.masking_value)

        if reset_stats:
            self.wordgen.reset_stats()

        # With a custom word generator, info can be extracted from each
        # sentence (e.g. labels)
        infos = []

        # The word generator yields words as strings, which are then mapped
        # to the vocabulary
        self.wordgen.stream = sentences

        next_insert = 0
        n_ignored_unknowns = 0
        for s_words, s_info in self.wordgen:
            s_tokens = self.find_tokens(s_words)

            # Optionally skip sentences that consist solely of special tokens
            # (i.e. no word was found in the vocabulary proper)
            if (self.ignore_sentences_with_only_custom and
                    all(t < len(SPECIAL_TOKENS) for t in s_tokens)):
                n_ignored_unknowns += 1
                continue

            if len(s_tokens) > self.fixed_length:
                s_tokens = s_tokens[:self.fixed_length]
            tokens[next_insert, :len(s_tokens)] = s_tokens
            infos.append(s_info)
            next_insert += 1

        # For standard word generators all sentences should be tokenized.
        # This is not necessarily the case for custom word generators, as they
        # may filter the sentences etc.
        if not self.uses_custom_wordgen and not self.ignore_sentences_with_only_custom:
            assert len(sentences) == next_insert
        else:
            # adjust based on actual tokens received
            tokens = tokens[:next_insert]
            infos = infos[:next_insert]

        return tokens, infos, self.wordgen.stats
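
    # Continuing the sketch above (hypothetical inputs, for illustration only):
    #
    #   tokens, infos, stats = st.tokenize_sentences([u'I love pizza'])
    #   # `tokens` is a (1, fixed_length) uint16 array; positions beyond the
    #   # sentence keep `masking_value`, and out-of-vocabulary words are
    #   # mapped to `unknown_value`.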

    def find_tokens(self, words):
        """ Maps a list of words to their vocabulary indices, substituting
            `unknown_value` for words that are not in the vocabulary.
        """
        assert len(words) > 0
        tokens = []
        for w in words:
            try:
                tokens.append(self.vocabulary[w])
            except KeyError:
                tokens.append(self.unknown_value)
        return tokens

    def split_train_val_test(self, sentences, info_dicts,
                             split_parameter=[0.7, 0.1, 0.2], extend_with=0):
        """ Splits given sentences into three different datasets: training,
            validation and testing.

        # Arguments:
            sentences: The sentences to be tokenized.
            info_dicts: A list of dicts that contain information about each
                sentence (e.g. a label).
            split_parameter: A parameter for deciding the splits between the
                three different datasets. If, instead of three values, three
                lists are passed, these will be used to specify which
                observations belong to which dataset.
            extend_with: An optional parameter. If > 0 then this is the number
                of tokens added to the vocabulary from this dataset. The
                expanded vocab will be generated using only the training set,
                but is applied to all three sets.

        # Returns:
            List of three lists of tokenized sentences,
            List of three corresponding dictionaries with information,
            How many tokens have been added to the vocab. Make sure to extend
            the embedding layer of the model accordingly.
        """

        # If passed three lists, use those directly
        if isinstance(split_parameter, list) and \
                all(isinstance(x, list) for x in split_parameter) and \
                len(split_parameter) == 3:

            # Helper function to verify provided indices are numbers in range
            def verify_indices(inds):
                return list(filter(lambda i: isinstance(i, numbers.Number) and
                                   i < len(sentences), inds))

            ind_train = verify_indices(split_parameter[0])
            ind_val = verify_indices(split_parameter[1])
            ind_test = verify_indices(split_parameter[2])
        else:
            # Split sentences and dicts
            ind = list(range(len(sentences)))
            ind_train, ind_test = train_test_split(ind, test_size=split_parameter[2])
            ind_train, ind_val = train_test_split(ind_train, test_size=split_parameter[1])

        # Map indices to data
        train = np.array([sentences[x] for x in ind_train])
        test = np.array([sentences[x] for x in ind_test])
        val = np.array([sentences[x] for x in ind_val])

        info_train = np.array([info_dicts[x] for x in ind_train])
        info_test = np.array([info_dicts[x] for x in ind_test])
        info_val = np.array([info_dicts[x] for x in ind_val])

        added = 0

        # Extend vocabulary with training set tokens
        if extend_with > 0:
            wg = WordGenerator(train)
            vb = VocabBuilder(wg)
            vb.count_all_words()
            added = extend_vocab(self.vocabulary, vb, max_tokens=extend_with)

        # Wrap results
        result = [self.tokenize_sentences(s)[0] for s in [train, val, test]]
        result_infos = [info_train, info_val, info_test]
        # if type(result_infos[0][0]) in [np.double, np.float, np.int64, np.int32, np.uint8]:
        #     result_infos = [torch.from_numpy(label).long() for label in result_infos]

        return result, result_infos, added
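
    # Sketch of splitting (illustrative; `sentences` and `labels` are
    # hypothetical stand-ins for real data):
    #
    #   (train_t, val_t, test_t), (train_i, val_i, test_i), n_added = \
    #       st.split_train_val_test(sentences, labels,
    #                               split_parameter=[0.7, 0.1, 0.2],
    #                               extend_with=1000)
    #   # `n_added` tokens were appended to st.vocabulary; the model's
    #   # embedding layer must be extended by the same amount.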

    def to_sentence(self, sentence_idx):
        """ Converts a tokenized sentence back to a string of words.

        # Arguments:
            sentence_idx: List of numbers, representing a tokenized sentence
                given the current vocabulary.

        # Returns:
            String created by converting all numbers back to words and joining
            them together with spaces.
        """
        # Have to recalculate the mappings in case the vocab was extended.
        ind_to_word = {ind: word for word, ind in self.vocabulary.items()}

        sentence_as_list = [ind_to_word[x] for x in sentence_idx]
        cleaned_list = [x for x in sentence_as_list if x != 'CUSTOM_MASK']
        return " ".join(cleaned_list)


def coverage(dataset, verbose=False):
    """ Computes the fraction of tokens in a given tokenized dataset that were
        found in the vocabulary (i.e. not mapped to the unknown token).

    # Arguments:
        dataset: Tokenized dataset to be checked.
        verbose: Verbosity flag.

    # Returns:
        Fraction of tokens covered by the vocabulary.
    """
    # Index 0 is assumed to be the masking value and index 1 the unknown
    # token, matching the SentenceTokenizer defaults.
    n_total = np.count_nonzero(dataset)
    n_unknown = np.sum(dataset == 1)
    coverage = 1.0 - float(n_unknown) / n_total

    if verbose:
        print("Unknown words: {}".format(n_unknown))
        print("Total words: {}".format(n_total))
        print("Coverage: {}".format(coverage))
    return coverage
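

if __name__ == '__main__':
    # Minimal usage sketch, not a definitive test. The vocabulary below is a
    # hypothetical toy example (a real vocabulary would normally be loaded
    # from the pretrained model's vocabulary file). It assumes the special
    # tokens occupy the lowest indices, with index 0 used for masking and
    # index 1 for unknown words, matching the defaults above.
    toy_vocab = {token: ind for ind, token in enumerate(SPECIAL_TOKENS)}
    toy_vocab.update({'i': len(toy_vocab),
                      'love': len(toy_vocab) + 1,
                      'pizza': len(toy_vocab) + 2})

    st = SentenceTokenizer(toy_vocab, fixed_length=10)
    tokens, infos, stats = st.tokenize_sentences([u'I love pizza',
                                                  u'I love bicycles'])
    print(tokens)
    coverage(tokens, verbose=True)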