'''
Split a given dataset into three different datasets: training, validation and
testing.
This is achieved by splitting the given list of sentences into three separate
lists, either according to a given ratio (e.g. [0.7, 0.1, 0.2]) or by an
explicit enumeration of indices. The sentences are also tokenised using the
given vocabulary.
A given list of dictionaries containing information about each sentence is
split in the same way.
The optional parameter 'extend_with' extends the given vocabulary with up to
'extend_with' tokens taken from the training dataset.
'''
from __future__ import print_function, unicode_literals
import example_helper  # sets up sys.path so the torchmoji package can be imported from the examples dir
import json
from torchmoji.sentence_tokenizer import SentenceTokenizer
DATASET = [
    'I am sentence 0',
    'I am sentence 1',
    'I am sentence 2',
    'I am sentence 3',
    'I am sentence 4',
    'I am sentence 5',
    'I am sentence 6',
    'I am sentence 7',
    'I am sentence 8',
    'I am sentence 9 newword',
]
INFO_DICTS = [
    {'label': 'sentence 0'},
    {'label': 'sentence 1'},
    {'label': 'sentence 2'},
    {'label': 'sentence 3'},
    {'label': 'sentence 4'},
    {'label': 'sentence 5'},
    {'label': 'sentence 6'},
    {'label': 'sentence 7'},
    {'label': 'sentence 8'},
    {'label': 'sentence 9'},
]
with open('../model/vocabulary.json', 'r') as f:
    vocab = json.load(f)

# 30 is the maximum sentence length (in tokens) used when tokenising
st = SentenceTokenizer(vocab, 30)
# Split using the default split ratio
print(st.split_train_val_test(DATASET, INFO_DICTS))
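# With no split parameter, the sentences are expected to be divided according to
# the default ratio described in the docstring (e.g. roughly 70/10/20 for
# train/validation/test).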
# Split according to an explicit enumeration of indices, extending the
# vocabulary with up to 1 token taken from the training set
print(st.split_train_val_test(DATASET,
                              INFO_DICTS,
                              [[0, 1, 2, 4, 9], [5, 6], [7, 8, 3]],
                              extend_with=1))
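
# Hedged sketch (not part of the original example): assuming the call returns the
# tokenised splits, the matching info-dict splits, and the tokens added to the
# vocabulary, the result could be unpacked roughly as below. The variable names
# are ours; check torchmoji's SentenceTokenizer for the authoritative return value.
# splits, info_splits, added = st.split_train_val_test(DATASET, INFO_DICTS, extend_with=1)
# train_tokens, val_tokens, test_tokens = splits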