Spaces:

Pendrokar
/

DeepMoji

Running

DeepMoji / examples /dataset_split.py

torchmoji code

86a83a2 10 months ago

1.71 kB

	'''
	Split a given dataset into three different datasets: training, validation and
	testing.

	This is achieved by splitting the given list of sentences into three separate
	lists according to either a given ratio (e.g. [0.7, 0.1, 0.2]) or by an
	explicit enumeration. The sentences are also tokenised using the given
	vocabulary.

	Also splits a given list of dictionaries containing information about
	each sentence.

	An additional parameter, 'extend_with', can be set to extend the given
	vocabulary with up to 'extend_with' tokens taken from the training dataset.
	'''
	from __future__ import print_function, unicode_literals
	import example_helper
	import json

	from torchmoji.sentence_tokenizer import SentenceTokenizer

	# Ten toy sentences numbered 0-9. The last one additionally carries the
	# token 'newword' (presumably the word picked up by 'extend_with' in the
	# explicit split below -- confirm against the vocabulary file).
	DATASET = ['I am sentence {}'.format(_n) for _n in range(9)]
	DATASET.append('I am sentence 9 newword')

	# One info dictionary per sentence, labelled 'sentence 0' .. 'sentence 9'
	# in the same order as the sentences themselves.
	INFO_DICTS = [{'label': 'sentence {}'.format(_n)} for _n in range(10)]

	# Load the vocabulary used for tokenisation. The path is relative, so the
	# script has to be started from a directory where '../model' resolves.
	# Only the JSON load lives inside the 'with' body so the file handle is
	# closed as soon as the vocabulary has been read.
	with open('../model/vocabulary.json', 'r') as f:
	    vocab = json.load(f)

	# NOTE(review): 30 is presumably the fixed sentence length expected by
	# SentenceTokenizer -- confirm against its signature.
	st = SentenceTokenizer(vocab, 30)

	# Split using the default split ratio
	print(st.split_train_val_test(DATASET, INFO_DICTS))

	# Split explicitly: the three index lists enumerate which sentences go to
	# the training, validation and testing sets respectively. 'extend_with=1'
	# lets the vocabulary grow by up to one token taken from the training set.
	explicit_split = [[0, 1, 2, 4, 9], [5, 6], [7, 8, 3]]
	print(st.split_train_val_test(DATASET,
	                              INFO_DICTS,
	                              explicit_split,
	                              extend_with=1))