Spaces:

NCTCMumbai
/

NCTC

Running

App Files Files Community

NCTC / models /research /learned_optimizer /problems /datasets.py

NCTCMumbai

Upload 2571 files

0b8359d over 1 year ago

raw

history blame

7.4 kB

	# Copyright 2017 Google, Inc. All Rights Reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	# ==============================================================================

	"""Functions to generate or load datasets for supervised learning."""

	from __future__ import absolute_import
	from __future__ import division
	from __future__ import print_function

	from collections import namedtuple

	import numpy as np
	from sklearn.datasets import make_classification

	MAX_SEED = 4294967295


	class Dataset(namedtuple("Dataset", "data labels")):
	"""Helper class for managing a supervised learning dataset.

	Args:
	data: an array of type float32 with N samples, each of which is the set
	of features for that sample. (Shape (N, D_i), where N is the number of
	samples and D_i is the number of features for that sample.)
	labels: an array of type int32 or int64 with N elements, indicating the
	class label for the corresponding set of features in data.
	"""
	# Since this is an immutable object, we don't need to reserve slots.
	__slots__ = ()

	@property
	def size(self):
	"""Dataset size (number of samples)."""
	return len(self.data)

	def batch_indices(self, num_batches, batch_size):
	"""Creates indices of shuffled minibatches.

	Args:
	num_batches: the number of batches to generate
	batch_size: the size of each batch

	Returns:
	batch_indices: a list of minibatch indices, arranged so that the dataset
	is randomly shuffled.

	Raises:
	ValueError: if the data and labels have different lengths
	"""
	if len(self.data) != len(self.labels):
	raise ValueError("Labels and data must have the same number of samples.")

	batch_indices = []

	# Follows logic in mnist.py to ensure we cover the entire dataset.
	index_in_epoch = 0
	dataset_size = len(self.data)
	dataset_indices = np.arange(dataset_size)
	np.random.shuffle(dataset_indices)

	for _ in range(num_batches):
	start = index_in_epoch
	index_in_epoch += batch_size
	if index_in_epoch > dataset_size:

	# Finished epoch, reshuffle.
	np.random.shuffle(dataset_indices)

	# Start next epoch.
	start = 0
	index_in_epoch = batch_size

	end = index_in_epoch
	batch_indices.append(dataset_indices[start:end].tolist())

	return batch_indices


	def noisy_parity_class(n_samples,
	n_classes=2,
	n_context_ids=5,
	noise_prob=0.25,
	random_seed=None):
	"""Returns a randomly generated sparse-to-sparse dataset.

	The label is a parity class of a set of context classes.

	Args:
	n_samples: number of samples (data points)
	n_classes: number of class labels (default: 2)
	n_context_ids: how many classes to take the parity of (default: 5).
	noise_prob: how often to corrupt the label (default: 0.25)
	random_seed: seed used for drawing the random data (default: None)
	Returns:
	dataset: A Dataset namedtuple containing the generated data and labels
	"""
	np.random.seed(random_seed)
	x = np.random.randint(0, n_classes, [n_samples, n_context_ids])
	noise = np.random.binomial(1, noise_prob, [n_samples])
	y = (np.sum(x, 1) + noise) % n_classes
	return Dataset(x.astype("float32"), y.astype("int32"))


	def random(n_features, n_samples, n_classes=2, sep=1.0, random_seed=None):
	"""Returns a randomly generated classification dataset.

	Args:
	n_features: number of features (dependent variables)
	n_samples: number of samples (data points)
	n_classes: number of class labels (default: 2)
	sep: separation of the two classes, a higher value corresponds to
	an easier classification problem (default: 1.0)
	random_seed: seed used for drawing the random data (default: None)

	Returns:
	dataset: A Dataset namedtuple containing the generated data and labels
	"""
	# Generate the problem data.
	x, y = make_classification(n_samples=n_samples,
	n_features=n_features,
	n_informative=n_features,
	n_redundant=0,
	n_classes=n_classes,
	class_sep=sep,
	random_state=random_seed)

	return Dataset(x.astype("float32"), y.astype("int32"))


	def random_binary(n_features, n_samples, random_seed=None):
	"""Returns a randomly generated dataset of binary values.

	Args:
	n_features: number of features (dependent variables)
	n_samples: number of samples (data points)
	random_seed: seed used for drawing the random data (default: None)

	Returns:
	dataset: A Dataset namedtuple containing the generated data and labels
	"""
	random_seed = (np.random.randint(MAX_SEED) if random_seed is None
	else random_seed)
	np.random.seed(random_seed)

	x = np.random.randint(2, size=(n_samples, n_features))
	y = np.zeros((n_samples, 1))

	return Dataset(x.astype("float32"), y.astype("int32"))


	def random_symmetric(n_features, n_samples, random_seed=None):
	"""Returns a randomly generated dataset of values and their negatives.

	Args:
	n_features: number of features (dependent variables)
	n_samples: number of samples (data points)
	random_seed: seed used for drawing the random data (default: None)

	Returns:
	dataset: A Dataset namedtuple containing the generated data and labels
	"""
	random_seed = (np.random.randint(MAX_SEED) if random_seed is None
	else random_seed)
	np.random.seed(random_seed)

	x1 = np.random.normal(size=(int(n_samples/2), n_features))
	x = np.concatenate((x1, -x1), axis=0)
	y = np.zeros((n_samples, 1))

	return Dataset(x.astype("float32"), y.astype("int32"))


	def random_mlp(n_features, n_samples, random_seed=None, n_layers=6, width=20):
	"""Returns a generated output of an MLP with random weights.

	Args:
	n_features: number of features (dependent variables)
	n_samples: number of samples (data points)
	random_seed: seed used for drawing the random data (default: None)
	n_layers: number of layers in random MLP
	width: width of the layers in random MLP

	Returns:
	dataset: A Dataset namedtuple containing the generated data and labels
	"""
	random_seed = (np.random.randint(MAX_SEED) if random_seed is None
	else random_seed)
	np.random.seed(random_seed)

	x = np.random.normal(size=(n_samples, n_features))
	y = x
	n_in = n_features
	scale_factor = np.sqrt(2.) / np.sqrt(n_features)
	for _ in range(n_layers):
	weights = np.random.normal(size=(n_in, width)) * scale_factor
	y = np.dot(y, weights).clip(min=0)
	n_in = width

	y = y[:, 0]
	y[y > 0] = 1

	return Dataset(x.astype("float32"), y.astype("int32"))


	EMPTY_DATASET = Dataset(np.array([], dtype="float32"),
	np.array([], dtype="int32"))