# Copyright 2017 Google, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A base class definition for trainable optimizers."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import itertools

import tensorflow as tf

from tensorflow.python.framework import tensor_shape
OPTIMIZER_SCOPE = "LOL" | |
_LOCAL_VARIABLE_PREFIX = "local_state_" | |
_LOCAL_STATE_VARIABLE_COLLECTION = "local_state_collection" | |
EPSILON = 1e-6 | |
class TrainableOptimizer(tf.train.Optimizer):
  """Base class for trainable optimizers.

  A trainable optimizer is an optimizer that has parameters that can themselves
  be learned (meta-optimized).

  Subclasses must implement:
      _compute_update(self, param, grad, state)
  """

  def __init__(self, name, state_keys, use_attention=False,
               use_log_objective=False, obj_train_max_multiplier=-1,
               use_second_derivatives=True, use_numerator_epsilon=False,
               **kwargs):
    """Initializes the optimizer with the given name and settings.

    Args:
      name: The name string for this optimizer.
      state_keys: The names of any required state variables (list).
      use_attention: Whether this optimizer uses attention (Default: False)
      use_log_objective: Whether this optimizer uses the logarithm of the
        objective when computing the loss (Default: False)
      obj_train_max_multiplier: The maximum multiplier for the increase in the
        objective before meta-training is stopped. If <= 0, meta-training is
        not stopped early. (Default: -1)
      use_second_derivatives: Whether this optimizer uses second derivatives in
        meta-training. This should be set to False if some second derivatives
        in the meta-training problem set are not defined in Tensorflow.
        (Default: True)
      use_numerator_epsilon: Whether to use epsilon in the numerator when
        scaling the problem objective during meta-training. (Default: False)
      **kwargs: Any additional keyword arguments.
    """
    self.use_second_derivatives = use_second_derivatives
    # Keys are sorted so the ordering matches the flattened state lists built
    # by flatten_and_sort() inside the training while loop.
    self.state_keys = sorted(state_keys)
    self.use_attention = use_attention
    self.use_log_objective = use_log_objective
    self.obj_train_max_multiplier = obj_train_max_multiplier
    self.use_numerator_epsilon = use_numerator_epsilon

    use_locking = False
    super(TrainableOptimizer, self).__init__(use_locking, name)

  def _create_slots(self, var_list):
    """Creates all slots needed by the variables.

    Args:
      var_list: A list of `Variable` objects.
    """
    for var in var_list:
      init_states = self._initialize_state(var)
      for slot_name in sorted(init_states):
        slot_var_name = "{}_{}".format(self.get_name(), slot_name)
        value = init_states[slot_name]
        self._get_or_make_slot(var, value, slot_name, slot_var_name)

  def _initialize_state(self, var):
    """Initializes any state required for this variable.

    Args:
      var: a tensor containing parameters to be optimized

    Returns:
      state: a dictionary mapping state keys to initial state values (tensors)
    """
    return {}

  def _initialize_global_state(self):
    """Initializes any global (problem-level) state values."""
    return []

  def _apply_common(self, grad, var):
    """Applies the optimizer updates to the variables.

    Note: this should only get called via _apply_dense or _apply_sparse when
    using the optimizer via optimizer.minimize or optimizer.apply_gradients.
    During meta-training, the optimizer.train function should be used to
    construct an optimization path that is differentiable.

    Args:
      grad: A tensor representing the gradient.
      var: A tf.Variable with the same shape as grad.

    Returns:
      update_op: A tensorflow op that assigns new values to the variable, and
          also defines dependencies that update the state variables for the
          optimizer.
    """
    state = {key: self.get_slot(var, key) for key in self.get_slot_names()}
    new_var, new_state = self._compute_update(var, grad, state)
    state_assign_ops = [tf.assign(state_var, new_state[key])
                        for key, state_var in state.items()]
    # Ensure the state is updated before (and whenever) the variable is.
    with tf.control_dependencies(state_assign_ops):
      update_op = var.assign(new_var)

    return update_op

  def _apply_dense(self, grad, var):
    """Adds ops to apply dense gradients to 'var'."""
    return self._apply_common(grad, var)

  def _apply_sparse(self, grad, var):
    """Adds ops to apply sparse gradients to 'var'."""
    return self._apply_common(grad, var)

  def _compute_update(self, param, grad, state):
    """Computes the update step for optimization.

    Args:
      param: A tensor of parameters to optimize.
      grad: The gradient tensor of the objective with respect to the parameters.
          (It has the same shape as param.)
      state: A dictionary containing any extra state required by the optimizer.

    Returns:
      updated_params: The updated parameters.
      updated_state: The dictionary of updated state variable(s).
    """
    raise NotImplementedError

  def _compute_updates(self, params, grads, states, global_state):
    """Maps the compute update functions for each parameter.

    This function can be overriden by a subclass if the subclass wants to
    combine information across the different parameters in the list.

    Args:
      params: A list of parameter tensors.
      grads: A list of gradients corresponding to each parameter.
      states: A list of state variables corresponding to each parameter.
      global_state: A list of global state variables for the problem.

    Returns:
      new_params: The updated parameters.
      new_states: The updated states.
      new_global_state: The updated global state.
      attention_params: A list of attention parameters. This is the same as
          new_params if the optimizer does not use attention.
    """
    # Zip up the arguments to _compute_update.
    args = zip(params, grads, states)

    # Call compute_update on each set of parameter/gradient/state args.
    new_params, new_states = zip(*list(
        itertools.starmap(self._compute_update, args)))

    # Global state is unused in the basic case, just pass it through.
    return list(new_params), list(new_states), global_state, list(new_params)

  def train(self, problem, dataset):
    """Creates graph operations to train the optimizer.

    Args:
      problem: A problem_generator.Problem instance to train on.
      dataset: A datasets.Dataset tuple to use when training.

    Returns:
      meta_objective: A tensorflow operation for computing the meta-objective
      obj_weights: A tensor placeholder for feeding in the objective weights
      obj_values: The subproblem objective values during optimization
      batches: The batch indexes tensor for overriding with feed_dict
      first_unroll: A placeholder signifying if this is a first unroll
        (this will propagate the gradients slightly differently).
      reset_state: A placeholder signifying that the rnn state should be reset.
      output_state: The final state of the optimizer
      init_loop_vars_to_override: Local variables that can be assigned to
        propagate the optimizer and problem state for unrolling
      final_loop_vals: Final values of the loop variables that can be
        assigned to init_loop_vars_to_override.
    """
    # Placeholder for the objective weights
    obj_weights = tf.placeholder(tf.float32)
    num_iter = tf.shape(obj_weights)[0]

    # Unpack the dataset and generate the minibatches for training
    data, labels = dataset
    # Convert the ndarrays to tensors so we can pass them back in via feed_dict
    data = tf.constant(data)
    labels = tf.constant(labels)
    batches = tf.placeholder(tf.int32)
    first_unroll = tf.placeholder_with_default(False, [])
    reset_state = tf.placeholder_with_default(False, [])

    training_output = collections.namedtuple("TrainingOutput",
                                             ["metaobj",
                                              "obj_weights",
                                              "problem_objectives",
                                              "initial_obj",
                                              "batches",
                                              "first_unroll",
                                              "reset_state",
                                              "output_state",
                                              "init_loop_vars",
                                              "output_loop_vars"])

    def loop_body(itr, obj_accum, params, attend_params, flattened_states,
                  global_state, all_obj, unused_init_obj, data,
                  labels, batches):
      """Body of the meta-training while loop for optimizing a sub-problem.

      Args:
        itr: The current meta-training iteration.
        obj_accum: The accumulated objective over all training steps so far.
        params: The parameters of the sub-problem.
        attend_params: The parameters of the sub-problems at the attended
            location.
        flattened_states: The states of the trainable optimizer, sorted and
            flattened into a list (since a while loop can't handle nested lists
            or dictionaries).
        global_state: The global state of the optimizer.
        all_obj: The list of all objective values in the training process.
        unused_init_obj: The initial objective (unused here, but needed in the
            variable list because it's used in a stopping condition in the
            loop_cond.)
        data: The data for this problem.
        labels: The labels corresponding to the data.
        batches: The batch indexes needed for shuffled minibatch creation.

      Returns:
        itr: The updated meta-training iteration.
        obj_accum: The updated accumulated objective.
        params: The new parameters of the sub-problem.
        attend_params: The new parameters of the sub-problems at the attended
            location.
        flattened_states: The new states of the trainable optimizer.
        global_state: The updated global state.
        all_obj: The updates list of all objective values.
        unused_init_obj: The initial objective.
        data: The data for this problem.
        labels: The labels corresponding to the data.
        batches: The batch indexes needed for shuffled minibatch creation.
      """
      batch_indices = tf.gather(batches, itr)
      batch_data = tf.gather(data, batch_indices)
      batch_labels = tf.gather(labels, batch_indices)

      # Compute the objective over the entire dataset (full batch).
      obj = problem.objective(params, data, labels)

      # Compute the gradients on just the current batch
      if self.use_attention:
        current_obj = problem.objective(attend_params, batch_data, batch_labels)
        grads = problem.gradients(current_obj, attend_params)
      else:
        current_obj = problem.objective(params, batch_data, batch_labels)
        grads = problem.gradients(current_obj, params)

      if not self.use_second_derivatives:
        # Cut the gradient-of-gradient path so meta-training does not
        # differentiate through the sub-problem's gradients.
        new_grads = []
        for grad in grads:
          if isinstance(grad, tf.IndexedSlices):
            new_grads.append(
                tf.IndexedSlices(tf.stop_gradient(grad.values), grad.indices))
          else:
            new_grads.append(tf.stop_gradient(grad))
        grads = new_grads

      # store the objective value for the entire problem at each iteration
      all_obj = tf.concat([all_obj, tf.reshape(obj, (1,))], 0)

      # accumulate the weighted objective for the entire dataset
      acc = tf.gather(obj_weights, itr) * obj
      obj_accum = tf.add(obj_accum, acc)
      # Set the shape to keep the shape invariant for obj_accum. Without this,
      # the graph builder thinks the tensor shape is unknown on the 2nd iter.
      obj_accum.set_shape([])

      # convert flattened_states to dictionaries
      dict_states = [dict(zip(self.state_keys, flat_state))
                     for flat_state in flattened_states]

      # compute the new parameters and states
      args = (params, grads, dict_states, global_state)
      updates = self._compute_updates(*args)
      new_params, new_states, new_global_state, new_attend_params = updates

      # Flatten the states. BUGFIX: wrap map() in list() — under Python 3,
      # map() returns a lazy iterator, which breaks the while-loop variable
      # structure that expects a concrete list of lists.
      new_flattened_states = list(map(flatten_and_sort, new_states))

      return [itr + 1, obj_accum, new_params, new_attend_params,
              new_flattened_states, new_global_state, all_obj, unused_init_obj,
              data, labels, batches]

    def loop_cond(itr, obj_accum, unused_params, unused_attend_params,
                  unused_flattened_states, unused_global_state, all_obj,
                  init_obj, *args):
      """Termination conditions of the sub-problem optimization loop."""
      del args  # unused

      cond1 = tf.less(itr, num_iter)  # We've run < num_iter times
      cond2 = tf.is_finite(obj_accum)  # The objective is still finite

      if self.obj_train_max_multiplier > 0:
        current_obj = tf.gather(all_obj, itr)
        # Account for negative init_obj too
        max_diff = (self.obj_train_max_multiplier - 1) * tf.abs(init_obj)
        max_obj = init_obj + max_diff
        # The objective is a reasonable multiplier of the original objective
        cond3 = tf.less(current_obj, max_obj)

        return tf.logical_and(tf.logical_and(cond1, cond2), cond3,
                              name="training_loop_cond")
      else:
        return tf.logical_and(cond1, cond2, name="training_loop_cond")

    init = self._initialize_training_loop_parameters(
        problem, data, labels, batches, first_unroll, reset_state)
    loop_vars, invariants, initial_obj, init_loop_vars_to_override = init

    loop_output = tf.while_loop(loop_cond, loop_body, loop_vars,
                                swap_memory=True, shape_invariants=invariants)
    meta_obj, problem_objectives = loop_output[1], loop_output[6]

    # The meta objective is normalized by the initial objective at the start of
    # the series of partial unrolls.
    scaled_meta_objective = self.scale_objective(
        meta_obj, problem_objectives, initial_obj)

    final_loop_vals = (
        [initial_obj] + loop_output[2] + loop_output[3] + loop_output[5])
    final_loop_vals.extend(itertools.chain(*loop_output[4]))

    return training_output(scaled_meta_objective,
                           obj_weights,
                           problem_objectives,
                           initial_obj,
                           batches,
                           first_unroll,
                           reset_state,
                           loop_output[4],
                           init_loop_vars_to_override,
                           final_loop_vals)

  def _initialize_training_loop_parameters(
      self, problem, data, labels, batches, first_unroll, reset_state):
    """Initializes the vars and params needed for the training process.

    Args:
      problem: The problem being optimized.
      data: The data for the problem.
      labels: The corresponding labels for the data.
      batches: The indexes needed to create shuffled batches of the data.
      first_unroll: Whether this is the first unroll in a partial unrolling.
      reset_state: Whether RNN state variables should be reset.

    Returns:
      loop_vars: The while loop variables for training.
      invariants: The corresponding variable shapes (required by while loop).
      initial_obj: The initial objective (used later for scaling).
      init_loop_vars_to_override: The loop vars that can be overridden when
          performing training via partial unrolls.
    """
    # Extract these separately so we don't have to make inter-variable
    # dependencies.
    initial_tensors = problem.init_tensors()

    return_initial_tensor_values = first_unroll
    initial_params_vars, initial_params = local_state_variables(
        initial_tensors, return_initial_tensor_values)
    initial_attend_params_vars, initial_attend_params = local_state_variables(
        initial_tensors, return_initial_tensor_values)

    # Recalculate the initial objective for the list on each partial unroll with
    # the new initial_params. initial_obj holds the value from the very first
    # unroll.
    initial_obj_init = problem.objective(initial_params, data, labels)
    return_initial_obj_init = first_unroll
    [initial_obj_var], [initial_obj] = local_state_variables(
        [initial_obj_init], return_initial_obj_init)

    # Initialize the loop variables.
    initial_itr = tf.constant(0, dtype=tf.int32)
    initial_meta_obj = tf.constant(0, dtype=tf.float32)
    # N.B. the use of initial_obj_init here rather than initial_obj
    initial_problem_objectives = tf.reshape(initial_obj_init, (1,))

    # Initialize the extra state.
    initial_state_vars = []
    initial_state = []
    state_shapes = []
    return_initial_state_values = reset_state
    for param in initial_tensors:
      param_state_vars, param_state = local_state_variables(
          flatten_and_sort(self._initialize_state(param)),
          return_initial_state_values)

      initial_state_vars.append(param_state_vars)
      initial_state.append(param_state)
      state_shapes.append([f.get_shape() for f in param_state])

    # Initialize any global (problem-level) state.
    initial_global_state_vars, initial_global_state = local_state_variables(
        self._initialize_global_state(), return_initial_state_values)

    global_shapes = []
    for item in initial_global_state:
      global_shapes.append(item.get_shape())

    # build the list of loop variables:
    loop_vars = [
        initial_itr,
        initial_meta_obj,
        initial_params,         # Local variables.
        initial_attend_params,  # Local variables.
        initial_state,          # Local variables.
        initial_global_state,   # Local variables.
        initial_problem_objectives,
        initial_obj,            # Local variable.
        data,
        labels,
        batches,
    ]

    invariants = [
        initial_itr.get_shape(),
        initial_meta_obj.get_shape(),
        [t.get_shape() for t in initial_params],
        [t.get_shape() for t in initial_attend_params],
        state_shapes,
        global_shapes,
        tensor_shape.TensorShape([None]),  # The problem objectives list grows
        initial_obj.get_shape(),
        tensor_shape.unknown_shape(),  # Placeholder shapes are unknown
        tensor_shape.unknown_shape(),
        tensor_shape.unknown_shape(),
    ]

    # Initialize local variables that we will override with final tensors at the
    # next iter.
    init_loop_vars_to_override = (
        [initial_obj_var] + initial_params_vars + initial_attend_params_vars +
        initial_global_state_vars)
    init_loop_vars_to_override.extend(itertools.chain(*initial_state_vars))

    return loop_vars, invariants, initial_obj, init_loop_vars_to_override

  def scale_objective(self, total_obj, all_objs, initial_obj,
                      obj_scale_eps=1e-6):
    """Normalizes the objective based on the initial objective value.

    Args:
      total_obj: The total accumulated objective over the training run.
      all_objs: A list of all the individual objectives over the training run.
      initial_obj: The initial objective value.
      obj_scale_eps: The epsilon value to use in computations for stability.

    Returns:
      The scaled objective as a single value.
    """
    if self.use_log_objective:
      if self.use_numerator_epsilon:
        scaled_problem_obj = ((all_objs + obj_scale_eps) /
                              (initial_obj + obj_scale_eps))
        log_scaled_problem_obj = tf.log(scaled_problem_obj)
      else:
        scaled_problem_obj = all_objs / (initial_obj + obj_scale_eps)
        log_scaled_problem_obj = tf.log(scaled_problem_obj + obj_scale_eps)
      return tf.reduce_mean(log_scaled_problem_obj)
    else:
      return total_obj / (initial_obj + obj_scale_eps)
def local_state_variables(init_values, return_init_values):
  """Create local variables initialized from init_values.

  This will create local variables from a list of init_values. Each variable
  will be named based on the value's shape and dtype.

  As a convenience, a boolean tensor allows you to return value from
  the created local variable or from the original init value.

  Args:
    init_values: iterable of tensors
    return_init_values: boolean tensor

  Returns:
    local_vars: list of the created local variables.
    vals: if return_init_values is true, then this returns the values of
      init_values. Otherwise it returns the values of the local_vars.
  """
  if not init_values:
    return [], []

  # Keep a per-graph use count for each generated name so repeated calls get
  # unique variable names. The count dict is stashed in a graph collection.
  # This generates a harmless warning when saving the metagraph.
  variable_use_count = tf.get_collection_ref(_LOCAL_STATE_VARIABLE_COLLECTION)
  if not variable_use_count:
    variable_use_count.append(collections.defaultdict(int))
  variable_use_count = variable_use_count[0]

  local_vars = []
  with tf.variable_scope(OPTIMIZER_SCOPE):
    # We can't use the init_value as an initializer as init_value may
    # itself depend on some problem variables. This would produce
    # inter-variable initialization order dependence, which TensorFlow
    # does not make easy to manage, so initialize with zeros instead.
    for init_value in init_values:
      name = create_local_state_variable_name(init_value)
      unique_name = name + "_" + str(variable_use_count[name])
      variable_use_count[name] += 1
      # The overarching idea here is to be able to reuse variables between
      # different sessions on the same TensorFlow master without errors. By
      # uniquifying based on the type and name we mirror the checks made inside
      # TensorFlow, while still allowing some memory reuse. Ultimately this is a
      # hack due to the broken Session.reset().
      local_vars.append(
          tf.get_local_variable(
              unique_name,
              initializer=tf.zeros(
                  init_value.get_shape(), dtype=init_value.dtype)))

  # It makes things a lot simpler if we use the init_value the first
  # iteration, instead of the variable itself. It allows us to propagate
  # gradients through it as well as simplifying initialization. The variable
  # ends up assigned to after the first iteration.
  vals = tf.cond(return_init_values, lambda: init_values, lambda: local_vars)
  if len(init_values) == 1:
    # tf.cond extracts elements from singleton lists, so re-wrap the single
    # tensor to keep the return type a list.
    vals = [vals]
  return local_vars, vals
def create_local_state_variable_name(tensor):
  """Creates a variable name derived from the tensor's shape and dtype.

  Args:
    tensor: A tensor with a fully defined static shape.

  Returns:
    A name string of the form `local_state_<dim>_<dim>..._<dtype>`.

  Raises:
    ValueError: If the tensor's static shape is not fully defined.
  """
  shape = tensor.get_shape()
  if not shape.is_fully_defined():
    raise ValueError("Need a fully specified shape to create a local variable.")

  shape_part = "_".join(str(dim) for dim in shape.as_list())
  return _LOCAL_VARIABLE_PREFIX + shape_part + "_" + tensor.dtype.name
def is_local_state_variable(op):
  """Returns whether `op` is a local state variable created for training.

  An op qualifies when it is a (V1 or V2) Variable op whose name lives under
  the optimizer scope with the local-state prefix.
  """
  if op.node_def.op not in ("Variable", "VariableV2"):
    return False
  state_name_prefix = OPTIMIZER_SCOPE + "/" + _LOCAL_VARIABLE_PREFIX
  return op.name.startswith(state_name_prefix)
def flatten_and_sort(dictionary):
  """Flattens a dictionary into a list of values sorted by the keys."""
  ordered_keys = sorted(dictionary)
  return [dictionary[key] for key in ordered_keys]