# Copyright 2017 Google, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Scripts for meta-optimization.""" | |
from __future__ import print_function | |
import os | |
import tensorflow as tf | |
import metaopt | |
from learned_optimizer.optimizer import coordinatewise_rnn | |
from learned_optimizer.optimizer import global_learning_rate | |
from learned_optimizer.optimizer import hierarchical_rnn | |
from learned_optimizer.optimizer import learning_rate_schedule | |
from learned_optimizer.optimizer import trainable_adam | |
from learned_optimizer.problems import problem_sets as ps | |
from learned_optimizer.problems import problem_spec | |
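
# Example invocation (hypothetical; assumes this script is saved as metarun.py
# and that the learned_optimizer package is on PYTHONPATH):
#
#   python metarun.py --train_dir=/tmp/lol/ --optimizer=HierarchicalRNN \
#       --num_problems=1000 --num_meta_iterations=5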
tf.app.flags.DEFINE_string("train_dir", "/tmp/lol/", | |
"""Directory to store parameters and results.""") | |
tf.app.flags.DEFINE_integer("task", 0, | |
"""Task id of the replica running the training.""") | |
tf.app.flags.DEFINE_integer("worker_tasks", 1, | |
"""Number of tasks in the worker job.""") | |
tf.app.flags.DEFINE_integer("num_problems", 1000, | |
"""Number of sub-problems to run.""") | |
tf.app.flags.DEFINE_integer("num_meta_iterations", 5, | |
"""Number of meta-iterations to optimize.""") | |
tf.app.flags.DEFINE_integer("num_unroll_scale", 40, | |
"""The scale parameter of the exponential | |
distribution from which the number of partial | |
unrolls is drawn""") | |
tf.app.flags.DEFINE_integer("min_num_unrolls", 1, | |
"""The minimum number of unrolls per problem.""") | |
tf.app.flags.DEFINE_integer("num_partial_unroll_itr_scale", 200, | |
"""The scale parameter of the exponential | |
distribution from which the number of iterations | |
per unroll is drawn.""") | |
tf.app.flags.DEFINE_integer("min_num_itr_partial_unroll", 50, | |
"""The minimum number of iterations for one | |
unroll.""") | |
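# Note: assuming metaopt.sample_numiter draws from an exponential distribution
# with the scale above and enforces the corresponding minimum, a typical
# problem under the default flags runs on the order of 40 unrolls of roughly
# 200 iterations each.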
tf.app.flags.DEFINE_string("optimizer", "HierarchicalRNN", | |
"""Which meta-optimizer to train.""") | |
# CoordinatewiseRNN-specific flags | |
tf.app.flags.DEFINE_integer("cell_size", 20, | |
"""Size of the RNN hidden state in each layer.""") | |
tf.app.flags.DEFINE_integer("num_cells", 2, | |
"""Number of RNN layers.""") | |
tf.app.flags.DEFINE_string("cell_cls", "GRUCell", | |
"""Type of RNN cell to use.""") | |
# Metaoptimization parameters
tf.app.flags.DEFINE_float("meta_learning_rate", 1e-6,
                          """The learning rate for the meta-optimizer.""")
tf.app.flags.DEFINE_float("gradient_clip_level", 1e4,
                          """The level to clip gradients to.""")
# Training set selection
tf.app.flags.DEFINE_boolean("include_quadratic_problems", False,
                            """Include non-noisy quadratic problems.""")
tf.app.flags.DEFINE_boolean("include_noisy_quadratic_problems", True,
                            """Include noisy quadratic problems.""")
tf.app.flags.DEFINE_boolean("include_large_quadratic_problems", True,
                            """Include very large quadratic problems.""")
tf.app.flags.DEFINE_boolean("include_bowl_problems", True,
                            """Include 2D bowl problems.""")
tf.app.flags.DEFINE_boolean("include_softmax_2_class_problems", True,
                            """Include 2-class logistic regression problems.""")
tf.app.flags.DEFINE_boolean("include_noisy_softmax_2_class_problems", True,
                            """Include noisy 2-class logistic regression
                               problems.""")
tf.app.flags.DEFINE_boolean("include_optimization_test_problems", True,
                            """Include non-noisy versions of classic
                               optimization test problems, e.g. Rosenbrock.""")
tf.app.flags.DEFINE_boolean("include_noisy_optimization_test_problems", True,
                            """Include gradient-noise versions of classic
                               optimization test problems, e.g. Rosenbrock.""")
tf.app.flags.DEFINE_boolean("include_fully_connected_random_2_class_problems",
                            True, """Include MLP problems for 2 classes.""")
tf.app.flags.DEFINE_boolean("include_matmul_problems", True,
                            """Include matrix multiplication problems.""")
tf.app.flags.DEFINE_boolean("include_log_objective_problems", True,
                            """Include problems where the objective is the log
                               of another problem's objective, e.g. Bowl.""")
tf.app.flags.DEFINE_boolean("include_rescale_problems", True,
                            """Include problems where the parameters are a
                               scaled version of the original parameters.""")
tf.app.flags.DEFINE_boolean("include_norm_problems", True,
                            """Include problems where the objective is the
                               N-norm of another problem, e.g. Quadratic.""")
tf.app.flags.DEFINE_boolean("include_sum_problems", True,
                            """Include problems where the objective is the sum
                               of the objectives of the subproblems that make
                               up the problem parameters. Per-problem tensors
                               are still independent of each other.""")
tf.app.flags.DEFINE_boolean("include_sparse_gradient_problems", True,
                            """Include problems where the gradient is set to 0
                               with some high probability.""")
tf.app.flags.DEFINE_boolean("include_sparse_softmax_problems", False,
                            """Include sparse softmax problems.""")
tf.app.flags.DEFINE_boolean("include_one_hot_sparse_softmax_problems", False,
                            """Include one-hot sparse softmax problems.""")
tf.app.flags.DEFINE_boolean("include_noisy_bowl_problems", True,
                            """Include noisy bowl problems.""")
tf.app.flags.DEFINE_boolean("include_noisy_norm_problems", True,
                            """Include noisy norm problems.""")
tf.app.flags.DEFINE_boolean("include_noisy_sum_problems", True,
                            """Include noisy sum problems.""")
tf.app.flags.DEFINE_boolean("include_sum_of_quadratics_problems", False,
                            """Include sum-of-quadratics problems.""")
tf.app.flags.DEFINE_boolean("include_projection_quadratic_problems", False,
                            """Include projection quadratic problems.""")
tf.app.flags.DEFINE_boolean("include_outward_snake_problems", False,
                            """Include outward snake problems.""")
tf.app.flags.DEFINE_boolean("include_dependency_chain_problems", False,
                            """Include dependency chain problems.""")
tf.app.flags.DEFINE_boolean("include_min_max_well_problems", False,
                            """Include min-max well problems.""")
# Optimizer parameters: initialization and scale values
tf.app.flags.DEFINE_float("min_lr", 1e-6,
                          """The minimum initial learning rate.""")
tf.app.flags.DEFINE_float("max_lr", 1e-2,
                          """The maximum initial learning rate.""")
# Optimizer parameters: small features.
tf.app.flags.DEFINE_boolean("zero_init_lr_weights", True,
                            """Whether to initialize the learning rate weights
                               to 0 rather than the scaled random
                               initialization used for other RNN variables.""")
tf.app.flags.DEFINE_boolean("use_relative_lr", True,
                            """Whether to use the relative learning rate as an
                               input during training. Can only be used if
                               learnable_decay is also True.""")
tf.app.flags.DEFINE_boolean("use_extreme_indicator", False,
                            """Whether to use the extreme indicator for
                               learning rates as an input during training. Can
                               only be used if learnable_decay is also
                               True.""")
tf.app.flags.DEFINE_boolean("use_log_means_squared", True,
                            """Whether to track the log of the mean squared
                               grads instead of the mean squared grads.""")
tf.app.flags.DEFINE_boolean("use_problem_lr_mean", True,
                            """Whether to use the mean over all learning rates
                               in the problem when calculating the relative
                               learning rate.""")
# Optimizer parameters: major features
tf.app.flags.DEFINE_boolean("learnable_decay", True,
                            """Whether to learn weights that dynamically
                               modulate the input scale via RMS decay.""")
tf.app.flags.DEFINE_boolean("dynamic_output_scale", True,
                            """Whether to learn weights that dynamically
                               modulate the output scale.""")
tf.app.flags.DEFINE_boolean("use_log_objective", True,
                            """Whether to use the log of the scaled objective
                               rather than just the scaled objective for
                               training.""")
tf.app.flags.DEFINE_boolean("use_attention", False,
                            """Whether to learn where to attend.""")
tf.app.flags.DEFINE_boolean("use_second_derivatives", True,
                            """Whether to use second derivatives.""")
tf.app.flags.DEFINE_integer("num_gradient_scales", 4,
                            """How many different timescales to keep for
                               gradient history. If > 1, also learns a scale
                               factor for gradient history.""")
tf.app.flags.DEFINE_float("max_log_lr", 33,
                          """The maximum log learning rate allowed.""")
tf.app.flags.DEFINE_float("objective_training_max_multiplier", -1,
                          """How much the objective can grow before training
                             on this problem / param pair is terminated. Sets
                             a max on the objective value when multiplied by
                             the initial objective. If <= 0, not used.""")
tf.app.flags.DEFINE_boolean("use_gradient_shortcut", True,
                            """Whether to add a learned affine projection of
                               the gradient to the update delta in addition to
                               the gradient function computed by the RNN.""")
tf.app.flags.DEFINE_boolean("use_lr_shortcut", False,
                            """Whether to add the difference between the
                               current learning rate and the desired learning
                               rate to the RNN input.""")
tf.app.flags.DEFINE_boolean("use_grad_products", True,
                            """Whether to use gradient products in the input
                               to the RNN. Only applicable when
                               num_gradient_scales > 1.""")
tf.app.flags.DEFINE_boolean("use_multiple_scale_decays", False,
                            """Whether to use many-timescale scale decays.""")
tf.app.flags.DEFINE_boolean("use_numerator_epsilon", False,
                            """Whether to use epsilon in the numerator of the
                               log objective.""")
tf.app.flags.DEFINE_boolean("learnable_inp_decay", True,
                            """Whether to learn input decay weight and
                               bias.""")
tf.app.flags.DEFINE_boolean("learnable_rnn_init", True,
                            """Whether to learn RNN state initialization.""")
FLAGS = tf.app.flags.FLAGS

# The size of the RNN hidden state in each layer:
# [PerParam, PerTensor, Global]. The length of this list must be 1, 2, or 3.
# If less than 3, the Global and/or PerTensor RNNs will not be created.
HRNN_CELL_SIZES = [10, 20, 20]
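# For example, HRNN_CELL_SIZES = [10] would build only the per-parameter RNN,
# and [10, 20] would add the per-tensor RNN but no global RNN.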


def register_optimizers():
  """Returns a mapping from flag-friendly names to meta-optimizer classes."""
  opts = {}
  opts["CoordinatewiseRNN"] = coordinatewise_rnn.CoordinatewiseRNN
  opts["GlobalLearningRate"] = global_learning_rate.GlobalLearningRate
  opts["HierarchicalRNN"] = hierarchical_rnn.HierarchicalRNN
  opts["LearningRateSchedule"] = learning_rate_schedule.LearningRateSchedule
  opts["TrainableAdam"] = trainable_adam.TrainableAdam
  return opts
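
# To experiment with a new meta-optimizer, register its class above and select
# it at runtime via --optimizer=<Name>.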


def main(unused_argv):
  """Runs the main script."""
  opts = register_optimizers()

  # Choose a set of problems to optimize. By default this includes quadratics,
  # 2-dimensional bowls, 2-class softmax problems, and non-noisy optimization
  # test problems (e.g. Rosenbrock, Beale).
  problems_and_data = []
  if FLAGS.include_sparse_softmax_problems:
    problems_and_data.extend(ps.sparse_softmax_2_class_sparse_problems())
  if FLAGS.include_one_hot_sparse_softmax_problems:
    problems_and_data.extend(
        ps.one_hot_sparse_softmax_2_class_sparse_problems())
  if FLAGS.include_quadratic_problems:
    problems_and_data.extend(ps.quadratic_problems())
  if FLAGS.include_noisy_quadratic_problems:
    problems_and_data.extend(ps.quadratic_problems_noisy())
  if FLAGS.include_large_quadratic_problems:
    problems_and_data.extend(ps.quadratic_problems_large())
  if FLAGS.include_bowl_problems:
    problems_and_data.extend(ps.bowl_problems())
  if FLAGS.include_noisy_bowl_problems:
    problems_and_data.extend(ps.bowl_problems_noisy())
  if FLAGS.include_softmax_2_class_problems:
    problems_and_data.extend(ps.softmax_2_class_problems())
  if FLAGS.include_noisy_softmax_2_class_problems:
    problems_and_data.extend(ps.softmax_2_class_problems_noisy())
  if FLAGS.include_optimization_test_problems:
    problems_and_data.extend(ps.optimization_test_problems())
  if FLAGS.include_noisy_optimization_test_problems:
    problems_and_data.extend(ps.optimization_test_problems_noisy())
  if FLAGS.include_fully_connected_random_2_class_problems:
    problems_and_data.extend(ps.fully_connected_random_2_class_problems())
  if FLAGS.include_matmul_problems:
    problems_and_data.extend(ps.matmul_problems())
  if FLAGS.include_log_objective_problems:
    problems_and_data.extend(ps.log_objective_problems())
  if FLAGS.include_rescale_problems:
    problems_and_data.extend(ps.rescale_problems())
  if FLAGS.include_norm_problems:
    problems_and_data.extend(ps.norm_problems())
  if FLAGS.include_noisy_norm_problems:
    problems_and_data.extend(ps.norm_problems_noisy())
  if FLAGS.include_sum_problems:
    problems_and_data.extend(ps.sum_problems())
  if FLAGS.include_noisy_sum_problems:
    problems_and_data.extend(ps.sum_problems_noisy())
  if FLAGS.include_sparse_gradient_problems:
    problems_and_data.extend(ps.sparse_gradient_problems())
    # MLP sparse-gradient problems are only added when both flags are set.
    if FLAGS.include_fully_connected_random_2_class_problems:
      problems_and_data.extend(ps.sparse_gradient_problems_mlp())
  if FLAGS.include_min_max_well_problems:
    problems_and_data.extend(ps.min_max_well_problems())
  if FLAGS.include_sum_of_quadratics_problems:
    problems_and_data.extend(ps.sum_of_quadratics_problems())
  if FLAGS.include_projection_quadratic_problems:
    problems_and_data.extend(ps.projection_quadratic_problems())
  if FLAGS.include_outward_snake_problems:
    problems_and_data.extend(ps.outward_snake_problems())
  if FLAGS.include_dependency_chain_problems:
    problems_and_data.extend(ps.dependency_chain_problems())

  # Log directory.
  logdir = os.path.join(FLAGS.train_dir,
                        "{}_{}_{}_{}".format(FLAGS.optimizer,
                                             FLAGS.cell_cls,
                                             FLAGS.cell_size,
                                             FLAGS.num_cells))

  # Get the optimizer class and arguments.
  optimizer_cls = opts[FLAGS.optimizer]

  assert len(HRNN_CELL_SIZES) in [1, 2, 3]
  optimizer_args = (HRNN_CELL_SIZES,)
  optimizer_kwargs = {
      "init_lr_range": (FLAGS.min_lr, FLAGS.max_lr),
      "learnable_decay": FLAGS.learnable_decay,
      "dynamic_output_scale": FLAGS.dynamic_output_scale,
      "cell_cls": getattr(tf.contrib.rnn, FLAGS.cell_cls),
      "use_attention": FLAGS.use_attention,
      "use_log_objective": FLAGS.use_log_objective,
      "num_gradient_scales": FLAGS.num_gradient_scales,
      "zero_init_lr_weights": FLAGS.zero_init_lr_weights,
      "use_log_means_squared": FLAGS.use_log_means_squared,
      "use_relative_lr": FLAGS.use_relative_lr,
      "use_extreme_indicator": FLAGS.use_extreme_indicator,
      "max_log_lr": FLAGS.max_log_lr,
      "obj_train_max_multiplier": FLAGS.objective_training_max_multiplier,
      "use_problem_lr_mean": FLAGS.use_problem_lr_mean,
      "use_gradient_shortcut": FLAGS.use_gradient_shortcut,
      "use_second_derivatives": FLAGS.use_second_derivatives,
      "use_lr_shortcut": FLAGS.use_lr_shortcut,
      "use_grad_products": FLAGS.use_grad_products,
      "use_multiple_scale_decays": FLAGS.use_multiple_scale_decays,
      "use_numerator_epsilon": FLAGS.use_numerator_epsilon,
      "learnable_inp_decay": FLAGS.learnable_inp_decay,
      "learnable_rnn_init": FLAGS.learnable_rnn_init,
  }
  optimizer_spec = problem_spec.Spec(
      optimizer_cls, optimizer_args, optimizer_kwargs)
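  # problem_spec.Spec bundles the optimizer class with its args and kwargs so
  # that the training loop can build a fresh optimizer instance per problem
  # (presumably via optimizer_cls(*optimizer_args, **optimizer_kwargs)).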

  # Make the log directory.
  tf.gfile.MakeDirs(logdir)

  is_chief = FLAGS.task == 0
  # If this is a distributed run, make the chief run through problems in order.
  select_random_problems = FLAGS.worker_tasks == 1 or not is_chief

  def num_unrolls():
    return metaopt.sample_numiter(FLAGS.num_unroll_scale,
                                  FLAGS.min_num_unrolls)

  def num_partial_unroll_itrs():
    return metaopt.sample_numiter(FLAGS.num_partial_unroll_itr_scale,
                                  FLAGS.min_num_itr_partial_unroll)
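  # Randomizing both the number of unrolls and the number of iterations per
  # unroll, rather than meta-training on a single fixed horizon, is a common
  # way to keep the learned optimizer from overfitting to one training length.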

  # Run meta-training.
  metaopt.train_optimizer(
      logdir,
      optimizer_spec,
      problems_and_data,
      FLAGS.num_problems,
      FLAGS.num_meta_iterations,
      num_unrolls,
      num_partial_unroll_itrs,
      learning_rate=FLAGS.meta_learning_rate,
      gradient_clip=FLAGS.gradient_clip_level,
      is_chief=is_chief,
      select_random_problems=select_random_problems,
      obj_train_max_multiplier=FLAGS.objective_training_max_multiplier,
      callbacks=[])

  return 0


if __name__ == "__main__":
  tf.app.run()