# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# ==============================================================================
"""
LFADS - Latent Factor Analysis via Dynamical Systems.
LFADS is an unsupervised method to decompose time series data into
various factors, such as an initial condition, a generative
dynamical system, control inputs to that generator, and a low
dimensional description of the observed data, called the factors.
Additionally, the observations have a noise model (in this case
Poisson), so a denoised version of the observations is also created
(e.g. underlying rates of a Poisson distribution given the observed
event counts).
The main data structure being passed around is a dataset. This is a dictionary
of data dictionaries.
DATASET: The top level dictionary is simply name (string -> dictionary).
The nested dictionary is the DATA DICTIONARY, which has the following keys:
'train_data' and 'valid_data', whose values are the corresponding training
and validation data with shape
ExTxD, E - # examples, T - # time steps, D - # dimensions in data.
The data dictionary also has a few more keys:
'train_ext_input' and 'valid_ext_input', if there are known external inputs
to the system being modeled, these take on dimensions:
ExTxI, E - # examples, T - # time steps, I = # dimensions in input.
'alignment_matrix_cxf' - If you are using multiple days of data, it's possible
that one can align the channels (see manuscript). If so each dataset will
contain this matrix, which will be used for both the input adapter and the
output adapter for each dataset. These matrices, if provided, must be of
size [data_dim x factors] where data_dim is the number of neurons recorded
on that day, and factors is chosen and set through the '--factors' flag.
'alignment_bias_c' - See alignment_matrix_cxf. This bias is used as the
offset for the alignment transformation. It will *subtract* off the
bias from the data, so PCA-style inits can align factors across sessions.
If one runs LFADS on data where the true rates are known for some trials
(say simulated, testing data, as in the example shipped with the paper), then
one can add three more fields for plotting purposes. These are 'train_truth'
and 'valid_truth', and 'conversion_factor'. The truth fields have the same
dimensions as 'train_data' and 'valid_data', but represent the underlying
rates of the observations. Finally, if one needs to rescale the true
underlying firing rates for plotting, there is the 'conversion_factor' key.
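
A minimal sketch of assembling such a dataset dictionary by hand (the shapes,
names and values below are illustrative only, not required by LFADS):

  import numpy as np
  E, T, D = 100, 50, 30   # examples, time steps, data dimensions
  datasets = {
      'dataset1': {
          'train_data': np.random.poisson(1.0, size=(E, T, D)).astype(int),
          'valid_data': np.random.poisson(1.0, size=(E // 5, T, D)).astype(int),
          'train_ext_input': None,
          'valid_ext_input': None,
      },
  }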
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import os
import tensorflow as tf
from distributions import LearnableDiagonalGaussian, DiagonalGaussianFromInput
from distributions import diag_gaussian_log_likelihood
from distributions import KLCost_GaussianGaussian, Poisson
from distributions import LearnableAutoRegressive1Prior
from distributions import KLCost_GaussianGaussianProcessSampled
from utils import init_linear, linear, list_t_bxn_to_tensor_bxtxn, write_data
from utils import log_sum_exp, flatten
from plot_lfads import plot_lfads
class GRU(object):
"""Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078).
"""
def __init__(self, num_units, forget_bias=1.0, weight_scale=1.0,
clip_value=np.inf, collections=None):
"""Create a GRU object.
Args:
num_units: Number of units in the GRU
forget_bias (optional): Hack to help learning.
weight_scale (optional): weights are scaled by ws/sqrt(#inputs), with
ws being the weight scale.
clip_value (optional): if the recurrent values grow above this value,
clip them.
      collections (optional): List of additional collections the variables
        should belong to.
"""
self._num_units = num_units
self._forget_bias = forget_bias
self._weight_scale = weight_scale
self._clip_value = clip_value
self._collections = collections
@property
def state_size(self):
return self._num_units
@property
def output_size(self):
return self._num_units
@property
def state_multiplier(self):
return 1
def output_from_state(self, state):
"""Return the output portion of the state."""
return state
def __call__(self, inputs, state, scope=None):
"""Gated recurrent unit (GRU) function.
Args:
inputs: A 2D batch x input_dim tensor of inputs.
state: The previous state from the last time step.
scope (optional): TF variable scope for defined GRU variables.
Returns:
A tuple (state, state), where state is the newly computed state at time t.
It is returned twice to respect an interface that works for LSTMs.
"""
x = inputs
h = state
if inputs is not None:
xh = tf.concat(axis=1, values=[x, h])
else:
xh = h
with tf.variable_scope(scope or type(self).__name__): # "GRU"
with tf.variable_scope("Gates"): # Reset gate and update gate.
# We start with bias of 1.0 to not reset and not update.
r, u = tf.split(axis=1, num_or_size_splits=2, value=linear(xh,
2 * self._num_units,
alpha=self._weight_scale,
name="xh_2_ru",
collections=self._collections))
r, u = tf.sigmoid(r), tf.sigmoid(u + self._forget_bias)
with tf.variable_scope("Candidate"):
xrh = tf.concat(axis=1, values=[x, r * h])
c = tf.tanh(linear(xrh, self._num_units, name="xrh_2_c",
collections=self._collections))
new_h = u * h + (1 - u) * c
new_h = tf.clip_by_value(new_h, -self._clip_value, self._clip_value)
return new_h, new_h
class GenGRU(object):
"""Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078).
  This version is specialized for the generator, but isn't as fast, so
  we have two. Note this allows for L2 regularization on the recurrent
  weights, but also implicitly rescales the inputs to be large in magnitude,
  via the 1/sqrt(#inputs) scaling in the linear helper routine, if there are
  fewer inputs than recurrent state variables.
"""
def __init__(self, num_units, forget_bias=1.0,
input_weight_scale=1.0, rec_weight_scale=1.0, clip_value=np.inf,
input_collections=None, recurrent_collections=None):
"""Create a GRU object.
Args:
num_units: Number of units in the GRU
forget_bias (optional): Hack to help learning.
      input_weight_scale (optional): input weights are scaled by ws/sqrt(#inputs),
        with ws being the weight scale.
      rec_weight_scale (optional): recurrent weights are scaled by ws/sqrt(#inputs),
        with ws being the weight scale.
      clip_value (optional): if the recurrent values grow above this value,
        clip them.
      input_collections (optional): List of additional collections the
        input->rec weights should belong to.
      recurrent_collections (optional): List of additional collections the
        rec->rec weights should belong to.
"""
self._num_units = num_units
self._forget_bias = forget_bias
self._input_weight_scale = input_weight_scale
self._rec_weight_scale = rec_weight_scale
self._clip_value = clip_value
self._input_collections = input_collections
self._rec_collections = recurrent_collections
@property
def state_size(self):
return self._num_units
@property
def output_size(self):
return self._num_units
@property
def state_multiplier(self):
return 1
def output_from_state(self, state):
"""Return the output portion of the state."""
return state
def __call__(self, inputs, state, scope=None):
"""Gated recurrent unit (GRU) function.
Args:
inputs: A 2D batch x input_dim tensor of inputs.
state: The previous state from the last time step.
scope (optional): TF variable scope for defined GRU variables.
Returns:
A tuple (state, state), where state is the newly computed state at time t.
It is returned twice to respect an interface that works for LSTMs.
"""
x = inputs
h = state
with tf.variable_scope(scope or type(self).__name__): # "GRU"
with tf.variable_scope("Gates"): # Reset gate and update gate.
# We start with bias of 1.0 to not reset and not update.
r_x = u_x = 0.0
if x is not None:
r_x, u_x = tf.split(axis=1, num_or_size_splits=2, value=linear(x,
2 * self._num_units,
alpha=self._input_weight_scale,
do_bias=False,
name="x_2_ru",
normalized=False,
collections=self._input_collections))
r_h, u_h = tf.split(axis=1, num_or_size_splits=2, value=linear(h,
2 * self._num_units,
do_bias=True,
alpha=self._rec_weight_scale,
name="h_2_ru",
collections=self._rec_collections))
r = r_x + r_h
u = u_x + u_h
r, u = tf.sigmoid(r), tf.sigmoid(u + self._forget_bias)
with tf.variable_scope("Candidate"):
c_x = 0.0
if x is not None:
c_x = linear(x, self._num_units, name="x_2_c", do_bias=False,
alpha=self._input_weight_scale,
normalized=False,
collections=self._input_collections)
c_rh = linear(r*h, self._num_units, name="rh_2_c", do_bias=True,
alpha=self._rec_weight_scale,
collections=self._rec_collections)
c = tf.tanh(c_x + c_rh)
new_h = u * h + (1 - u) * c
new_h = tf.clip_by_value(new_h, -self._clip_value, self._clip_value)
return new_h, new_h
class LFADS(object):
"""LFADS - Latent Factor Analysis via Dynamical Systems.
LFADS is an unsupervised method to decompose time series data into
various factors, such as an initial condition, a generative
dynamical system, inferred inputs to that generator, and a low
dimensional description of the observed data, called the factors.
  Additionally, the observations have a noise model (in this case
Poisson), so a denoised version of the observations is also created
(e.g. underlying rates of a Poisson distribution given the observed
event counts).
"""
def __init__(self, hps, kind="train", datasets=None):
"""Create an LFADS model.
    train - a model for training; sampling from the approximate posteriors
      is used.
    posterior_sample_and_average - sample from the posterior; this is used
      for evaluating the expected value of the outputs of LFADS, given a
      specific input, by averaging over multiple samples from the approx
      posterior. Also used for the lower bound on the negative
      log-likelihood using IWAE error (Importance Weighted Auto-encoder).
      This is the denoising operation.
    posterior_push_mean - push the posterior means (rather than samples)
      through the decoder.
    prior_sample - a model for generation; sampling from the priors is used.
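
    A minimal usage sketch (hps and datasets are assumed to be constructed by
    the caller, e.g. by the run script; names below are illustrative only):

      with tf.variable_scope("LFADS", reuse=None):
        model = LFADS(hps, kind="train", datasets=datasets)
      with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        model.train_model(datasets)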
Args:
hps: The dictionary of hyper parameters.
kind: the type of model to build (see above).
datasets: a dictionary of named data_dictionaries, see top of lfads.py
"""
print("Building graph...")
all_kinds = ['train', 'posterior_sample_and_average', 'posterior_push_mean',
'prior_sample']
assert kind in all_kinds, 'Wrong kind'
if hps.feedback_factors_or_rates == "rates":
assert len(hps.dataset_names) == 1, \
"Multiple datasets not supported for rate feedback."
num_steps = hps.num_steps
ic_dim = hps.ic_dim
co_dim = hps.co_dim
ext_input_dim = hps.ext_input_dim
cell_class = GRU
gen_cell_class = GenGRU
def makelambda(v): # Used with tf.case
return lambda: v
# Define the data placeholder, and deal with all parts of the graph
# that are dataset dependent.
self.dataName = tf.placeholder(tf.string, shape=())
# The batch_size to be inferred from data, as normal.
# Additionally, the data_dim will be inferred as well, allowing for a
# single placeholder for all datasets, regardless of data dimension.
if hps.output_dist == 'poisson':
# Enforce correct dtype
assert np.issubdtype(
datasets[hps.dataset_names[0]]['train_data'].dtype, int), \
"Data dtype must be int for poisson output distribution"
data_dtype = tf.int32
elif hps.output_dist == 'gaussian':
assert np.issubdtype(
datasets[hps.dataset_names[0]]['train_data'].dtype, float), \
"Data dtype must be float for gaussian output dsitribution"
data_dtype = tf.float32
else:
assert False, "NIY"
self.dataset_ph = dataset_ph = tf.placeholder(data_dtype,
[None, num_steps, None],
name="data")
self.train_step = tf.get_variable("global_step", [], tf.int64,
tf.zeros_initializer(),
trainable=False)
self.hps = hps
ndatasets = hps.ndatasets
factors_dim = hps.factors_dim
self.preds = preds = [None] * ndatasets
self.fns_in_fac_Ws = fns_in_fac_Ws = [None] * ndatasets
self.fns_in_fatcor_bs = fns_in_fac_bs = [None] * ndatasets
self.fns_out_fac_Ws = fns_out_fac_Ws = [None] * ndatasets
self.fns_out_fac_bs = fns_out_fac_bs = [None] * ndatasets
self.datasetNames = dataset_names = hps.dataset_names
self.ext_inputs = ext_inputs = None
if len(dataset_names) == 1: # single session
if 'alignment_matrix_cxf' in datasets[dataset_names[0]].keys():
used_in_factors_dim = factors_dim
in_identity_if_poss = False
else:
used_in_factors_dim = hps.dataset_dims[dataset_names[0]]
in_identity_if_poss = True
else: # multisession
used_in_factors_dim = factors_dim
in_identity_if_poss = False
for d, name in enumerate(dataset_names):
data_dim = hps.dataset_dims[name]
in_mat_cxf = None
in_bias_1xf = None
align_bias_1xc = None
if datasets and 'alignment_matrix_cxf' in datasets[name].keys():
dataset = datasets[name]
if hps.do_train_readin:
print("Initializing trainable readin matrix with alignment matrix" \
" provided for dataset:", name)
else:
print("Setting non-trainable readin matrix to alignment matrix" \
" provided for dataset:", name)
in_mat_cxf = dataset['alignment_matrix_cxf'].astype(np.float32)
if in_mat_cxf.shape != (data_dim, factors_dim):
raise ValueError("""Alignment matrix must have dimensions %d x %d
(data_dim x factors_dim), but currently has %d x %d."""%
(data_dim, factors_dim, in_mat_cxf.shape[0],
in_mat_cxf.shape[1]))
if datasets and 'alignment_bias_c' in datasets[name].keys():
dataset = datasets[name]
if hps.do_train_readin:
print("Initializing trainable readin bias with alignment bias " \
"provided for dataset:", name)
else:
print("Setting non-trainable readin bias to alignment bias " \
"provided for dataset:", name)
align_bias_c = dataset['alignment_bias_c'].astype(np.float32)
align_bias_1xc = np.expand_dims(align_bias_c, axis=0)
if align_bias_1xc.shape[1] != data_dim:
raise ValueError("""Alignment bias must have dimensions %d
(data_dim), but currently has %d."""%
                         (data_dim, align_bias_1xc.shape[1]))
if in_mat_cxf is not None and align_bias_1xc is not None:
# (data - alignment_bias) * W_in
# data * W_in - alignment_bias * W_in
# So b = -alignment_bias * W_in to accommodate PCA style offset.
in_bias_1xf = -np.dot(align_bias_1xc, in_mat_cxf)
if hps.do_train_readin:
        # Add to the IO_transformations collection only if we want the readin
        # to be learnable, because the IO_transformations collection is what
        # gets trained when do_train_io_only is set.
collections_readin=['IO_transformations']
else:
collections_readin=None
in_fac_lin = init_linear(data_dim, used_in_factors_dim,
do_bias=True,
mat_init_value=in_mat_cxf,
bias_init_value=in_bias_1xf,
identity_if_possible=in_identity_if_poss,
normalized=False, name="x_2_infac_"+name,
collections=collections_readin,
trainable=hps.do_train_readin)
in_fac_W, in_fac_b = in_fac_lin
fns_in_fac_Ws[d] = makelambda(in_fac_W)
fns_in_fac_bs[d] = makelambda(in_fac_b)
with tf.variable_scope("glm"):
out_identity_if_poss = False
if len(dataset_names) == 1 and \
factors_dim == hps.dataset_dims[dataset_names[0]]:
out_identity_if_poss = True
for d, name in enumerate(dataset_names):
data_dim = hps.dataset_dims[name]
in_mat_cxf = None
if datasets and 'alignment_matrix_cxf' in datasets[name].keys():
dataset = datasets[name]
in_mat_cxf = dataset['alignment_matrix_cxf'].astype(np.float32)
if datasets and 'alignment_bias_c' in datasets[name].keys():
dataset = datasets[name]
align_bias_c = dataset['alignment_bias_c'].astype(np.float32)
align_bias_1xc = np.expand_dims(align_bias_c, axis=0)
out_mat_fxc = None
out_bias_1xc = None
if in_mat_cxf is not None:
out_mat_fxc = in_mat_cxf.T
if align_bias_1xc is not None:
out_bias_1xc = align_bias_1xc
if hps.output_dist == 'poisson':
out_fac_lin = init_linear(factors_dim, data_dim, do_bias=True,
mat_init_value=out_mat_fxc,
bias_init_value=out_bias_1xc,
identity_if_possible=out_identity_if_poss,
normalized=False,
name="fac_2_logrates_"+name,
collections=['IO_transformations'])
out_fac_W, out_fac_b = out_fac_lin
elif hps.output_dist == 'gaussian':
out_fac_lin_mean = \
init_linear(factors_dim, data_dim, do_bias=True,
mat_init_value=out_mat_fxc,
bias_init_value=out_bias_1xc,
normalized=False,
name="fac_2_means_"+name,
collections=['IO_transformations'])
out_fac_W_mean, out_fac_b_mean = out_fac_lin_mean
mat_init_value = np.zeros([factors_dim, data_dim]).astype(np.float32)
bias_init_value = np.ones([1, data_dim]).astype(np.float32)
out_fac_lin_logvar = \
init_linear(factors_dim, data_dim, do_bias=True,
mat_init_value=mat_init_value,
bias_init_value=bias_init_value,
normalized=False,
name="fac_2_logvars_"+name,
collections=['IO_transformations'])
out_fac_W_mean, out_fac_b_mean = out_fac_lin_mean
out_fac_W_logvar, out_fac_b_logvar = out_fac_lin_logvar
out_fac_W = tf.concat(
axis=1, values=[out_fac_W_mean, out_fac_W_logvar])
out_fac_b = tf.concat(
axis=1, values=[out_fac_b_mean, out_fac_b_logvar])
else:
assert False, "NIY"
preds[d] = tf.equal(tf.constant(name), self.dataName)
data_dim = hps.dataset_dims[name]
fns_out_fac_Ws[d] = makelambda(out_fac_W)
fns_out_fac_bs[d] = makelambda(out_fac_b)
pf_pairs_in_fac_Ws = zip(preds, fns_in_fac_Ws)
pf_pairs_in_fac_bs = zip(preds, fns_in_fac_bs)
pf_pairs_out_fac_Ws = zip(preds, fns_out_fac_Ws)
pf_pairs_out_fac_bs = zip(preds, fns_out_fac_bs)
this_in_fac_W = tf.case(pf_pairs_in_fac_Ws, exclusive=True)
this_in_fac_b = tf.case(pf_pairs_in_fac_bs, exclusive=True)
this_out_fac_W = tf.case(pf_pairs_out_fac_Ws, exclusive=True)
this_out_fac_b = tf.case(pf_pairs_out_fac_bs, exclusive=True)
# External inputs (not changing by dataset, by definition).
if hps.ext_input_dim > 0:
self.ext_input = tf.placeholder(tf.float32,
[None, num_steps, ext_input_dim],
name="ext_input")
else:
self.ext_input = None
ext_input_bxtxi = self.ext_input
self.keep_prob = keep_prob = tf.placeholder(tf.float32, [], "keep_prob")
self.batch_size = batch_size = int(hps.batch_size)
self.learning_rate = tf.Variable(float(hps.learning_rate_init),
trainable=False, name="learning_rate")
self.learning_rate_decay_op = self.learning_rate.assign(
self.learning_rate * hps.learning_rate_decay_factor)
# Dropout the data.
dataset_do_bxtxd = tf.nn.dropout(tf.to_float(dataset_ph), keep_prob)
if hps.ext_input_dim > 0:
ext_input_do_bxtxi = tf.nn.dropout(ext_input_bxtxi, keep_prob)
else:
ext_input_do_bxtxi = None
# ENCODERS
def encode_data(dataset_bxtxd, enc_cell, name, forward_or_reverse,
num_steps_to_encode):
"""Encode data for LFADS
Args:
        dataset_bxtxd - the data to encode, as a 3-tensor, with dims
          batch x time x data dims.
enc_cell: encoder cell
name: name of encoder
forward_or_reverse: string, encode in forward or reverse direction
num_steps_to_encode: number of steps to encode, 0:num_steps_to_encode
Returns:
encoded data as a list with num_steps_to_encode items, in order
"""
if forward_or_reverse == "forward":
dstr = "_fwd"
time_fwd_or_rev = range(num_steps_to_encode)
else:
dstr = "_rev"
time_fwd_or_rev = reversed(range(num_steps_to_encode))
with tf.variable_scope(name+"_enc"+dstr, reuse=False):
enc_state = tf.tile(
tf.Variable(tf.zeros([1, enc_cell.state_size]),
name=name+"_enc_t0"+dstr), tf.stack([batch_size, 1]))
enc_state.set_shape([None, enc_cell.state_size]) # tile loses shape
enc_outs = [None] * num_steps_to_encode
for i, t in enumerate(time_fwd_or_rev):
with tf.variable_scope(name+"_enc"+dstr, reuse=True if i > 0 else None):
dataset_t_bxd = dataset_bxtxd[:,t,:]
in_fac_t_bxf = tf.matmul(dataset_t_bxd, this_in_fac_W) + this_in_fac_b
in_fac_t_bxf.set_shape([None, used_in_factors_dim])
if ext_input_dim > 0 and not hps.inject_ext_input_to_gen:
ext_input_t_bxi = ext_input_do_bxtxi[:,t,:]
enc_input_t_bxfpe = tf.concat(
axis=1, values=[in_fac_t_bxf, ext_input_t_bxi])
else:
enc_input_t_bxfpe = in_fac_t_bxf
enc_out, enc_state = enc_cell(enc_input_t_bxfpe, enc_state)
enc_outs[t] = enc_out
return enc_outs
# Encode initial condition means and variances
# ([x_T, x_T-1, ... x_0] and [x_0, x_1, ... x_T] -> g0/c0)
self.ic_enc_fwd = [None] * num_steps
self.ic_enc_rev = [None] * num_steps
if ic_dim > 0:
enc_ic_cell = cell_class(hps.ic_enc_dim,
weight_scale=hps.cell_weight_scale,
clip_value=hps.cell_clip_value)
ic_enc_fwd = encode_data(dataset_do_bxtxd, enc_ic_cell,
"ic", "forward",
hps.num_steps_for_gen_ic)
ic_enc_rev = encode_data(dataset_do_bxtxd, enc_ic_cell,
"ic", "reverse",
hps.num_steps_for_gen_ic)
self.ic_enc_fwd = ic_enc_fwd
self.ic_enc_rev = ic_enc_rev
# Encoder control input means and variances, bi-directional encoding so:
# ([x_T, x_T-1, ..., x_0] and [x_0, x_1 ... x_T] -> u_t)
self.ci_enc_fwd = [None] * num_steps
self.ci_enc_rev = [None] * num_steps
if co_dim > 0:
enc_ci_cell = cell_class(hps.ci_enc_dim,
weight_scale=hps.cell_weight_scale,
clip_value=hps.cell_clip_value)
ci_enc_fwd = encode_data(dataset_do_bxtxd, enc_ci_cell,
"ci", "forward",
hps.num_steps)
if hps.do_causal_controller:
ci_enc_rev = None
else:
ci_enc_rev = encode_data(dataset_do_bxtxd, enc_ci_cell,
"ci", "reverse",
hps.num_steps)
self.ci_enc_fwd = ci_enc_fwd
self.ci_enc_rev = ci_enc_rev
# STOCHASTIC LATENT VARIABLES, priors and posteriors
# (initial conditions g0, and control inputs, u_t)
# Note that zs represent all the stochastic latent variables.
with tf.variable_scope("z", reuse=False):
self.prior_zs_g0 = None
self.posterior_zs_g0 = None
self.g0s_val = None
if ic_dim > 0:
self.prior_zs_g0 = \
LearnableDiagonalGaussian(batch_size, ic_dim, name="prior_g0",
mean_init=0.0,
var_min=hps.ic_prior_var_min,
var_init=hps.ic_prior_var_scale,
var_max=hps.ic_prior_var_max)
ic_enc = tf.concat(axis=1, values=[ic_enc_fwd[-1], ic_enc_rev[0]])
ic_enc = tf.nn.dropout(ic_enc, keep_prob)
self.posterior_zs_g0 = \
DiagonalGaussianFromInput(ic_enc, ic_dim, "ic_enc_2_post_g0",
var_min=hps.ic_post_var_min)
if kind in ["train", "posterior_sample_and_average",
"posterior_push_mean"]:
zs_g0 = self.posterior_zs_g0
else:
zs_g0 = self.prior_zs_g0
if kind in ["train", "posterior_sample_and_average", "prior_sample"]:
self.g0s_val = zs_g0.sample
else:
self.g0s_val = zs_g0.mean
# Priors for controller, 'co' for controller output
self.prior_zs_co = prior_zs_co = [None] * num_steps
self.posterior_zs_co = posterior_zs_co = [None] * num_steps
self.zs_co = zs_co = [None] * num_steps
self.prior_zs_ar_con = None
if co_dim > 0:
# Controller outputs
autocorrelation_taus = [hps.prior_ar_atau for x in range(hps.co_dim)]
noise_variances = [hps.prior_ar_nvar for x in range(hps.co_dim)]
self.prior_zs_ar_con = prior_zs_ar_con = \
LearnableAutoRegressive1Prior(batch_size, hps.co_dim,
autocorrelation_taus,
noise_variances,
hps.do_train_prior_ar_atau,
hps.do_train_prior_ar_nvar,
num_steps, "u_prior_ar1")
# CONTROLLER -> GENERATOR -> RATES
# (u(t) -> gen(t) -> factors(t) -> rates(t) -> p(x_t|z_t) )
self.controller_outputs = u_t = [None] * num_steps
self.con_ics = con_state = None
self.con_states = con_states = [None] * num_steps
self.con_outs = con_outs = [None] * num_steps
self.gen_inputs = gen_inputs = [None] * num_steps
if co_dim > 0:
# gen_cell_class here for l2 penalty recurrent weights
# didn't split the cell_weight scale here, because I doubt it matters
con_cell = gen_cell_class(hps.con_dim,
input_weight_scale=hps.cell_weight_scale,
rec_weight_scale=hps.cell_weight_scale,
clip_value=hps.cell_clip_value,
recurrent_collections=['l2_con_reg'])
with tf.variable_scope("con", reuse=False):
self.con_ics = tf.tile(
tf.Variable(tf.zeros([1, hps.con_dim*con_cell.state_multiplier]),
name="c0"),
tf.stack([batch_size, 1]))
self.con_ics.set_shape([None, con_cell.state_size]) # tile loses shape
con_states[-1] = self.con_ics
gen_cell = gen_cell_class(hps.gen_dim,
input_weight_scale=hps.gen_cell_input_weight_scale,
rec_weight_scale=hps.gen_cell_rec_weight_scale,
clip_value=hps.cell_clip_value,
recurrent_collections=['l2_gen_reg'])
with tf.variable_scope("gen", reuse=False):
if ic_dim == 0:
self.gen_ics = tf.tile(
tf.Variable(tf.zeros([1, gen_cell.state_size]), name="g0"),
tf.stack([batch_size, 1]))
else:
self.gen_ics = linear(self.g0s_val, gen_cell.state_size,
identity_if_possible=True,
name="g0_2_gen_ic")
self.gen_states = gen_states = [None] * num_steps
self.gen_outs = gen_outs = [None] * num_steps
gen_states[-1] = self.gen_ics
gen_outs[-1] = gen_cell.output_from_state(gen_states[-1])
self.factors = factors = [None] * num_steps
factors[-1] = linear(gen_outs[-1], factors_dim, do_bias=False,
normalized=True, name="gen_2_fac")
self.rates = rates = [None] * num_steps
# rates[-1] is collected to potentially feed back to controller
with tf.variable_scope("glm", reuse=False):
if hps.output_dist == 'poisson':
log_rates_t0 = tf.matmul(factors[-1], this_out_fac_W) + this_out_fac_b
log_rates_t0.set_shape([None, None])
rates[-1] = tf.exp(log_rates_t0) # rate
rates[-1].set_shape([None, hps.dataset_dims[hps.dataset_names[0]]])
elif hps.output_dist == 'gaussian':
mean_n_logvars = tf.matmul(factors[-1],this_out_fac_W) + this_out_fac_b
mean_n_logvars.set_shape([None, None])
means_t_bxd, logvars_t_bxd = tf.split(axis=1, num_or_size_splits=2,
value=mean_n_logvars)
rates[-1] = means_t_bxd
else:
assert False, "NIY"
# We support multiple output distributions, for example Poisson, and also
# Gaussian. In these two cases respectively, there are one and two
    # parameters (rates vs. mean and variance). So the output_dist_params
    # tensor will have variable sizes via tf.concat and tf.split, along the
    # 1st dimension. In the case of gaussian, for example, it'll be
    # batch x (D+D), where the first D dims are the means and the next D dims
    # the variances. For a distribution with 3 parameters, it would be
    # batch x (D+D+D).
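    # For example (hypothetical shapes): with a gaussian output distribution
    # and D data dimensions, dist_params[t] below is batch x 2D, and
    #   means, variances = tf.split(axis=1, num_or_size_splits=2,
    #                               value=dist_params[t])
    # recovers the batch x D means and batch x D variances.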
self.output_dist_params = dist_params = [None] * num_steps
self.log_p_xgz_b = log_p_xgz_b = 0.0 # log P(x|z)
for t in range(num_steps):
# Controller
if co_dim > 0:
# Build inputs for controller
tlag = t - hps.controller_input_lag
if tlag < 0:
con_in_f_t = tf.zeros_like(ci_enc_fwd[0])
else:
con_in_f_t = ci_enc_fwd[tlag]
if hps.do_causal_controller:
          # If the controller is causal (with respect to the data generation
          # process), then it cannot see future data. Thus, excluding
          # ci_enc_rev[t] is obvious.
# Less obvious is the need to exclude factors[t-1]. This arises
# because information flows from g0 through factors to the controller
# input. The g0 encoding is backwards, so we must necessarily exclude
# the factors in order to keep the controller input purely from a
# forward encoding (however unlikely it is that
# g0->factors->controller channel might actually be used in this way).
con_in_list_t = [con_in_f_t]
else:
tlag_rev = t + hps.controller_input_lag
if tlag_rev >= num_steps:
# better than zeros
con_in_r_t = tf.zeros_like(ci_enc_rev[0])
else:
con_in_r_t = ci_enc_rev[tlag_rev]
con_in_list_t = [con_in_f_t, con_in_r_t]
if hps.do_feed_factors_to_controller:
if hps.feedback_factors_or_rates == "factors":
con_in_list_t.append(factors[t-1])
elif hps.feedback_factors_or_rates == "rates":
con_in_list_t.append(rates[t-1])
else:
assert False, "NIY"
con_in_t = tf.concat(axis=1, values=con_in_list_t)
con_in_t = tf.nn.dropout(con_in_t, keep_prob)
with tf.variable_scope("con", reuse=True if t > 0 else None):
con_outs[t], con_states[t] = con_cell(con_in_t, con_states[t-1])
posterior_zs_co[t] = \
DiagonalGaussianFromInput(con_outs[t], co_dim,
name="con_to_post_co")
if kind == "train":
u_t[t] = posterior_zs_co[t].sample
elif kind == "posterior_sample_and_average":
u_t[t] = posterior_zs_co[t].sample
elif kind == "posterior_push_mean":
u_t[t] = posterior_zs_co[t].mean
else:
u_t[t] = prior_zs_ar_con.samples_t[t]
# Inputs to the generator (controller output + external input)
if ext_input_dim > 0 and hps.inject_ext_input_to_gen:
ext_input_t_bxi = ext_input_do_bxtxi[:,t,:]
if co_dim > 0:
gen_inputs[t] = tf.concat(axis=1, values=[u_t[t], ext_input_t_bxi])
else:
gen_inputs[t] = ext_input_t_bxi
else:
gen_inputs[t] = u_t[t]
# Generator
data_t_bxd = dataset_ph[:,t,:]
with tf.variable_scope("gen", reuse=True if t > 0 else None):
gen_outs[t], gen_states[t] = gen_cell(gen_inputs[t], gen_states[t-1])
gen_outs[t] = tf.nn.dropout(gen_outs[t], keep_prob)
with tf.variable_scope("gen", reuse=True): # ic defined it above
factors[t] = linear(gen_outs[t], factors_dim, do_bias=False,
normalized=True, name="gen_2_fac")
with tf.variable_scope("glm", reuse=True if t > 0 else None):
if hps.output_dist == 'poisson':
log_rates_t = tf.matmul(factors[t], this_out_fac_W) + this_out_fac_b
log_rates_t.set_shape([None, None])
rates[t] = dist_params[t] = tf.exp(tf.clip_by_value(log_rates_t, -hps._clip_value, hps._clip_value)) # rates feed back
rates[t].set_shape([None, hps.dataset_dims[hps.dataset_names[0]]])
loglikelihood_t = Poisson(log_rates_t).logp(data_t_bxd)
elif hps.output_dist == 'gaussian':
mean_n_logvars = tf.matmul(factors[t],this_out_fac_W) + this_out_fac_b
mean_n_logvars.set_shape([None, None])
means_t_bxd, logvars_t_bxd = tf.split(axis=1, num_or_size_splits=2,
value=mean_n_logvars)
rates[t] = means_t_bxd # rates feed back to controller
dist_params[t] = tf.concat(
axis=1, values=[means_t_bxd, tf.exp(tf.clip_by_value(logvars_t_bxd, -hps._clip_value, hps._clip_value))])
loglikelihood_t = \
diag_gaussian_log_likelihood(data_t_bxd,
means_t_bxd, logvars_t_bxd)
else:
assert False, "NIY"
log_p_xgz_b += tf.reduce_sum(loglikelihood_t, [1])
# Correlation of inferred inputs cost.
self.corr_cost = tf.constant(0.0)
if hps.co_mean_corr_scale > 0.0:
all_sum_corr = []
for i in range(hps.co_dim):
for j in range(i+1, hps.co_dim):
sum_corr_ij = tf.constant(0.0)
for t in range(num_steps):
u_mean_t = posterior_zs_co[t].mean
sum_corr_ij += u_mean_t[:,i]*u_mean_t[:,j]
all_sum_corr.append(0.5 * tf.square(sum_corr_ij))
self.corr_cost = tf.reduce_mean(all_sum_corr) # div by batch and by n*(n-1)/2 pairs
# Variational Lower Bound on posterior, p(z|x), plus reconstruction cost.
# KL and reconstruction costs are normalized only by batch size, not by
# dimension, or by time steps.
kl_cost_g0_b = tf.zeros_like(batch_size, dtype=tf.float32)
kl_cost_co_b = tf.zeros_like(batch_size, dtype=tf.float32)
self.kl_cost = tf.constant(0.0) # VAE KL cost
self.recon_cost = tf.constant(0.0) # VAE reconstruction cost
self.nll_bound_vae = tf.constant(0.0)
self.nll_bound_iwae = tf.constant(0.0) # for eval with IWAE cost.
if kind in ["train", "posterior_sample_and_average", "posterior_push_mean"]:
kl_cost_g0_b = 0.0
kl_cost_co_b = 0.0
if ic_dim > 0:
g0_priors = [self.prior_zs_g0]
g0_posts = [self.posterior_zs_g0]
kl_cost_g0_b = KLCost_GaussianGaussian(g0_posts, g0_priors).kl_cost_b
kl_cost_g0_b = hps.kl_ic_weight * kl_cost_g0_b
if co_dim > 0:
kl_cost_co_b = \
KLCost_GaussianGaussianProcessSampled(
posterior_zs_co, prior_zs_ar_con).kl_cost_b
kl_cost_co_b = hps.kl_co_weight * kl_cost_co_b
# L = -KL + log p(x|z), to maximize bound on likelihood
# -L = KL - log p(x|z), to minimize bound on NLL
# so 'reconstruction cost' is negative log likelihood
self.recon_cost = - tf.reduce_mean(log_p_xgz_b)
self.kl_cost = tf.reduce_mean(kl_cost_g0_b + kl_cost_co_b)
lb_on_ll_b = log_p_xgz_b - kl_cost_g0_b - kl_cost_co_b
# VAE error averages outside the log
self.nll_bound_vae = -tf.reduce_mean(lb_on_ll_b)
# IWAE error averages inside the log
k = tf.cast(tf.shape(log_p_xgz_b)[0], tf.float32)
iwae_lb_on_ll = -tf.log(k) + log_sum_exp(lb_on_ll_b)
self.nll_bound_iwae = -iwae_lb_on_ll
# L2 regularization on the generator, normalized by number of parameters.
self.l2_cost = tf.constant(0.0)
if self.hps.l2_gen_scale > 0.0 or self.hps.l2_con_scale > 0.0:
l2_costs = []
l2_numels = []
l2_reg_var_lists = [tf.get_collection('l2_gen_reg'),
tf.get_collection('l2_con_reg')]
l2_reg_scales = [self.hps.l2_gen_scale, self.hps.l2_con_scale]
for l2_reg_vars, l2_scale in zip(l2_reg_var_lists, l2_reg_scales):
for v in l2_reg_vars:
numel = tf.reduce_prod(tf.concat(axis=0, values=tf.shape(v)))
numel_f = tf.cast(numel, tf.float32)
l2_numels.append(numel_f)
v_l2 = tf.reduce_sum(v*v)
l2_costs.append(0.5 * l2_scale * v_l2)
self.l2_cost = tf.add_n(l2_costs) / tf.add_n(l2_numels)
# Compute the cost for training, part of the graph regardless.
# The KL cost can be problematic at the beginning of optimization,
    # so we allow a gradual (linearly ramped) increase in weighting the KL
    # from 0 to 1.
self.kl_decay_step = tf.maximum(self.train_step - hps.kl_start_step, 0)
self.l2_decay_step = tf.maximum(self.train_step - hps.l2_start_step, 0)
kl_decay_step_f = tf.cast(self.kl_decay_step, tf.float32)
l2_decay_step_f = tf.cast(self.l2_decay_step, tf.float32)
kl_increase_steps_f = tf.cast(hps.kl_increase_steps, tf.float32)
l2_increase_steps_f = tf.cast(hps.l2_increase_steps, tf.float32)
self.kl_weight = kl_weight = \
tf.minimum(kl_decay_step_f / kl_increase_steps_f, 1.0)
self.l2_weight = l2_weight = \
tf.minimum(l2_decay_step_f / l2_increase_steps_f, 1.0)
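    # E.g. (illustrative numbers): with kl_start_step=0 and
    # kl_increase_steps=2000, kl_weight ramps linearly from 0.0 at step 0 to
    # 1.0 at step 2000 and stays at 1.0 thereafter; the L2 weight ramps the
    # same way with its own start/increase hyperparameters.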
self.timed_kl_cost = kl_weight * self.kl_cost
self.timed_l2_cost = l2_weight * self.l2_cost
self.weight_corr_cost = hps.co_mean_corr_scale * self.corr_cost
self.cost = self.recon_cost + self.timed_kl_cost + \
self.timed_l2_cost + self.weight_corr_cost
if kind != "train":
# save every so often
self.seso_saver = tf.train.Saver(tf.global_variables(),
max_to_keep=hps.max_ckpt_to_keep)
# lowest validation error
self.lve_saver = tf.train.Saver(tf.global_variables(),
max_to_keep=hps.max_ckpt_to_keep_lve)
return
# OPTIMIZATION
# train the io matrices only
if self.hps.do_train_io_only:
self.train_vars = tvars = \
tf.get_collection('IO_transformations',
scope=tf.get_variable_scope().name)
# train the encoder only
elif self.hps.do_train_encoder_only:
tvars1 = \
tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
scope='LFADS/ic_enc_*')
tvars2 = \
tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
scope='LFADS/z/ic_enc_*')
self.train_vars = tvars = tvars1 + tvars2
# train all variables
else:
self.train_vars = tvars = \
tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
scope=tf.get_variable_scope().name)
print("done.")
print("Model Variables (to be optimized): ")
total_params = 0
for i in range(len(tvars)):
shape = tvars[i].get_shape().as_list()
print(" ", i, tvars[i].name, shape)
total_params += np.prod(shape)
print("Total model parameters: ", total_params)
grads = tf.gradients(self.cost, tvars)
grads, grad_global_norm = tf.clip_by_global_norm(grads, hps.max_grad_norm)
opt = tf.train.AdamOptimizer(self.learning_rate, beta1=0.9, beta2=0.999,
epsilon=1e-01)
self.grads = grads
self.grad_global_norm = grad_global_norm
self.train_op = opt.apply_gradients(
zip(grads, tvars), global_step=self.train_step)
self.seso_saver = tf.train.Saver(tf.global_variables(),
max_to_keep=hps.max_ckpt_to_keep)
# lowest validation error
self.lve_saver = tf.train.Saver(tf.global_variables(),
                                    max_to_keep=hps.max_ckpt_to_keep_lve)
# SUMMARIES, used only during training.
# example summary
self.example_image = tf.placeholder(tf.float32, shape=[1,None,None,3],
name='image_tensor')
self.example_summ = tf.summary.image("LFADS example", self.example_image,
collections=["example_summaries"])
# general training summaries
self.lr_summ = tf.summary.scalar("Learning rate", self.learning_rate)
self.kl_weight_summ = tf.summary.scalar("KL weight", self.kl_weight)
self.l2_weight_summ = tf.summary.scalar("L2 weight", self.l2_weight)
self.corr_cost_summ = tf.summary.scalar("Corr cost", self.weight_corr_cost)
self.grad_global_norm_summ = tf.summary.scalar("Gradient global norm",
self.grad_global_norm)
if hps.co_dim > 0:
self.atau_summ = [None] * hps.co_dim
self.pvar_summ = [None] * hps.co_dim
for c in range(hps.co_dim):
self.atau_summ[c] = \
tf.summary.scalar("AR Autocorrelation taus " + str(c),
tf.exp(self.prior_zs_ar_con.logataus_1xu[0,c]))
self.pvar_summ[c] = \
tf.summary.scalar("AR Variances " + str(c),
tf.exp(self.prior_zs_ar_con.logpvars_1xu[0,c]))
# cost summaries, separated into different collections for
# training vs validation. We make placeholders for these, because
# even though the graph computes these costs on a per-batch basis,
# we want to report the more reliable metric of per-epoch cost.
kl_cost_ph = tf.placeholder(tf.float32, shape=[], name='kl_cost_ph')
self.kl_t_cost_summ = tf.summary.scalar("KL cost (train)", kl_cost_ph,
collections=["train_summaries"])
self.kl_v_cost_summ = tf.summary.scalar("KL cost (valid)", kl_cost_ph,
collections=["valid_summaries"])
l2_cost_ph = tf.placeholder(tf.float32, shape=[], name='l2_cost_ph')
self.l2_cost_summ = tf.summary.scalar("L2 cost", l2_cost_ph,
collections=["train_summaries"])
recon_cost_ph = tf.placeholder(tf.float32, shape=[], name='recon_cost_ph')
self.recon_t_cost_summ = tf.summary.scalar("Reconstruction cost (train)",
recon_cost_ph,
collections=["train_summaries"])
self.recon_v_cost_summ = tf.summary.scalar("Reconstruction cost (valid)",
recon_cost_ph,
collections=["valid_summaries"])
total_cost_ph = tf.placeholder(tf.float32, shape=[], name='total_cost_ph')
self.cost_t_summ = tf.summary.scalar("Total cost (train)", total_cost_ph,
collections=["train_summaries"])
self.cost_v_summ = tf.summary.scalar("Total cost (valid)", total_cost_ph,
collections=["valid_summaries"])
self.kl_cost_ph = kl_cost_ph
self.l2_cost_ph = l2_cost_ph
self.recon_cost_ph = recon_cost_ph
self.total_cost_ph = total_cost_ph
# Merged summaries, for easy coding later.
self.merged_examples = tf.summary.merge_all(key="example_summaries")
self.merged_generic = tf.summary.merge_all() # default key is 'summaries'
self.merged_train = tf.summary.merge_all(key="train_summaries")
self.merged_valid = tf.summary.merge_all(key="valid_summaries")
session = tf.get_default_session()
self.logfile = os.path.join(hps.lfads_save_dir, "lfads_log")
self.writer = tf.summary.FileWriter(self.logfile)
def build_feed_dict(self, train_name, data_bxtxd, ext_input_bxtxi=None,
keep_prob=None):
"""Build the feed dictionary, handles cases where there is no value defined.
Args:
train_name: The key into the datasets, to set the tf.case statement for
the proper readin / readout matrices.
data_bxtxd: The data tensor
ext_input_bxtxi (optional): The external input tensor
keep_prob: The drop out keep probability.
Returns:
The feed dictionary with TF tensors as keys and data as values, for use
with tf.Session.run()
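
    A hypothetical usage sketch (model is an LFADS instance, sess the default
    session; names illustrative only):

      fd = model.build_feed_dict('dataset1', data_bxtxd, keep_prob=1.0)
      cost, recon = sess.run([model.cost, model.recon_cost], feed_dict=fd)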
"""
feed_dict = {}
B, T, _ = data_bxtxd.shape
feed_dict[self.dataName] = train_name
feed_dict[self.dataset_ph] = data_bxtxd
if self.ext_input is not None and ext_input_bxtxi is not None:
feed_dict[self.ext_input] = ext_input_bxtxi
if keep_prob is None:
feed_dict[self.keep_prob] = self.hps.keep_prob
else:
feed_dict[self.keep_prob] = keep_prob
return feed_dict
@staticmethod
def get_batch(data_extxd, ext_input_extxi=None, batch_size=None,
example_idxs=None):
"""Get a batch of data, either randomly chosen, or specified directly.
Args:
data_extxd: The data to model, numpy tensors with shape:
# examples x # time steps x # dimensions
ext_input_extxi (optional): The external inputs, numpy tensor with shape:
# examples x # time steps x # external input dimensions
batch_size: The size of the batch to return
example_idxs (optional): The example indices used to select examples.
Returns:
A tuple with two parts:
1. Batched data numpy tensor with shape:
batch_size x # time steps x # dimensions
2. Batched external input numpy tensor with shape:
batch_size x # time steps x # external input dims
"""
assert batch_size is not None or example_idxs is not None, "Problems"
E, T, D = data_extxd.shape
if example_idxs is None:
example_idxs = np.random.choice(E, batch_size)
ext_input_bxtxi = None
if ext_input_extxi is not None:
ext_input_bxtxi = ext_input_extxi[example_idxs,:,:]
return data_extxd[example_idxs,:,:], ext_input_bxtxi
@staticmethod
def example_idxs_mod_batch_size(nexamples, batch_size):
"""Given a number of examples, E, and a batch_size, B, generate indices
[0, 1, 2, ... B-1;
[B, B+1, ... 2*B-1;
...
]
    returning those indices as a 2-dim tensor shaped like E/B x B. Note that
    shape is only correct if E % B == 0. If not, then an extra row is generated
    so that the remainder of examples is included. The extra slots in that row
    are filled with randomly chosen (sorted) example indices; see
    randomize_example_idxs_mod_batch_size for fully randomized behavior.
Args:
nexamples: The number of examples to batch up.
batch_size: The size of the batch.
Returns:
2-dim tensor as described above.
"""
bmrem = batch_size - (nexamples % batch_size)
bmrem_examples = []
if bmrem < batch_size:
#bmrem_examples = np.zeros(bmrem, dtype=np.int32)
ridxs = np.random.permutation(nexamples)[0:bmrem].astype(np.int32)
bmrem_examples = np.sort(ridxs)
    example_idxs = list(range(nexamples)) + list(bmrem_examples)
example_idxs_e_x_edivb = np.reshape(example_idxs, [-1, batch_size])
return example_idxs_e_x_edivb, bmrem
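  # Worked example for example_idxs_mod_batch_size (illustrative numbers):
  # nexamples=10, batch_size=4 gives bmrem=2, so the returned indices are
  # [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, r0, r1]] where r0 <= r1 are two
  # randomly chosen indices from 0..9, and bmrem=2 is also returned so the
  # caller knows how many examples in the last row are duplicates.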
@staticmethod
def randomize_example_idxs_mod_batch_size(nexamples, batch_size):
"""Indices 1:nexamples, randomized, in 2D form of
shape = (nexamples / batch_size) x batch_size. The remainder
is managed by drawing randomly from 1:nexamples.
Args:
nexamples: number of examples to randomize
batch_size: number of elements in batch
Returns:
      The randomized, properly shaped indices.
"""
assert nexamples > batch_size, "Problems"
bmrem = batch_size - nexamples % batch_size
bmrem_examples = []
if bmrem < batch_size:
bmrem_examples = np.random.choice(range(nexamples),
size=bmrem, replace=False)
    example_idxs = list(range(nexamples)) + list(bmrem_examples)
mixed_example_idxs = np.random.permutation(example_idxs)
example_idxs_e_x_edivb = np.reshape(mixed_example_idxs, [-1, batch_size])
return example_idxs_e_x_edivb, bmrem
def shuffle_spikes_in_time(self, data_bxtxd):
"""Shuffle the spikes in the temporal dimension. This is useful to
help the LFADS system avoid overfitting to individual spikes or fast
oscillations found in the data that are irrelevant to behavior. A
pure 'tabula rasa' approach would avoid this, but LFADS is sensitive
enough to pick up dynamics that you may not want.
Args:
data_bxtxd: numpy array of spike count data to be shuffled.
Returns:
S_bxtxd, a numpy array with the same dimensions and contents as
data_bxtxd, but shuffled appropriately.
"""
B, T, N = data_bxtxd.shape
w = self.hps.temporal_spike_jitter_width
if w == 0:
return data_bxtxd
max_counts = np.max(data_bxtxd)
S_bxtxd = np.zeros([B,T,N])
    # Intuitively, shuffle spike occurrences (0 or 1), but since we have
    # counts, do it over and over again up to the max count.
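    # E.g. a time bin with a count of 3 is included in the mc = 1, 2 and 3
    # passes, so each of its three spikes gets an independent temporal jitter.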
for mc in range(1,max_counts+1):
idxs = np.nonzero(data_bxtxd >= mc)
data_ones = np.zeros_like(data_bxtxd)
data_ones[data_bxtxd >= mc] = 1
nfound = len(idxs[0])
shuffles_incrs_in_time = np.random.randint(-w, w, size=nfound)
shuffle_tidxs = idxs[1].copy()
shuffle_tidxs += shuffles_incrs_in_time
# Reflect on the boundaries to not lose mass.
shuffle_tidxs[shuffle_tidxs < 0] = -shuffle_tidxs[shuffle_tidxs < 0]
shuffle_tidxs[shuffle_tidxs > T-1] = \
(T-1)-(shuffle_tidxs[shuffle_tidxs > T-1] -(T-1))
for iii in zip(idxs[0], shuffle_tidxs, idxs[2]):
S_bxtxd[iii] += 1
return S_bxtxd
def shuffle_and_flatten_datasets(self, datasets, kind='train'):
"""Since LFADS supports multiple datasets in the same dynamical model,
we have to be careful to use all the data in a single training epoch. But
    since the datasets may have different data dimensionality, we cannot batch
examples from data dictionaries together. Instead, we generate random
batches within each data dictionary, and then randomize these batches
while holding onto the dataname, so that when it's time to feed
the graph, the correct in/out matrices can be selected, per batch.
Args:
datasets: A dict of data dicts. The dataset dict is simply a
name(string)-> data dictionary mapping (See top of lfads.py).
kind: 'train' or 'valid'
Returns:
A flat list, in which each element is a pair ('name', indices).
"""
batch_size = self.hps.batch_size
ndatasets = len(datasets)
random_example_idxs = {}
epoch_idxs = {}
all_name_example_idx_pairs = []
kind_data = kind + '_data'
for name, data_dict in datasets.items():
nexamples, ntime, data_dim = data_dict[kind_data].shape
epoch_idxs[name] = 0
random_example_idxs, _ = \
self.randomize_example_idxs_mod_batch_size(nexamples, batch_size)
epoch_size = random_example_idxs.shape[0]
names = [name] * epoch_size
all_name_example_idx_pairs += zip(names, random_example_idxs)
np.random.shuffle(all_name_example_idx_pairs) # shuffle in place
return all_name_example_idx_pairs
def train_epoch(self, datasets, batch_size=None, do_save_ckpt=True):
"""Train the model through the entire dataset once.
Args:
datasets: A dict of data dicts. The dataset dict is simply a
name(string)-> data dictionary mapping (See top of lfads.py).
batch_size (optional): The batch_size to use
do_save_ckpt (optional): Should the routine save a checkpoint on this
training epoch?
Returns:
A tuple with 6 float values:
(total cost of the epoch, epoch reconstruction cost,
epoch kl cost, KL weight used this training epoch,
total l2 cost on generator, and the corresponding weight).
"""
ops_to_eval = [self.cost, self.recon_cost,
self.kl_cost, self.kl_weight,
self.l2_cost, self.l2_weight,
self.train_op]
collected_op_values = self.run_epoch(datasets, ops_to_eval, kind="train")
total_cost = total_recon_cost = total_kl_cost = 0.0
# normalizing by batch done in distributions.py
epoch_size = len(collected_op_values)
for op_values in collected_op_values:
total_cost += op_values[0]
total_recon_cost += op_values[1]
total_kl_cost += op_values[2]
kl_weight = collected_op_values[-1][3]
l2_cost = collected_op_values[-1][4]
l2_weight = collected_op_values[-1][5]
epoch_total_cost = total_cost / epoch_size
epoch_recon_cost = total_recon_cost / epoch_size
epoch_kl_cost = total_kl_cost / epoch_size
if do_save_ckpt:
session = tf.get_default_session()
checkpoint_path = os.path.join(self.hps.lfads_save_dir,
self.hps.checkpoint_name + '.ckpt')
self.seso_saver.save(session, checkpoint_path,
global_step=self.train_step)
return epoch_total_cost, epoch_recon_cost, epoch_kl_cost, \
kl_weight, l2_cost, l2_weight
def run_epoch(self, datasets, ops_to_eval, kind="train", batch_size=None,
do_collect=True, keep_prob=None):
"""Run the model through the entire dataset once.
Args:
datasets: A dict of data dicts. The dataset dict is simply a
name(string)-> data dictionary mapping (See top of lfads.py).
ops_to_eval: A list of tensorflow operations that will be evaluated in
the tf.session.run() call.
batch_size (optional): The batch_size to use
do_collect (optional): Should the routine collect all session.run
output as a list, and return it?
keep_prob (optional): The dropout keep probability.
Returns:
A list of lists, the internal list is the return for the ops for each
session.run() call. The outer list collects over the epoch.
"""
hps = self.hps
all_name_example_idx_pairs = \
self.shuffle_and_flatten_datasets(datasets, kind)
kind_data = kind + '_data'
kind_ext_input = kind + '_ext_input'
total_cost = total_recon_cost = total_kl_cost = 0.0
session = tf.get_default_session()
epoch_size = len(all_name_example_idx_pairs)
evaled_ops_list = []
for name, example_idxs in all_name_example_idx_pairs:
data_dict = datasets[name]
data_extxd = data_dict[kind_data]
if hps.output_dist == 'poisson' and hps.temporal_spike_jitter_width > 0:
data_extxd = self.shuffle_spikes_in_time(data_extxd)
ext_input_extxi = data_dict[kind_ext_input]
data_bxtxd, ext_input_bxtxi = self.get_batch(data_extxd, ext_input_extxi,
example_idxs=example_idxs)
feed_dict = self.build_feed_dict(name, data_bxtxd, ext_input_bxtxi,
keep_prob=keep_prob)
evaled_ops_np = session.run(ops_to_eval, feed_dict=feed_dict)
if do_collect:
evaled_ops_list.append(evaled_ops_np)
return evaled_ops_list
def summarize_all(self, datasets, summary_values):
"""Plot and summarize stuff in tensorboard.
Note that everything done in the current function is otherwise done on
a single, randomly selected dataset (except for summary_values, which are
passed in.)
Args:
      datasets: The dictionary of datasets used in the study.
summary_values: These summary values are created from the training loop,
and so summarize the entire set of datasets.
"""
hps = self.hps
tr_kl_cost = summary_values['tr_kl_cost']
tr_recon_cost = summary_values['tr_recon_cost']
tr_total_cost = summary_values['tr_total_cost']
kl_weight = summary_values['kl_weight']
l2_weight = summary_values['l2_weight']
l2_cost = summary_values['l2_cost']
has_any_valid_set = summary_values['has_any_valid_set']
i = summary_values['nepochs']
session = tf.get_default_session()
train_summ, train_step = session.run([self.merged_train,
self.train_step],
feed_dict={self.l2_cost_ph:l2_cost,
self.kl_cost_ph:tr_kl_cost,
self.recon_cost_ph:tr_recon_cost,
self.total_cost_ph:tr_total_cost})
self.writer.add_summary(train_summ, train_step)
if has_any_valid_set:
ev_kl_cost = summary_values['ev_kl_cost']
ev_recon_cost = summary_values['ev_recon_cost']
ev_total_cost = summary_values['ev_total_cost']
eval_summ = session.run(self.merged_valid,
feed_dict={self.kl_cost_ph:ev_kl_cost,
self.recon_cost_ph:ev_recon_cost,
self.total_cost_ph:ev_total_cost})
self.writer.add_summary(eval_summ, train_step)
print("Epoch:%d, step:%d (TRAIN, VALID): total: %.2f, %.2f\
recon: %.2f, %.2f, kl: %.2f, %.2f, l2: %.5f,\
kl weight: %.2f, l2 weight: %.2f" % \
(i, train_step, tr_total_cost, ev_total_cost,
tr_recon_cost, ev_recon_cost, tr_kl_cost, ev_kl_cost,
l2_cost, kl_weight, l2_weight))
csv_outstr = "epoch,%d, step,%d, total,%.2f,%.2f, \
recon,%.2f,%.2f, kl,%.2f,%.2f, l2,%.5f, \
klweight,%.2f, l2weight,%.2f\n"% \
(i, train_step, tr_total_cost, ev_total_cost,
tr_recon_cost, ev_recon_cost, tr_kl_cost, ev_kl_cost,
l2_cost, kl_weight, l2_weight)
else:
print("Epoch:%d, step:%d TRAIN: total: %.2f recon: %.2f, kl: %.2f,\
l2: %.5f, kl weight: %.2f, l2 weight: %.2f" % \
(i, train_step, tr_total_cost, tr_recon_cost, tr_kl_cost,
l2_cost, kl_weight, l2_weight))
csv_outstr = "epoch,%d, step,%d, total,%.2f, recon,%.2f, kl,%.2f, \
l2,%.5f, klweight,%.2f, l2weight,%.2f\n"% \
(i, train_step, tr_total_cost, tr_recon_cost,
tr_kl_cost, l2_cost, kl_weight, l2_weight)
if self.hps.csv_log:
csv_file = os.path.join(self.hps.lfads_save_dir, self.hps.csv_log+'.csv')
with open(csv_file, "a") as myfile:
myfile.write(csv_outstr)
def plot_single_example(self, datasets):
"""Plot an image relating to a randomly chosen, specific example. We use
posterior sample and average by taking one example, and filling a whole
batch with that example, sample from the posterior, and then average the
quantities.
"""
hps = self.hps
    all_data_names = list(datasets.keys())
    data_name = np.random.permutation(all_data_names)[0]
data_dict = datasets[data_name]
    has_valid_set = data_dict['valid_data'] is not None
cf = 1.0 # plotting concern
# posterior sample and average here
E, _, _ = data_dict['train_data'].shape
eidx = np.random.choice(E)
example_idxs = eidx * np.ones(hps.batch_size, dtype=np.int32)
train_data_bxtxd, train_ext_input_bxtxi = \
self.get_batch(data_dict['train_data'], data_dict['train_ext_input'],
example_idxs=example_idxs)
truth_train_data_bxtxd = None
if 'train_truth' in data_dict and data_dict['train_truth'] is not None:
truth_train_data_bxtxd, _ = self.get_batch(data_dict['train_truth'],
example_idxs=example_idxs)
cf = data_dict['conversion_factor']
# plotter does averaging
train_model_values = self.eval_model_runs_batch(data_name,
train_data_bxtxd,
train_ext_input_bxtxi,
do_average_batch=False)
train_step = train_model_values['train_steps']
feed_dict = self.build_feed_dict(data_name, train_data_bxtxd,
train_ext_input_bxtxi, keep_prob=1.0)
session = tf.get_default_session()
generic_summ = session.run(self.merged_generic, feed_dict=feed_dict)
self.writer.add_summary(generic_summ, train_step)
valid_data_bxtxd = valid_model_values = valid_ext_input_bxtxi = None
truth_valid_data_bxtxd = None
if has_valid_set:
E, _, _ = data_dict['valid_data'].shape
eidx = np.random.choice(E)
example_idxs = eidx * np.ones(hps.batch_size, dtype=np.int32)
valid_data_bxtxd, valid_ext_input_bxtxi = \
self.get_batch(data_dict['valid_data'],
data_dict['valid_ext_input'],
example_idxs=example_idxs)
if 'valid_truth' in data_dict and data_dict['valid_truth'] is not None:
truth_valid_data_bxtxd, _ = self.get_batch(data_dict['valid_truth'],
example_idxs=example_idxs)
else:
truth_valid_data_bxtxd = None
# plotter does averaging
valid_model_values = self.eval_model_runs_batch(data_name,
valid_data_bxtxd,
valid_ext_input_bxtxi,
do_average_batch=False)
example_image = plot_lfads(train_bxtxd=train_data_bxtxd,
train_model_vals=train_model_values,
train_ext_input_bxtxi=train_ext_input_bxtxi,
train_truth_bxtxd=truth_train_data_bxtxd,
valid_bxtxd=valid_data_bxtxd,
valid_model_vals=valid_model_values,
valid_ext_input_bxtxi=valid_ext_input_bxtxi,
valid_truth_bxtxd=truth_valid_data_bxtxd,
bidx=None, cf=cf, output_dist=hps.output_dist)
example_image = np.expand_dims(example_image, axis=0)
example_summ = session.run(self.merged_examples,
feed_dict={self.example_image : example_image})
self.writer.add_summary(example_summ)
def train_model(self, datasets):
"""Train the model, print per-epoch information, and save checkpoints.
Loop over training epochs. The function that actually does the
training is train_epoch. This function iterates over the training
data, one epoch at a time. The learning rate schedule is such
that it will stay the same until the cost goes up in comparison to
the last few values, then it will drop.
Args:
datasets: A dict of data dicts. The dataset dict is simply a
name(string)-> data dictionary mapping (See top of lfads.py).
"""
hps = self.hps
has_any_valid_set = False
for data_dict in datasets.values():
if data_dict['valid_data'] is not None:
has_any_valid_set = True
break
session = tf.get_default_session()
lr = session.run(self.learning_rate)
lr_stop = hps.learning_rate_stop
i = -1
train_costs = []
valid_costs = []
ev_total_cost = ev_recon_cost = ev_kl_cost = 0.0
lowest_ev_cost = np.Inf
while True:
i += 1
      do_save_ckpt = (i % 10 == 0)
tr_total_cost, tr_recon_cost, tr_kl_cost, kl_weight, l2_cost, l2_weight = \
self.train_epoch(datasets, do_save_ckpt=do_save_ckpt)
# Evaluate the validation cost, and potentially save. Note that this
# routine will not save a validation checkpoint until the kl weight and
# l2 weights are equal to 1.0.
if has_any_valid_set:
ev_total_cost, ev_recon_cost, ev_kl_cost = \
self.eval_cost_epoch(datasets, kind='valid')
valid_costs.append(ev_total_cost)
# > 1 may give more consistent results, but not the actual lowest vae.
# == 1 gives the lowest vae seen so far.
n_lve = 1
run_avg_lve = np.mean(valid_costs[-n_lve:])
# conditions for saving checkpoints:
# KL weight must have finished stepping (>=1.0), AND
# L2 weight must have finished stepping OR L2 is not being used, AND
# the current run has a lower LVE than previous runs AND
        #     len(valid_costs) > n_lve, i.e. enough validation epochs have
        #     run to fill the running average.
if kl_weight >= 1.0 and \
(l2_weight >= 1.0 or \
(self.hps.l2_gen_scale == 0.0 and self.hps.l2_con_scale == 0.0)) \
and (len(valid_costs) > n_lve and run_avg_lve < lowest_ev_cost):
lowest_ev_cost = run_avg_lve
checkpoint_path = os.path.join(self.hps.lfads_save_dir,
self.hps.checkpoint_name + '_lve.ckpt')
self.lve_saver.save(session, checkpoint_path,
global_step=self.train_step,
latest_filename='checkpoint_lve')
# Plot and summarize.
values = {'nepochs':i, 'has_any_valid_set': has_any_valid_set,
'tr_total_cost':tr_total_cost, 'ev_total_cost':ev_total_cost,
'tr_recon_cost':tr_recon_cost, 'ev_recon_cost':ev_recon_cost,
'tr_kl_cost':tr_kl_cost, 'ev_kl_cost':ev_kl_cost,
'l2_weight':l2_weight, 'kl_weight':kl_weight,
'l2_cost':l2_cost}
self.summarize_all(datasets, values)
self.plot_single_example(datasets)
# Manage learning rate.
train_res = tr_total_cost
n_lr = hps.learning_rate_n_to_compare
if len(train_costs) > n_lr and train_res > np.max(train_costs[-n_lr:]):
_ = session.run(self.learning_rate_decay_op)
lr = session.run(self.learning_rate)
print(" Decreasing learning rate to %f." % lr)
# Force the system to run n_lr times while at this lr.
train_costs.append(np.inf)
else:
train_costs.append(train_res)
if lr < lr_stop:
print("Stopping optimization based on learning rate criteria.")
break
def eval_cost_epoch(self, datasets, kind='train', ext_input_extxi=None,
batch_size=None):
"""Evaluate the cost of the epoch.
    Args:
      datasets: A dict of data dicts (see top of lfads.py); the training or
        validation data is selected via the `kind` argument.
Returns:
a 3 tuple of costs:
(epoch total cost, epoch reconstruction cost, epoch KL cost)
"""
ops_to_eval = [self.cost, self.recon_cost, self.kl_cost]
collected_op_values = self.run_epoch(datasets, ops_to_eval, kind=kind,
keep_prob=1.0)
total_cost = total_recon_cost = total_kl_cost = 0.0
# normalizing by batch done in distributions.py
epoch_size = len(collected_op_values)
for op_values in collected_op_values:
total_cost += op_values[0]
total_recon_cost += op_values[1]
total_kl_cost += op_values[2]
epoch_total_cost = total_cost / epoch_size
epoch_recon_cost = total_recon_cost / epoch_size
epoch_kl_cost = total_kl_cost / epoch_size
return epoch_total_cost, epoch_recon_cost, epoch_kl_cost
def eval_model_runs_batch(self, data_name, data_bxtxd, ext_input_bxtxi=None,
do_eval_cost=False, do_average_batch=False):
"""Returns all the goodies for the entire model, per batch.
If data_bxtxd and ext_input_bxtxi can have fewer than batch_size along dim 1
in which case this handles the padding and truncating automatically
Args:
data_name: The name of the data dict, to select which in/out matrices
to use.
data_bxtxd: Numpy array training data with shape:
batch_size x # time steps x # dimensions
ext_input_bxtxi: Numpy array training external input with shape:
batch_size x # time steps x # external input dims
do_eval_cost (optional): If true, evaluate the IWAE (Importance Weighted
Autoencoder) log-likelihood bound, instead of the VAE version.
do_average_batch (optional): average over the batch, useful for getting
good IWAE costs, and model outputs for a single data point.
Returns:
A dictionary with the outputs of the model decoder, namely:
prior g0 mean, prior g0 variance, approx. posterior mean, approx.
posterior variance, the generator initial conditions, the control inputs (if
enabled), the state of the generator, the factors, and the rates.
"""
session = tf.get_default_session()
# if fewer than batch_size provided, pad to batch_size
hps = self.hps
batch_size = hps.batch_size
E, _, _ = data_bxtxd.shape
if E < hps.batch_size:
data_bxtxd = np.pad(data_bxtxd, ((0, hps.batch_size-E), (0, 0), (0, 0)),
mode='constant', constant_values=0)
if ext_input_bxtxi is not None:
ext_input_bxtxi = np.pad(ext_input_bxtxi,
((0, hps.batch_size-E), (0, 0), (0, 0)),
mode='constant', constant_values=0)
feed_dict = self.build_feed_dict(data_name, data_bxtxd,
ext_input_bxtxi, keep_prob=1.0)
# Non-temporal signals will be batch x dim.
# Temporal signals are list length T with elements batch x dim.
tf_vals = [self.gen_ics, self.gen_states, self.factors,
self.output_dist_params]
tf_vals.append(self.cost)
tf_vals.append(self.nll_bound_vae)
tf_vals.append(self.nll_bound_iwae)
tf_vals.append(self.train_step) # not train_op!
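# self.train_step is the global step counter; fetching it in session.run only
# reads its current value (unlike self.train_op, which would apply gradients).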
if self.hps.ic_dim > 0:
tf_vals += [self.prior_zs_g0.mean, self.prior_zs_g0.logvar,
self.posterior_zs_g0.mean, self.posterior_zs_g0.logvar]
if self.hps.co_dim > 0:
tf_vals.append(self.controller_outputs)
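# flatten() returns a flat list of tensors plus index lists (fidxs); below,
# fidxs[ff] maps the ff-th entry of tf_vals back to its positions in the flat
# results, so that temporal lists can be reassembled after session.run.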
tf_vals_flat, fidxs = flatten(tf_vals)
np_vals_flat = session.run(tf_vals_flat, feed_dict=feed_dict)
ff = 0
gen_ics = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1
gen_states = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1
factors = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1
out_dist_params = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1
costs = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1
nll_bound_vaes = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1
nll_bound_iwaes = [np_vals_flat[f] for f in fidxs[ff]]; ff +=1
train_steps = [np_vals_flat[f] for f in fidxs[ff]]; ff +=1
if self.hps.ic_dim > 0:
prior_g0_mean = [np_vals_flat[f] for f in fidxs[ff]]; ff +=1
prior_g0_logvar = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1
post_g0_mean = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1
post_g0_logvar = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1
if self.hps.co_dim > 0:
controller_outputs = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1
# Index [0] pulls each non-temporal item out of its one-element list.
gen_ics = gen_ics[0]
costs = costs[0]
nll_bound_vaes = nll_bound_vaes[0]
nll_bound_iwaes = nll_bound_iwaes[0]
train_steps = train_steps[0]
# Convert to full tensors, not lists of tensors in time dim.
gen_states = list_t_bxn_to_tensor_bxtxn(gen_states)
factors = list_t_bxn_to_tensor_bxtxn(factors)
out_dist_params = list_t_bxn_to_tensor_bxtxn(out_dist_params)
if self.hps.ic_dim > 0:
# select first time point
prior_g0_mean = prior_g0_mean[0]
prior_g0_logvar = prior_g0_logvar[0]
post_g0_mean = post_g0_mean[0]
post_g0_logvar = post_g0_logvar[0]
if self.hps.co_dim > 0:
controller_outputs = list_t_bxn_to_tensor_bxtxn(controller_outputs)
# slice out the trials in case < batch_size provided
if E < hps.batch_size:
idx = np.arange(E)
gen_ics = gen_ics[idx, :]
gen_states = gen_states[idx, :]
factors = factors[idx, :, :]
out_dist_params = out_dist_params[idx, :, :]
if self.hps.ic_dim > 0:
prior_g0_mean = prior_g0_mean[idx, :]
prior_g0_logvar = prior_g0_logvar[idx, :]
post_g0_mean = post_g0_mean[idx, :]
post_g0_logvar = post_g0_logvar[idx, :]
if self.hps.co_dim > 0:
controller_outputs = controller_outputs[idx, :, :]
if do_average_batch:
gen_ics = np.mean(gen_ics, axis=0)
gen_states = np.mean(gen_states, axis=0)
factors = np.mean(factors, axis=0)
out_dist_params = np.mean(out_dist_params, axis=0)
if self.hps.ic_dim > 0:
prior_g0_mean = np.mean(prior_g0_mean, axis=0)
prior_g0_logvar = np.mean(prior_g0_logvar, axis=0)
post_g0_mean = np.mean(post_g0_mean, axis=0)
post_g0_logvar = np.mean(post_g0_logvar, axis=0)
if self.hps.co_dim > 0:
controller_outputs = np.mean(controller_outputs, axis=0)
model_vals = {}
model_vals['gen_ics'] = gen_ics
model_vals['gen_states'] = gen_states
model_vals['factors'] = factors
model_vals['output_dist_params'] = out_dist_params
model_vals['costs'] = costs
model_vals['nll_bound_vaes'] = nll_bound_vaes
model_vals['nll_bound_iwaes'] = nll_bound_iwaes
model_vals['train_steps'] = train_steps
if self.hps.ic_dim > 0:
model_vals['prior_g0_mean'] = prior_g0_mean
model_vals['prior_g0_logvar'] = prior_g0_logvar
model_vals['post_g0_mean'] = post_g0_mean
model_vals['post_g0_logvar'] = post_g0_logvar
if self.hps.co_dim > 0:
model_vals['controller_outputs'] = controller_outputs
return model_vals
def eval_model_runs_avg_epoch(self, data_name, data_extxd,
ext_input_extxi=None):
"""Returns all the expected value for goodies for the entire model.
The expected value is taken over hidden (z) variables, namely the initial
conditions and the control inputs. The expected value is approximate, and
accomplished via sampling (batch_size) samples for every examples.
Args:
data_name: The name of the data dict, to select which in/out matrices
to use.
data_extxd: Numpy array training data with shape:
# examples x # time steps x # dimensions
ext_input_extxi (optional): Numpy array training external input with
shape: # examples x # time steps x # external input dims
Returns:
A dictionary with the averaged outputs of the model decoder, namely:
prior g0 mean, prior g0 variance, approx. posterior mean, approx.
posterior variance, the generator initial conditions, the control inputs (if
enabled), the state of the generator, the factors, and the output
distribution parameters, e.g. (rates or mean and variances).
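Example (illustrative sketch; model, data_extxd, and the dataset name
'dataset0' are hypothetical):
model_runs = model.eval_model_runs_avg_epoch('dataset0', data_extxd)
rates = model_runs['output_dist_params']  # E x T x D (rates, Poisson case)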
"""
hps = self.hps
batch_size = hps.batch_size
E, T, D = data_extxd.shape
E_to_process = hps.ps_nexamples_to_process
if E_to_process > E:
E_to_process = E
if hps.ic_dim > 0:
prior_g0_mean = np.zeros([E_to_process, hps.ic_dim])
prior_g0_logvar = np.zeros([E_to_process, hps.ic_dim])
post_g0_mean = np.zeros([E_to_process, hps.ic_dim])
post_g0_logvar = np.zeros([E_to_process, hps.ic_dim])
if hps.co_dim > 0:
controller_outputs = np.zeros([E_to_process, T, hps.co_dim])
gen_ics = np.zeros([E_to_process, hps.gen_dim])
gen_states = np.zeros([E_to_process, T, hps.gen_dim])
factors = np.zeros([E_to_process, T, hps.factors_dim])
if hps.output_dist == 'poisson':
out_dist_params = np.zeros([E_to_process, T, D])
elif hps.output_dist == 'gaussian':
out_dist_params = np.zeros([E_to_process, T, D+D])
else:
assert False, "Output distribution not implemented: %s" % hps.output_dist
costs = np.zeros(E_to_process)
nll_bound_vaes = np.zeros(E_to_process)
nll_bound_iwaes = np.zeros(E_to_process)
train_steps = np.zeros(E_to_process)
for es_idx in range(E_to_process):
print("Running %d of %d." % (es_idx+1, E_to_process))
example_idxs = es_idx * np.ones(batch_size, dtype=np.int32)
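# The same trial index is repeated batch_size times, so this batch holds
# batch_size posterior samples of one example; with do_average_batch=True the
# batch mean approximates the posterior expectation for that example.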
data_bxtxd, ext_input_bxtxi = self.get_batch(data_extxd,
ext_input_extxi,
batch_size=batch_size,
example_idxs=example_idxs)
model_values = self.eval_model_runs_batch(data_name, data_bxtxd,
ext_input_bxtxi,
do_eval_cost=True,
do_average_batch=True)
if self.hps.ic_dim > 0:
prior_g0_mean[es_idx,:] = model_values['prior_g0_mean']
prior_g0_logvar[es_idx,:] = model_values['prior_g0_logvar']
post_g0_mean[es_idx,:] = model_values['post_g0_mean']
post_g0_logvar[es_idx,:] = model_values['post_g0_logvar']
gen_ics[es_idx,:] = model_values['gen_ics']
if self.hps.co_dim > 0:
controller_outputs[es_idx,:,:] = model_values['controller_outputs']
gen_states[es_idx,:,:] = model_values['gen_states']
factors[es_idx,:,:] = model_values['factors']
out_dist_params[es_idx,:,:] = model_values['output_dist_params']
costs[es_idx] = model_values['costs']
nll_bound_vaes[es_idx] = model_values['nll_bound_vaes']
nll_bound_iwaes[es_idx] = model_values['nll_bound_iwaes']
train_steps[es_idx] = model_values['train_steps']
print('bound nll(vae): %.3f, bound nll(iwae): %.3f' \
% (nll_bound_vaes[es_idx], nll_bound_iwaes[es_idx]))
model_runs = {}
if self.hps.ic_dim > 0:
model_runs['prior_g0_mean'] = prior_g0_mean
model_runs['prior_g0_logvar'] = prior_g0_logvar
model_runs['post_g0_mean'] = post_g0_mean
model_runs['post_g0_logvar'] = post_g0_logvar
model_runs['gen_ics'] = gen_ics
if self.hps.co_dim > 0:
model_runs['controller_outputs'] = controller_outputs
model_runs['gen_states'] = gen_states
model_runs['factors'] = factors
model_runs['output_dist_params'] = out_dist_params
model_runs['costs'] = costs
model_runs['nll_bound_vaes'] = nll_bound_vaes
model_runs['nll_bound_iwaes'] = nll_bound_iwaes
model_runs['train_steps'] = train_steps
return model_runs
def eval_model_runs_push_mean(self, data_name, data_extxd,
ext_input_extxi=None):
"""Returns values of interest for the model by pushing the means through
The mean values for both initial conditions and the control inputs are
pushed through the model instead of sampling (as is done in
eval_model_runs_avg_epoch).
This is a quick, approximate alternative to sampling from the posterior many
times and then averaging the values of interest across samples.
Internally, a total of batch_size trials are run through the model at once.
Args:
data_name: The name of the data dict, to select which in/out matrices
to use.
data_extxd: Numpy array training data with shape:
# examples x # time steps x # dimensions
ext_input_extxi (optional): Numpy array training external input with
shape: # examples x # time steps x # external input dims
Returns:
A dictionary with the estimated outputs of the model decoder, namely:
prior g0 mean, prior g0 variance, approx. posterior mean, approx.
posterior variance, the generator initial conditions, the control inputs (if
enabled), the state of the generator, the factors, and the output
distribution parameters, e.g. (rates or mean and variances).
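Example (illustrative; model, data_extxd and 'dataset0' are hypothetical):
model_runs = model.eval_model_runs_push_mean('dataset0', data_extxd)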
"""
hps = self.hps
batch_size = hps.batch_size
E, T, D = data_extxd.shape
E_to_process = hps.ps_nexamples_to_process
if E_to_process > E:
print("Setting number of posterior samples to process to : ", E)
E_to_process = E
if hps.ic_dim > 0:
prior_g0_mean = np.zeros([E_to_process, hps.ic_dim])
prior_g0_logvar = np.zeros([E_to_process, hps.ic_dim])
post_g0_mean = np.zeros([E_to_process, hps.ic_dim])
post_g0_logvar = np.zeros([E_to_process, hps.ic_dim])
if hps.co_dim > 0:
controller_outputs = np.zeros([E_to_process, T, hps.co_dim])
gen_ics = np.zeros([E_to_process, hps.gen_dim])
gen_states = np.zeros([E_to_process, T, hps.gen_dim])
factors = np.zeros([E_to_process, T, hps.factors_dim])
if hps.output_dist == 'poisson':
out_dist_params = np.zeros([E_to_process, T, D])
elif hps.output_dist == 'gaussian':
out_dist_params = np.zeros([E_to_process, T, D+D])
else:
assert False, "Output distribution not implemented: %s" % hps.output_dist
costs = np.zeros(E_to_process)
nll_bound_vaes = np.zeros(E_to_process)
nll_bound_iwaes = np.zeros(E_to_process)
train_steps = np.zeros(E_to_process)
# A generator that yields indices 0..N-1 in groups of per items, e.g.
# (0..per-1), (per..2*per-1), ..., with the last group containing <= per items.
# It is used to feed per=batch_size trials into the model at a time.
def trial_batches(N, per):
for i in range(0, N, per):
yield np.arange(i, min(i+per, N), dtype=np.int32)
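# e.g. trial_batches(10, 4) yields [0 1 2 3], [4 5 6 7], [8 9].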
for batch_idx, es_idx in enumerate(trial_batches(E_to_process,
hps.batch_size)):
print("Running trial batch %d with %d trials" % (batch_idx+1,
len(es_idx)))
data_bxtxd, ext_input_bxtxi = self.get_batch(data_extxd,
ext_input_extxi,
batch_size=batch_size,
example_idxs=es_idx)
model_values = self.eval_model_runs_batch(data_name, data_bxtxd,
ext_input_bxtxi,
do_eval_cost=True,
do_average_batch=False)
if self.hps.ic_dim > 0:
prior_g0_mean[es_idx,:] = model_values['prior_g0_mean']
prior_g0_logvar[es_idx,:] = model_values['prior_g0_logvar']
post_g0_mean[es_idx,:] = model_values['post_g0_mean']
post_g0_logvar[es_idx,:] = model_values['post_g0_logvar']
gen_ics[es_idx,:] = model_values['gen_ics']
if self.hps.co_dim > 0:
controller_outputs[es_idx,:,:] = model_values['controller_outputs']
gen_states[es_idx,:,:] = model_values['gen_states']
factors[es_idx,:,:] = model_values['factors']
out_dist_params[es_idx,:,:] = model_values['output_dist_params']
# TODO: model_values['costs'] and the other cost values come back as scalars
# summed over all trials in the batch; what we actually want here is the
# per-trial costs.
costs[es_idx] = model_values['costs']
nll_bound_vaes[es_idx] = model_values['nll_bound_vaes']
nll_bound_iwaes[es_idx] = model_values['nll_bound_iwaes']
train_steps[es_idx] = model_values['train_steps']
model_runs = {}
if self.hps.ic_dim > 0:
model_runs['prior_g0_mean'] = prior_g0_mean
model_runs['prior_g0_logvar'] = prior_g0_logvar
model_runs['post_g0_mean'] = post_g0_mean
model_runs['post_g0_logvar'] = post_g0_logvar
model_runs['gen_ics'] = gen_ics
if self.hps.co_dim > 0:
model_runs['controller_outputs'] = controller_outputs
model_runs['gen_states'] = gen_states
model_runs['factors'] = factors
model_runs['output_dist_params'] = out_dist_params
# You probably do not want the log-likelihood-associated values when pushing
# the mean instead of sampling.
model_runs['costs'] = costs
model_runs['nll_bound_vaes'] = nll_bound_vaes
model_runs['nll_bound_iwaes'] = nll_bound_iwaes
model_runs['train_steps'] = train_steps
return model_runs
def write_model_runs(self, datasets, output_fname=None, push_mean=False):
"""Run the model on the data in data_dict, and save the computed values.
LFADS generates a number of outputs for each example, and these are all
saved. They are:
The mean and variance of the prior of g0.
The mean and variance of approximate posterior of g0.
The control inputs (if enabled)
The initial conditions, g0, for all examples.
The generator states for all time.
The factors for all time.
The output distribution parameters (e.g. rates) for all time.
Args:
datasets: a dictionary of named data_dictionaries, see top of lfads.py
output_fname: a file name stem for the output files.
push_mean: if False (default), generates batch_size samples for each trial
and averages the results. if True, runs each trial once without noise,
pushing the posterior mean initial conditions and control inputs through
the trained model. False is used for posterior_sample_and_average, True
is used for posterior_push_mean.
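Example (illustrative; model and datasets are hypothetical):
model.write_model_runs(datasets, push_mean=False)  # posterior sample & average
model.write_model_runs(datasets, push_mean=True)   # posterior push-mean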
"""
hps = self.hps
kind = hps.kind
for data_name, data_dict in datasets.items():
data_tuple = [('train', data_dict['train_data'],
data_dict['train_ext_input']),
('valid', data_dict['valid_data'],
data_dict['valid_ext_input'])]
for data_kind, data_extxd, ext_input_extxi in data_tuple:
if not output_fname:
fname = "model_runs_" + data_name + '_' + data_kind + '_' + kind
else:
fname = output_fname + data_name + '_' + data_kind + '_' + kind
print("Writing data for %s data and kind %s." % (data_name, data_kind))
if push_mean:
model_runs = self.eval_model_runs_push_mean(data_name, data_extxd,
ext_input_extxi)
else:
model_runs = self.eval_model_runs_avg_epoch(data_name, data_extxd,
ext_input_extxi)
full_fname = os.path.join(hps.lfads_save_dir, fname)
write_data(full_fname, model_runs, compression='gzip')
print("Done.")
def write_model_samples(self, dataset_name, output_fname=None):
"""Use the prior distribution to generate batch_size number of samples
from the model.
LFADS generates a number of outputs for each sample, and these are all
saved. They are:
The mean and variance of the prior of g0.
The control inputs (if enabled)
The initial conditions, g0, for all examples.
The generator states for all time.
The factors for all time.
The output distribution parameters (e.g. rates) for all time.
Args:
dataset_name: The name of the dataset to grab the factors -> rates
alignment matrices from.
output_fname: The name of the file in which to save the generated
samples.
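Example (illustrative; the dataset and file names are hypothetical):
model.write_model_samples('dataset0', output_fname='model_samples_dataset0')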
"""
hps = self.hps
batch_size = hps.batch_size
print("Generating %d samples" % (batch_size))
tf_vals = [self.factors, self.gen_states, self.gen_ics,
self.cost, self.output_dist_params]
if hps.ic_dim > 0:
tf_vals += [self.prior_zs_g0.mean, self.prior_zs_g0.logvar]
if hps.co_dim > 0:
tf_vals += [self.prior_zs_ar_con.samples_t]
tf_vals_flat, fidxs = flatten(tf_vals)
session = tf.get_default_session()
feed_dict = {}
feed_dict[self.dataName] = dataset_name
feed_dict[self.keep_prob] = 1.0
np_vals_flat = session.run(tf_vals_flat, feed_dict=feed_dict)
ff = 0
factors = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1
gen_states = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1
gen_ics = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1
costs = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1
output_dist_params = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1
if hps.ic_dim > 0:
prior_g0_mean = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1
prior_g0_logvar = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1
if hps.co_dim > 0:
prior_zs_ar_con = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1
# Index [0] pulls each non-temporal item out of its one-element list.
gen_ics = gen_ics[0]
costs = costs[0]
# Convert to full tensors, not lists of tensors in time dim.
gen_states = list_t_bxn_to_tensor_bxtxn(gen_states)
factors = list_t_bxn_to_tensor_bxtxn(factors)
output_dist_params = list_t_bxn_to_tensor_bxtxn(output_dist_params)
if hps.ic_dim > 0:
prior_g0_mean = prior_g0_mean[0]
prior_g0_logvar = prior_g0_logvar[0]
if hps.co_dim > 0:
prior_zs_ar_con = list_t_bxn_to_tensor_bxtxn(prior_zs_ar_con)
model_vals = {}
model_vals['gen_ics'] = gen_ics
model_vals['gen_states'] = gen_states
model_vals['factors'] = factors
model_vals['output_dist_params'] = output_dist_params
model_vals['costs'] = costs.reshape(1)
if hps.ic_dim > 0:
model_vals['prior_g0_mean'] = prior_g0_mean
model_vals['prior_g0_logvar'] = prior_g0_logvar
if hps.co_dim > 0:
model_vals['prior_zs_ar_con'] = prior_zs_ar_con
full_fname = os.path.join(hps.lfads_save_dir, output_fname)
write_data(full_fname, model_vals, compression='gzip')
print("Done.")
@staticmethod
def eval_model_parameters(use_nested=True, include_strs=None):
"""Evaluate and return all of the TF variables in the model.
Args:
use_nested (optional): If true, return the values in a nested dictionary
keyed by variable scope; otherwise return all variables in a flat dictionary.
include_strs (optional): A list of strings to use as a filter, to reduce the
number of variables returned. A variable name must contain at least one
string in include_strs as a sub-string in order to be returned.
Returns:
The parameters of the model. This can be in a flat
dictionary, or a nested dictionary, where the nesting is by variable
scope.
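Example (illustrative; assumes a default session holding the trained graph;
model is a hypothetical instance and 'gen' a hypothetical substring filter):
params = model.eval_model_parameters(use_nested=True, include_strs=['gen'])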
"""
all_tf_vars = tf.global_variables()
session = tf.get_default_session()
all_tf_vars_eval = session.run(all_tf_vars)
vars_dict = {}
strs = ["LFADS"]
if include_strs:
strs += include_strs
for i, (var, var_eval) in enumerate(zip(all_tf_vars, all_tf_vars_eval)):
if any(s in var.name for s in strs):
if not isinstance(var_eval, np.ndarray): # for H5PY
print(var.name, "is not a numpy array; saving as a numpy array with value:",
var_eval, type(var_eval))
e = np.array(var_eval)
print(e, type(e))
else:
e = var_eval
vars_dict[var.name] = e
if not use_nested:
return vars_dict
var_names = vars_dict.keys()
nested_vars_dict = {}
current_dict = nested_vars_dict
for v, var_name in enumerate(var_names):
var_split_name_list = var_name.split('/')
split_name_list_len = len(var_split_name_list)
current_dict = nested_vars_dict
for p, part in enumerate(var_split_name_list):
if p < split_name_list_len - 1:
if part in current_dict:
current_dict = current_dict[part]
else:
current_dict[part] = {}
current_dict = current_dict[part]
else:
current_dict[part] = vars_dict[var_name]
return nested_vars_dict
@staticmethod
def spikify_rates(rates_bxtxd):
"""Randomly spikify underlying rates according a Poisson distribution
Args:
rates_bxtxd: a numpy tensor with shape:
Returns:
A numpy array with the same shape as rates_bxtxd, but with the event
counts.
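Example (illustrative; the rates array here is made up and model is a
hypothetical instance of this class):
rates_bxtxd = np.full((2, 100, 5), 0.3)          # 2 trials, 100 steps, 5 units
spikes_bxtxd = model.spikify_rates(rates_bxtxd)  # integer counts, same shape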
"""
B,T,N = rates_bxtxd.shape
assert all([B > 0, N > 0]), "rates_bxtxd must have nonzero trial and neuron dimensions"
# The rate differs at every (trial, time, neuron) entry, so draw each count
# individually in nested loops.
spikes_bxtxd = np.zeros([B,T,N], dtype=np.int32)
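# Note: a vectorized alternative would be np.random.poisson(rates_bxtxd),
# since np.random.poisson accepts an array of rates; the explicit loops below
# keep the per-element draw easy to follow.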
for b in range(B):
for t in range(T):
for n in range(N):
rate = rates_bxtxd[b,t,n]
count = np.random.poisson(rate)
spikes_bxtxd[b,t,n] = count
return spikes_bxtxd