|
from tensorflow_addons.utils import types |
|
from typeguard import typechecked |
|
import tensorflow as tf |
|
import numpy as np |
|
import pickle |
|
|
|
def splitListIntoChunks(data, numChunks):
    """Split `data` into `numChunks` contiguous chunks.

    The first numChunks - 1 chunks each hold len(data) // numChunks items;
    the final chunk absorbs any remainder (so it may be larger or smaller).

    Args:
        data: sliceable sequence (e.g. list) to split.
        numChunks: int > 0, number of chunks to produce.

    Returns:
        List of numChunks slices of `data`, in order.
    """
    chunkSize = int(len(data) / numChunks)
    chunks = []
    # BUGFIX: with numChunks == 1 the loop below never runs and `end` was
    # previously unbound, raising UnboundLocalError at the final append.
    end = 0
    for i in range(numChunks - 1):
        start, end = i * chunkSize, (i + 1) * chunkSize
        chunks.append(data[start:end])

    # Last chunk takes everything that remains (handles uneven division).
    chunks.append(data[end:])
    return chunks
|
|
|
|
|
def splitIntoValueChunks(data, numChunks, getValueFunc):
    """Bucket `data` into `numChunks` equal-width value ranges.

    The span [min, max] of getValueFunc over `data` is divided into
    numChunks equal intervals; each item lands in the bucket covering
    its value. NOTE: sorts `data` in place as a side effect; empty
    buckets are kept so bucket index tracks the value interval.

    Args:
        data: list of items; will be sorted in place by value.
        numChunks: int > 0, number of value intervals.
        getValueFunc: callable mapping an item to a numeric value.

    Returns:
        List of buckets (lists of items), ordered by value interval.
    """
    extracted = [getValueFunc(item) for item in data]
    lowest, highest = np.min(extracted), np.max(extracted)
    width = (highest - lowest) / float(numChunks)

    data.sort(key=lambda item: getValueFunc(item))

    ceiling = lowest + width
    buckets = [[]]
    bucketIdx = 0
    for item in data:
        value = getValueFunc(item)
        # Walk the ceiling forward, opening (possibly empty) buckets,
        # until this item's value fits under it.
        while value > ceiling:
            buckets.append([])
            ceiling += width
            bucketIdx += 1
        buckets[bucketIdx].append(item)

    return buckets
|
|
|
|
|
def startGraphLogging():
    """Start a TF graph/profiler trace and return (writer, logdir).

    Pair with finishGraphLogging(writer, logdir) to export the trace.
    """
    from datetime import datetime

    stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    logdir = 'logs/func/%s' % stamp
    summary_writer = tf.summary.create_file_writer(logdir)
    # NOTE(review): `profiler=True` is deprecated in newer TF2 releases
    # (replaced by tf.profiler.experimental) — confirm against the TF
    # version this project pins.
    tf.summary.trace_on(graph=True, profiler=True)
    return summary_writer, logdir
|
|
|
|
|
def finishGraphLogging(writer, logdir):
    """Export the trace started by startGraphLogging under `logdir`."""
    with writer.as_default():
        # step=0: single export, no per-step series.
        tf.summary.trace_export(name="my_func_trace", step=0, profiler_outdir=logdir)
|
|
|
|
|
class CustomSaveCallBack(tf.keras.callbacks.Callback):
    """Keras callback that saves model weights at a fixed epoch interval.

    Saving starts once `firstSavePoint` epochs have completed, then
    repeats every `saveInterval` epochs after that.
    """

    def __init__(self, saveName, saveInterval=10, firstSavePoint=-1):
        super().__init__()
        # Template passed to str.format with the 1-based epoch number.
        self.saveName = saveName
        self.saveInterval = saveInterval
        # Negative firstSavePoint means "start after one full interval".
        if firstSavePoint < 0:
            self.firstSavePoint = saveInterval
        else:
            self.firstSavePoint = firstSavePoint
        self.saveCounter = 0

    def on_epoch_end(self, epoch, logs=None):
        completed = epoch + 1
        if completed >= self.firstSavePoint:
            # Counter only advances once saving is eligible, so the first
            # eligible epoch always saves (0 % saveInterval == 0).
            if self.saveCounter % self.saveInterval == 0:
                print("Saving model!")
                self.model.save_weights(self.saveName.format(completed))
            self.saveCounter += 1
|
|
|
|
|
def saveTokenizer(base='gpt2', dumpPath='GPT2-Tokenizer.pkl'):
    """Download a pretrained HuggingFace tokenizer and pickle it to disk.

    Args:
        base: model id passed to AutoTokenizer.from_pretrained.
        dumpPath: destination pickle file path.
    """
    import transformers

    pretrained = transformers.AutoTokenizer.from_pretrained(base)
    with open(dumpPath, 'wb') as fp:
        pickle.dump(pretrained, fp)
|
|
|
|
|
def loadTokenizer(dumpPath='GPT2-Tokenizer.pkl'):
    """Load and return the tokenizer pickled by saveTokenizer.

    SECURITY: pickle.load can execute arbitrary code; only load files
    this project itself produced.
    """
    with open(dumpPath, 'rb') as fp:
        tokenizer = pickle.load(fp)
    return tokenizer
|
|
|
|
|
class GradientAccumulator(tf.keras.optimizers.Optimizer):
    """Optimizer wrapper for gradient accumulation.

    Sums incoming gradients into per-variable "ga" slots and only lets the
    wrapped optimizer apply them every `accum_steps` calls, emulating a
    larger effective batch size.
    """

    @typechecked
    def __init__(
        self,
        inner_optimizer: types.Optimizer,
        accum_steps: types.TensorLike = 4,
        name: str = "GradientAccumulator",
        **kwargs,
    ):
        r"""Construct a new GradientAccumulator optimizer.

        Args:
            inner_optimizer: str or `tf.keras.optimizers.Optimizer` that will be
                used to compute and apply gradients.
            accum_steps: int > 0. Update gradient in every accumulation steps.
            name: Optional name for the operations created when applying
                gradients. Defaults to "GradientAccumulator".
            **kwargs: keyword arguments. Allowed to be {`clipnorm`,
                `clipvalue`, `lr`, `decay`}. `clipnorm` is clip gradients by
                norm; `clipvalue` is clip gradients by value, `decay` is
                included for backward compatibility to allow time inverse
                decay of learning rate. `lr` is included for backward
                compatibility, recommended to use `learning_rate` instead.
        """
        super().__init__(name, **kwargs)
        # Accepts an Optimizer instance or a string identifier ("adam", ...).
        self._optimizer = tf.keras.optimizers.get(inner_optimizer)
        self._gradients = []  # filled with "ga" slot variables in _create_slots
        self._accum_steps = accum_steps
        self._step = None  # created lazily by the `step` property
        self._iterations = self._optimizer.iterations

    def _create_slots(self, var_list):
        # Let the wrapped optimizer create its own slots first, then add one
        # accumulator ("ga") slot per variable to hold summed gradients.
        self._optimizer._create_slots(var_list=var_list)
        for var in var_list:
            self.add_slot(var, "ga")

        self._gradients = [self.get_slot(var, "ga") for var in var_list]

    @property
    def step(self):
        """Variable. The number of training steps this Optimizer has run."""
        if self._step is None:
            with self._distribution_strategy_scope():
                # initializer="ones": the first apply_gradients call sees
                # step == 1, so `step % accum_steps == 0` first fires after
                # `accum_steps` calls.
                self._step = self.add_weight(
                    "iter",
                    shape=[],
                    initializer="ones",
                    dtype=tf.int64,
                    trainable=False,
                    aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
                )
            self._weights.append(self._step)
        return self._step

    @step.setter
    def step(self, variable):
        # Only allowed before first use (e.g. when restoring from checkpoint).
        if self._step is not None:
            raise RuntimeError(
                "Cannot set `step` to a new Variable after "
                "the Optimizer weights have been created"
            )
        self._step = variable
        self._weights.append(self._step)

    @property
    def gradients(self):
        """The accumulated gradients on the current replica."""
        if not self._gradients:
            raise ValueError(
                "The accumulator should be called first to initialize the gradients"
            )
        return list(
            gradient.read_value() if gradient is not None else gradient
            for gradient in self._gradients
        )

    def apply_gradients(self, grads_and_vars, name=None, **kwargs):
        # Run the standard apply path (which routes through our
        # _resource_apply_dense/_resource_apply_sparse below), then advance
        # the inner optimizer's iteration counter only on boundary steps
        # (where gradients were actually applied), and finally bump our own
        # step counter. The control_dependencies enforce that ordering.
        train_op = super().apply_gradients(grads_and_vars, name, **kwargs)
        with tf.control_dependencies([train_op]):
            with tf.control_dependencies(
                [
                    self._optimizer.iterations.assign_add(
                        tf.cast(
                            tf.where(self.step % self._accum_steps == 0, 1, 0), tf.int64
                        ),
                        read_value=False,
                    )
                ]
            ):
                return self.step.assign_add(1, read_value=False)

    def _resource_apply_dense(self, grad, var, apply_state=None):
        # Add this step's gradient into the accumulator slot, then delegate
        # to _apply_grad which applies either the accumulated total or zeros.
        accum_gradient = self.get_slot(var, "ga")
        if accum_gradient is not None and grad is not None:
            accum_gradient.assign_add(
                grad, use_locking=self._use_locking, read_value=False
            )

        return self._apply_grad(accum_gradient, var, apply_state)

    def _resource_apply_sparse(self, grad: types.TensorLike, var, indices, apply_state):
        # Sparse counterpart: scatter-add into the accumulator slot.
        # NOTE(review): _apply_grad forwards to the inner optimizer's *dense*
        # apply path even for sparse gradients — confirm this is intended.
        accum_gradient = self.get_slot(var, "ga")
        if accum_gradient is not None and grad is not None:
            self._resource_scatter_add(accum_gradient, indices, grad)

        return self._apply_grad(accum_gradient, var, apply_state)

    def _apply_grad(self, accum_gradient, var, apply_state):
        # On accumulation boundaries (step % accum_steps == 0) apply the full
        # accumulated gradient; otherwise apply zeros so the inner optimizer's
        # bookkeeping still runs without moving the variable.
        grad = tf.where(
            self.step % self._accum_steps == 0,
            accum_gradient,
            tf.zeros_like(var),
        )
        # Private TF API: some inner optimizers accept `apply_state`, some
        # do not — dispatch accordingly.
        if "apply_state" in self._optimizer._dense_apply_args:
            train_op = self._optimizer._resource_apply_dense(
                grad,
                var,
                apply_state=apply_state,
            )
        else:
            train_op = self._optimizer._resource_apply_dense(grad, var)
        # Zero the accumulator only when it was just applied (grad equals
        # accum_gradient on boundary steps); otherwise keep accumulating.
        reset_val = tf.where(
            grad == accum_gradient, tf.zeros_like(accum_gradient), accum_gradient
        )
        reset_op = accum_gradient.assign(
            reset_val,
            use_locking=self._use_locking,
            read_value=False,
        )

        return tf.group(train_op, reset_op)

    def reset(self):
        """Resets the accumulated gradients on the current replica."""
        assign_ops = []
        if not self._gradients:
            # Nothing accumulated yet: return the empty list unchanged.
            return assign_ops

        for gradient in self._gradients:
            if gradient is not None:
                assign_ops.append(
                    gradient.assign(
                        tf.zeros_like(gradient),
                        use_locking=self._use_locking,
                        read_value=False,
                    )
                )

        return tf.group(assign_ops)

    @property
    def inner_optimizer(self):
        """The optimizer that this GradientAccumulator is wrapping."""
        return self._optimizer

    @property
    def iterations(self):
        # Mirror the wrapped optimizer's iteration counter.
        return self._optimizer.iterations

    @iterations.setter
    def iterations(self, variable):
        self._optimizer.iterations = variable

    @property
    def lr(self):
        # Legacy alias for `learning_rate` (kept for backward compatibility).
        return self._optimizer._get_hyper("learning_rate")

    @lr.setter
    def lr(self, lr):
        self._optimizer._set_hyper("learning_rate", lr)

    @property
    def learning_rate(self):
        # Delegates to the wrapped optimizer's hyperparameter.
        return self._optimizer._get_hyper("learning_rate")

    @learning_rate.setter
    def learning_rate(self, learning_rate):
        self._optimizer._set_hyper("learning_rate", learning_rate)

    def get_config(self):
        # Serialize the wrapped optimizer alongside our own settings so
        # from_config can fully reconstruct the wrapper.
        config = {
            "accum_steps": self._accum_steps,
            "optimizer": tf.keras.optimizers.serialize(self._optimizer),
        }
        base_config = super().get_config()
        return {**base_config, **config}

    @classmethod
    def from_config(cls, config, custom_objects=None):
        # Rebuild the wrapped optimizer first, then forward remaining
        # config entries (accum_steps, name, clip settings) to __init__.
        optimizer = tf.keras.optimizers.deserialize(
            config.pop("optimizer"), custom_objects=custom_objects
        )
        return cls(optimizer, **config)
|
|
|
|