import math
import argparse
import random
import torch
from torch import nn
from torch.optim.lr_scheduler import LambdaLR
# copied from huggingface
def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=0.5, last_epoch=-1):
""" Create a schedule with a learning rate that decreases following the
values of the cosine function between 0 and `pi * cycles` after a warmup
period during which it increases linearly between 0 and 1.
"""
def lr_lambda(current_step):
if current_step < num_warmup_steps:
return float(current_step) / float(max(1, num_warmup_steps))
progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
return LambdaLR(optimizer, lr_lambda, last_epoch)
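# Example usage (sketch, not from this file): `model`, the base learning rate and the
# step counts below are placeholder values.
#   optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
#   scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=10_000)
#   for step in range(10_000):
#       ...  # forward/backward pass and optimizer.step()
#       scheduler.step()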
# copied from huggingface
def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
"""
Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after
a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
Args:
optimizer (:class:`~torch.optim.Optimizer`):
The optimizer for which to schedule the learning rate.
num_warmup_steps (:obj:`int`):
The number of steps for the warmup phase.
num_training_steps (:obj:`int`):
The total number of training steps.
last_epoch (:obj:`int`, `optional`, defaults to -1):
The index of the last epoch when resuming training.
Return:
:obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
"""
def lr_lambda(current_step: int):
if current_step < num_warmup_steps:
return float(current_step) / float(max(1, num_warmup_steps))
return max(
0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))
)
return LambdaLR(optimizer, lr_lambda, last_epoch)
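# The linear variant is used in exactly the same way as the cosine schedule above,
# e.g. (placeholder values):
#   scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=10_000)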
def get_openai_lr(transformer_model):
    num_params = sum(p.numel() for p in transformer_model.parameters())
    return 0.003239 - 0.0001395 * math.log(num_params)
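# This appears to be a learning-rate-vs.-parameter-count heuristic: larger models get a
# smaller learning rate. Rough usage sketch (the model below is an arbitrary example,
# not one from this repository):
#   model = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=512, nhead=8), num_layers=6)
#   lr = get_openai_lr(model)  # ~9e-4 for roughly 19M parameters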
def get_weighted_single_eval_pos_sampler(max_len):
"""
This gives a sampler that can be used for `single_eval_pos` which yields good performance for all positions p,
where p <= `max_len`. At most `max_len` - 1 examples are shown to the Transformer.
:return: Sampler that can be fed to `train()` as `single_eval_pos_gen`.
"""
return lambda: random.choices(range(max_len), [1 / (max_len - i) for i in range(max_len)])[0]
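# Sketch of how this behaves (max_len=10 is an arbitrary choice): position i is drawn
# with weight 1 / (max_len - i), so later positions (larger training sets) are sampled
# more often.
#   sampler = get_weighted_single_eval_pos_sampler(10)
#   pos = sampler()  # an int in range(10), biased towards 9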
def get_uniform_single_eval_pos_sampler(max_len):
"""
Just sample any evaluation position with the same weight
:return: Sampler that can be fed to `train()` as `single_eval_pos_gen`.
"""
return lambda: random.choices(range(max_len))[0]
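# Same call pattern as above, but every position in range(max_len) is equally likely:
#   sampler = get_uniform_single_eval_pos_sampler(10)
#   pos = sampler()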
class SeqBN(nn.Module):
    def __init__(self, d_model):
        super().__init__()
        self.bn = nn.BatchNorm1d(d_model)
        self.d_model = d_model

    def forward(self, x):
        assert self.d_model == x.shape[-1]
        flat_x = x.view(-1, self.d_model)
        flat_x = self.bn(flat_x)
        return flat_x.view(*x.shape)
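# Minimal usage sketch (the shapes below are assumptions; any (..., d_model)-shaped
# input works, e.g. the (seq_len, batch, d_model) layout of nn.TransformerEncoder):
#   seq_bn = SeqBN(d_model=512)
#   x = torch.randn(50, 8, 512)
#   y = seq_bn(x)  # same shape; BatchNorm1d is applied per feature over all seq*batch entries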
def set_locals_in_self(locals):
    self = locals['self']
    for var_name, val in locals.items():
        if var_name != 'self':
            setattr(self, var_name, val)
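# Intended call pattern (sketch; `Config` and its arguments are hypothetical):
#   class Config:
#       def __init__(self, lr, batch_size=8):
#           set_locals_in_self(locals())  # sets self.lr and self.batch_size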
default_device = 'cuda:0' if torch.cuda.is_available() else 'cpu:0'
# Copied from StackOverflow, but we do an eval on the values additionally
class StoreDictKeyPair(argparse.Action):
    def __init__(self, option_strings, dest, nargs=None, **kwargs):
        self._nargs = nargs
        super(StoreDictKeyPair, self).__init__(option_strings, dest, nargs=nargs, **kwargs)

    def __call__(self, parser, namespace, values, option_string=None):
        my_dict = {}
        for kv in values:
            k, v = kv.split("=")
            try:
                my_dict[k] = eval(v)
            except NameError:
                my_dict[k] = v
        setattr(namespace, self.dest, my_dict)
        print("dict values: {}".format(my_dict))