import math
import argparse
import random

import torch
from torch import nn
from torch.optim.lr_scheduler import LambdaLR


def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=0.5, last_epoch=-1):
    """Create a schedule where the learning rate increases linearly from 0 to the
    initial lr over `num_warmup_steps`, then decreases following a cosine curve.
    With the default `num_cycles=0.5`, the cosine argument sweeps from 0 to pi over
    the remaining steps, so the learning rate decays from the initial lr to 0.
    """

    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))

    return LambdaLR(optimizer, lr_lambda, last_epoch)


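# Example usage (illustrative sketch, not part of this module: `model`, the choice of
# AdamW and the step counts are placeholders). The same pattern applies to
# `get_linear_schedule_with_warmup` below.
#
#     optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
#     scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=1000)
#     for step in range(1000):
#         ...  # forward/backward pass
#         optimizer.step()
#         scheduler.step()

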
def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
    """
    Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after
    a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.

    Args:
        optimizer (:class:`~torch.optim.Optimizer`):
            The optimizer for which to schedule the learning rate.
        num_warmup_steps (:obj:`int`):
            The number of steps for the warmup phase.
        num_training_steps (:obj:`int`):
            The total number of training steps.
        last_epoch (:obj:`int`, `optional`, defaults to -1):
            The index of the last epoch when resuming training.

    Return:
        :obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
    """

    def lr_lambda(current_step: int):
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        return max(
            0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))
        )

    return LambdaLR(optimizer, lr_lambda, last_epoch)


def get_openai_lr(transformer_model):
    """Learning-rate heuristic based on the total parameter count of the model."""
    num_params = sum(p.numel() for p in transformer_model.parameters())
    return 0.003239 - 0.0001395 * math.log(num_params)


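# Example (illustrative): for a model with roughly 25M parameters,
# 0.003239 - 0.0001395 * math.log(25e6) evaluates to about 8.6e-4.

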
def get_weighted_single_eval_pos_sampler(max_len):
    """
    This gives a sampler that can be used for `single_eval_pos` which yields good performance for all positions p,
    where p <= `max_len`. At most `max_len` - 1 examples are shown to the Transformer.
    :return: Sampler that can be fed to `train()` as `single_eval_pos_gen`.
    """
    return lambda: random.choices(range(max_len), [1 / (max_len - i) for i in range(max_len)])[0]


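# Illustration of the weighting above: for max_len=5 the (unnormalized) weights are
# [1/5, 1/4, 1/3, 1/2, 1], so later evaluation positions are sampled more often.

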
def get_uniform_single_eval_pos_sampler(max_len):
    """
    Sample any evaluation position with uniform probability.
    :return: Sampler that can be fed to `train()` as `single_eval_pos_gen`.
    """
    return lambda: random.choices(range(max_len))[0]


class SeqBN(nn.Module):
    """Batch norm over the feature dimension, applied across all sequence and batch positions."""

    def __init__(self, d_model):
        super().__init__()
        self.bn = nn.BatchNorm1d(d_model)
        self.d_model = d_model

    def forward(self, x):
        assert self.d_model == x.shape[-1]
        flat_x = x.view(-1, self.d_model)  # collapse all leading dims into one batch dim
        flat_x = self.bn(flat_x)
        return flat_x.view(*x.shape)


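# Example (illustrative sketch; the (seq_len, batch, d_model) shape convention is an
# assumption about the caller):
#     bn = SeqBN(d_model=512)
#     x = torch.randn(10, 32, 512)
#     y = bn(x)  # same shape as x; statistics are computed over all 10 * 32 positions

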
def set_locals_in_self(locals):
    """Set every variable in `locals` (except `self`) as an attribute on `self`."""
    self = locals['self']
    for var_name, val in locals.items():
        if var_name != 'self':
            setattr(self, var_name, val)


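# Example (illustrative): calling `set_locals_in_self(locals())` at the top of an
# `__init__(self, a, b)` stores the arguments as `self.a` and `self.b`.

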
default_device = 'cuda:0' if torch.cuda.is_available() else 'cpu:0'


class StoreDictKeyPair(argparse.Action):
    """Argparse action that parses `KEY=VAL` pairs into a dict."""

    def __init__(self, option_strings, dest, nargs=None, **kwargs):
        self._nargs = nargs
        super(StoreDictKeyPair, self).__init__(option_strings, dest, nargs=nargs, **kwargs)

    def __call__(self, parser, namespace, values, option_string=None):
        my_dict = {}
        for kv in values:
            k, v = kv.split("=")
            try:
                my_dict[k] = eval(v)  # interpret numbers, booleans, lists, etc.
            except NameError:
                my_dict[k] = v  # fall back to the raw string
        setattr(namespace, self.dest, my_dict)
        print("dict values: {}".format(my_dict))
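
# Example usage (illustrative sketch; the flag name and the keys are placeholders):
#     parser = argparse.ArgumentParser()
#     parser.add_argument("--params", action=StoreDictKeyPair, nargs="+", metavar="KEY=VAL")
#     args = parser.parse_args(["--params", "num_features=100", "verbose=False"])
#     # args.params == {"num_features": 100, "verbose": False}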