import math


class WarmupLR:
    """
    Linear Warmup learning rate scheduler. After warmup, learning rate is
    constant.

    Args:
        optimizer (torch.optim.Optimizer): optimizer
        warmup_steps (int): number of warmup steps

    """

    def __init__(self, optimizer, warmup_steps):
        self.optimizer = optimizer
        self.warmup_steps = warmup_steps
        self.base_lr = None

    def get_lr(self, lr, step):
        # Scale the base lr linearly from 0 to lr over warmup_steps, then
        # hold it constant after warmup.
        return lr * min(step / max(self.warmup_steps, 1), 1.0)

    def step(self, step):
        # Capture the base learning rates lazily on the first call, then set
        # each param group's lr for the given global step.
        if self.base_lr is None:
            self.base_lr = [
                param_group["lr"] for param_group in self.optimizer.param_groups
            ]
        for param_group, base_lr_group in zip(
            self.optimizer.param_groups, self.base_lr
        ):
            param_group["lr"] = self.get_lr(base_lr_group, step)

    def state_dict(self):
        # Exclude the optimizer itself; it is checkpointed separately.
        return {
            key: value for key, value in self.__dict__.items() if key != "optimizer"
        }

    def load_state_dict(self, state_dict):
        self.__dict__.update(state_dict)
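
# Usage sketch for WarmupLR (illustrative, not part of the original module):
# assumes a torch optimizer; "model" and the hyperparameters are placeholders.
# scheduler.step(step) is called once per optimization step with the global
# step count.
#
#     optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
#     scheduler = WarmupLR(optimizer, warmup_steps=1000)
#     for step in range(num_steps):
#         optimizer.step()
#         scheduler.step(step)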


class WarmupCosineDecayLR:
    """
    Linear Warmup learning rate scheduler. After warmup, learning rate is
    constant.
    After warmup, learning rate follows a cosine decay.

    Args:
        optimizer (torch.optim.Optimizer): optimizer
        warmup_steps (int): number of warmup steps
        total_steps (int): total number of steps
        rate (float): cosine decay rate
    """

    def __init__(self, optimizer, warmup_steps, total_steps, rate=1.0):
        self.optimizer = optimizer
        self.warmup_steps = warmup_steps
        self.base_lr = None
        self.total_steps = total_steps
        self.rate = rate

    def get_lr(self, lr, step):
        if step < self.warmup_steps:
            # Linear warmup from 0 to the base learning rate.
            return lr * min(step / max(self.warmup_steps, 1), 1.0)
        else:
            # Cosine decay: progress goes from 0 at the end of warmup to 1
            # at total_steps.
            progress = (step - self.warmup_steps) / (
                self.total_steps - self.warmup_steps
            )
            return 0.5 * lr * (1 + math.cos(self.rate * math.pi * progress))

    def step(self, step):
        if self.base_lr is None:
            self.base_lr = [
                param_group["lr"] for param_group in self.optimizer.param_groups
            ]
        for param_group, base_lr_group in zip(
            self.optimizer.param_groups, self.base_lr
        ):
            param_group["lr"] = self.get_lr(base_lr_group, step)

    def state_dict(self):
        return {
            key: value for key, value in self.__dict__.items() if key != "optimizer"
        }

    def load_state_dict(self, state_dict):
        self.__dict__.update(state_dict)
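

# A minimal, runnable sketch of WarmupCosineDecayLR (illustrative only; the
# dummy parameter and hyperparameters below are assumptions, not part of the
# original module). Requires torch.
if __name__ == "__main__":
    import torch

    param = torch.nn.Parameter(torch.zeros(1))
    optimizer = torch.optim.SGD([param], lr=0.1)
    scheduler = WarmupCosineDecayLR(optimizer, warmup_steps=10, total_steps=100)

    for step in range(100):
        optimizer.step()
        scheduler.step(step)
        if step % 20 == 0:
            # Warms up linearly to 0.1 over 10 steps, then decays towards 0.
            print(step, optimizer.param_groups[0]["lr"])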