File size: 5,899 Bytes
e6b8f5d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
# Ranger deep learning optimizer - RAdam + Lookahead + Gradient Centralization, combined into one optimizer.

# https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer
# and/or
# https://github.com/lessw2020/Best-Deep-Learning-Optimizers

# Ranger has now been used to capture 12 records on the FastAI leaderboard.

# This version = 20.4.11

# Credits:
# Gradient Centralization --> https://arxiv.org/abs/2004.01461v2 (a new optimization technique for DNNs), github:  https://github.com/Yonghongwei/Gradient-Centralization
# RAdam -->  https://github.com/LiyuanLucasLiu/RAdam
# Lookahead --> rewritten by lessw2020, but big thanks to Github @LonePatient and @RWightman for ideas from their code.
# Lookahead paper --> MZhang,G Hinton  https://arxiv.org/abs/1907.08610

# summary of changes:
# 4/11/20 - add gradient centralization option.  Set new testing benchmark for accuracy with it, toggle with use_gc flag at init.
# full code integration with all updates at param level instead of group, moves slow weights into state dict (from generic weights),
# supports group learning rates (thanks @SHolderbach), fixes sporadic load from saved model issues.
# changes 8/31/19 - fix references to *self*.N_sma_threshold;
# changed eps to 1e-5 as better default than 1e-8.

import math
import torch
from torch.optim.optimizer import Optimizer


class Ranger(Optimizer):

	def __init__(self, params, lr=1e-3,  # lr
				 alpha=0.5, k=6, N_sma_threshhold=5,  # Ranger options
				 betas=(.95, 0.999), eps=1e-5, weight_decay=0,  # Adam options
				 use_gc=True, gc_conv_only=False
				 # Gradient centralization on or off, applied to conv layers only or conv + fc layers
				 ):

		# parameter checks
		if not 0.0 <= alpha <= 1.0:
			raise ValueError(f'Invalid slow update rate: {alpha}')
		if not 1 <= k:
			raise ValueError(f'Invalid lookahead steps: {k}')
		if not lr > 0:
			raise ValueError(f'Invalid Learning Rate: {lr}')
		if not eps > 0:
			raise ValueError(f'Invalid eps: {eps}')

		# parameter comments:
		# beta1 (momentum) of .95 seems to work better than .90...
		# N_sma_threshold of 5 seems better in testing than 4.
		# In both cases, worth testing on your dataset (.90 vs .95, 4 vs 5) to make sure which works best for you.

		# prep defaults and init torch.optim base
		defaults = dict(lr=lr, alpha=alpha, k=k, step_counter=0, betas=betas, N_sma_threshhold=N_sma_threshhold,
						eps=eps, weight_decay=weight_decay)
		super().__init__(params, defaults)

		# adjustable threshold
		self.N_sma_threshhold = N_sma_threshhold

		# look ahead params

		self.alpha = alpha
		self.k = k

		# radam buffer for state
		self.radam_buffer = [[None, None, None] for ind in range(10)]

		# gc on or off
		self.use_gc = use_gc

		# level of gradient centralization
		self.gc_gradient_threshold = 3 if gc_conv_only else 1

	def __setstate__(self, state):
		super(Ranger, self).__setstate__(state)

	def step(self, closure=None):
		loss = None

		# Evaluate averages and grad, update param tensors
		for group in self.param_groups:

			for p in group['params']:
				if p.grad is None:
					continue
				grad = p.grad.data.float()

				if grad.is_sparse:
					raise RuntimeError('Ranger optimizer does not support sparse gradients')

				p_data_fp32 = p.data.float()

				state = self.state[p]  # get state dict for this param

				if len(state) == 0:  # if first time to run...init dictionary with our desired entries
					# if self.first_run_check==0:
					# self.first_run_check=1
					# print("Initializing slow buffer...should not see this at load from saved model!")
					state['step'] = 0
					state['exp_avg'] = torch.zeros_like(p_data_fp32)
					state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)

					# look ahead weight storage now in state dict
					state['slow_buffer'] = torch.empty_like(p.data)
					state['slow_buffer'].copy_(p.data)

				else:
					state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
					state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)

				# begin computations
				exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
				beta1, beta2 = group['betas']

				# GC operation for Conv layers and FC layers
				if grad.dim() > self.gc_gradient_threshold:
					grad.add_(-grad.mean(dim=tuple(range(1, grad.dim())), keepdim=True))

				state['step'] += 1

				# compute variance mov avg
				exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
				# compute mean moving avg
				exp_avg.mul_(beta1).add_(1 - beta1, grad)

				buffered = self.radam_buffer[int(state['step'] % 10)]

				if state['step'] == buffered[0]:
					N_sma, step_size = buffered[1], buffered[2]
				else:
					buffered[0] = state['step']
					beta2_t = beta2 ** state['step']
					N_sma_max = 2 / (1 - beta2) - 1
					N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
					buffered[1] = N_sma
					if N_sma > self.N_sma_threshhold:
						step_size = math.sqrt(
							(1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (
										N_sma_max - 2)) / (1 - beta1 ** state['step'])
					else:
						step_size = 1.0 / (1 - beta1 ** state['step'])
					buffered[2] = step_size

				if group['weight_decay'] != 0:
					p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32)

				# apply lr
				if N_sma > self.N_sma_threshhold:
					denom = exp_avg_sq.sqrt().add_(group['eps'])
					p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg, denom)
				else:
					p_data_fp32.add_(-step_size * group['lr'], exp_avg)

				p.data.copy_(p_data_fp32)

				# integrated look ahead...
				# we do it at the param level instead of group level
				if state['step'] % group['k'] == 0:
					slow_p = state['slow_buffer']  # get access to slow param tensor
					slow_p.add_(self.alpha, p.data - slow_p)  # (fast weights - slow weights) * alpha
					p.data.copy_(slow_p)  # copy interpolated weights to RAdam param tensor

		return loss