Diffstat (limited to 'supervised/optimizers.py')
-rw-r--r-- | supervised/optimizers.py | 70 |
1 files changed, 0 insertions, 70 deletions
diff --git a/supervised/optimizers.py b/supervised/optimizers.py
deleted file mode 100644
index 1904e8d..0000000
--- a/supervised/optimizers.py
+++ /dev/null
@@ -1,70 +0,0 @@
-import torch
-
-
-class LARS(object):
-    """
-    Slight modification of LARC optimizer from https://github.com/NVIDIA/apex/blob/d74fda260c403f775817470d87f810f816f3d615/apex/parallel/LARC.py
-    Matches one from SimCLR implementation https://github.com/google-research/simclr/blob/master/lars_optimizer.py
-
-    Args:
-        optimizer: Pytorch optimizer to wrap and modify learning rate for.
-        trust_coefficient: Trust coefficient for calculating the adaptive lr. See https://arxiv.org/abs/1708.03888
-    """
-
-    def __init__(self,
-                 optimizer,
-                 trust_coefficient=0.001,
-                 ):
-        self.param_groups = optimizer.param_groups
-        self.optim = optimizer
-        self.trust_coefficient = trust_coefficient
-
-    def __getstate__(self):
-        return self.optim.__getstate__()
-
-    def __setstate__(self, state):
-        self.optim.__setstate__(state)
-
-    def __repr__(self):
-        return self.optim.__repr__()
-
-    def state_dict(self):
-        return self.optim.state_dict()
-
-    def load_state_dict(self, state_dict):
-        self.optim.load_state_dict(state_dict)
-
-    def zero_grad(self):
-        self.optim.zero_grad()
-
-    def add_param_group(self, param_group):
-        self.optim.add_param_group(param_group)
-
-    def step(self):
-        with torch.no_grad():
-            weight_decays = []
-            for group in self.optim.param_groups:
-                # absorb weight decay control from optimizer
-                weight_decay = group['weight_decay'] if 'weight_decay' in group else 0
-                weight_decays.append(weight_decay)
-                group['weight_decay'] = 0
-                for p in group['params']:
-                    if p.grad is None:
-                        continue
-
-                    if weight_decay != 0:
-                        p.grad.data += weight_decay * p.data
-
-                    param_norm = torch.norm(p.data)
-                    grad_norm = torch.norm(p.grad.data)
-                    adaptive_lr = 1.
-
-                    if param_norm != 0 and grad_norm != 0 and group['layer_adaptation']:
-                        adaptive_lr = self.trust_coefficient * param_norm / grad_norm
-
-                    p.grad.data *= adaptive_lr
-
-        self.optim.step()
-        # return weight decay control to optimizer
-        for i, group in enumerate(self.optim.param_groups):
-            group['weight_decay'] = weight_decays[i]
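For context, a minimal usage sketch of the removed wrapper, assuming a plain SGD base optimizer; the model, hyperparameters, and the bias/weight param-group split below are illustrative assumptions, not part of the repository. Note that step() reads group['layer_adaptation'], so every param group handed to the base optimizer must define that key.

import torch
from torch import nn

# Hypothetical model and hyperparameters, for illustration only.
model = nn.Linear(128, 10)

# Each param group needs a 'layer_adaptation' flag, since LARS.step()
# reads group['layer_adaptation'] to decide whether to apply the trust
# ratio. A common convention (as in SimCLR) is to disable it for biases
# and normalization parameters.
param_groups = [
    {'params': [p for n, p in model.named_parameters() if 'bias' not in n],
     'weight_decay': 1e-6, 'layer_adaptation': True},
    {'params': [p for n, p in model.named_parameters() if 'bias' in n],
     'weight_decay': 0.0, 'layer_adaptation': False},
]

base_optimizer = torch.optim.SGD(param_groups, lr=0.3, momentum=0.9)
optimizer = LARS(base_optimizer, trust_coefficient=0.001)

# Standard training step: before delegating to SGD, the wrapper rescales
# each parameter's gradient by trust_coefficient * ||w|| / ||g|| (and
# folds weight decay into the gradient so SGD does not apply it twice).
x, y = torch.randn(32, 128), torch.randint(0, 10, (32,))
loss = nn.functional.cross_entropy(model(x), y)
optimizer.zero_grad()
loss.backward()
optimizer.step()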