From 2745f7a55b26d2f9b441770a7a898c691abf4ed6 Mon Sep 17 00:00:00 2001
From: MohammadHasanZahweh <108551680+MohammadHasanZahweh@users.noreply.github.com>
Date: Mon, 25 Dec 2023 13:39:28 +0200
Subject: [PATCH 01/20] updated schedulers and optimizers

---
 optimizers/__init__.py      |  23 +++-
 optimizers/test_optimers.py |  15 +++
 over9000/__init__.py        |  14 +++
 over9000/adabelief.py       | 222 +++++++++++++++++++++++++++++++++++
 over9000/adamod.py          |  98 ++++++++++++++++
 over9000/adan.py            | 156 +++++++++++++++++++++++++
 over9000/apollo.py          | 115 +++++++++++++++++++
 over9000/diffgrad.py        | 127 ++++++++++++++++++++
 over9000/lamb.py            | 129 +++++++++++++++++++++
 over9000/lookahead.py       | 103 +++++++++++++++++
 over9000/madam.py           |  47 ++++++++
 over9000/madgrad.py         | 174 ++++++++++++++++++++++++++++
 over9000/novograd.py        | 223 ++++++++++++++++++++++++++++++++++++
 over9000/radam.py           | 209 +++++++++++++++++++++++++++++++++
 over9000/ralamb.py          |  99 ++++++++++++++++
 over9000/ranger.py          |  12 ++
 over9000/rangerlars.py      |  14 +++
 schedulers/__init__.py      |  85 +++++++++++---
 schedulers/polylr.py        |   3 +-
 19 files changed, 1845 insertions(+), 23 deletions(-)
 create mode 100644 optimizers/test_optimers.py
 create mode 100644 over9000/__init__.py
 create mode 100644 over9000/adabelief.py
 create mode 100644 over9000/adamod.py
 create mode 100644 over9000/adan.py
 create mode 100644 over9000/apollo.py
 create mode 100644 over9000/diffgrad.py
 create mode 100644 over9000/lamb.py
 create mode 100644 over9000/lookahead.py
 create mode 100644 over9000/madam.py
 create mode 100644 over9000/madgrad.py
 create mode 100644 over9000/novograd.py
 create mode 100644 over9000/radam.py
 create mode 100644 over9000/ralamb.py
 create mode 100644 over9000/ranger.py
 create mode 100644 over9000/rangerlars.py

diff --git a/optimizers/__init__.py b/optimizers/__init__.py
index c07a365..974f8cc 100644
--- a/optimizers/__init__.py
+++ b/optimizers/__init__.py
@@ -1,6 +1,4 @@
-import torch
-import math
-from torch.optim import Adam,RMSprop,SGD
+from torch.optim import Optimizer,Adam,RMSprop,SGD
 
 from .over9000 import *
 from .adamw import AdamW
@@ -19,7 +17,7 @@
     'rangerlars' : RangerLars,  ##Known as Over9000 optimizer
     'over9000' : RangerLars,    ################
-    'lookahead' : Lookahead,
+    # 'lookahead' : Lookahead,
     'lookahead_adam' : LookaheadAdam,
     'diffgrad' : DiffGrad,
     'adamod' : AdaMod,
@@ -30,8 +28,21 @@
     'adan' : Adan
 }
 
-#optimizers
-def get_optimizer(name ,params,*args,**kwargs):
+
+def get_optimizer(name: str, params, *args, **kwargs) -> Optimizer:
+    """
+    Return the optimizer registered under the given name.
+
+    @param name: name of the optimizer
+    @param params: parameters of the model that need to be optimized
+    @param *args & **kwargs: extra arguments forwarded to the optimizer
+
+    @type name: str
+    @type params: list or dict
+
+    @return: torch.optim.Optimizer
+
+    """
     name = name.lower()
     if name not in optimizer_mapping.keys():
         raise ValueError('Optimizer {} not an option'.format(name))

diff --git a/optimizers/test_optimers.py b/optimizers/test_optimers.py
new file mode 100644
index 0000000..a2176f3
--- /dev/null
+++ b/optimizers/test_optimers.py
@@ -0,0 +1,15 @@
+import unittest
+import torch
+
+from __init__ import get_optimizer, optimizer_mapping
+
+
+class test_optimizers(unittest.TestCase):
+    def test_get_optimizer_return(self):
+        for optimizer in optimizer_mapping:
+            net = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True)
+            self.assertIsInstance(get_optimizer(optimizer, net.parameters()), torch.optim.Optimizer, f"{optimizer} did not return an optimizer")
+
+
+if __name__ == "__main__":
test_optimizers().main() diff --git a/over9000/__init__.py b/over9000/__init__.py new file mode 100644 index 0000000..7af485f --- /dev/null +++ b/over9000/__init__.py @@ -0,0 +1,14 @@ +from .radam import PlainRAdam,RAdam +from .novograd import Novograd +from .ranger import Ranger +from .ralamb import Ralamb +from .rangerlars import RangerLars +from .lookahead import Lookahead,LookaheadAdam +#from .lamb import Lamb +from .diffgrad import DiffGrad +from .adamod import AdaMod +from .madam import Madam +from .apollo import Apollo +from .adabelief import AdaBelief +from .madgrad import MADGRAD +from .adan import Adan \ No newline at end of file diff --git a/over9000/adabelief.py b/over9000/adabelief.py new file mode 100644 index 0000000..2f817d6 --- /dev/null +++ b/over9000/adabelief.py @@ -0,0 +1,222 @@ +# from https://github.com/raw/juntang-zhuang/Adabelief-Optimizer/master/PyTorch_Experiments/AdaBelief.py +import math +import torch +from torch.optim.optimizer import Optimizer +from tabulate import tabulate +from colorama import Fore, Back, Style + +version_higher = ( torch.__version__ >= "1.5.0" ) + +class AdaBelief(Optimizer): + r"""Implements AdaBelief algorithm. Modified from Adam in PyTorch + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-16) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + amsgrad (boolean, optional): whether to use the AMSGrad variant of this + algorithm from the paper `On the Convergence of Adam and Beyond`_ + (default: False) + weight_decouple (boolean, optional): ( default: True) If set as True, then + the optimizer uses decoupled weight decay as in AdamW + fixed_decay (boolean, optional): (default: False) This is used when weight_decouple + is set as True. + When fixed_decay == True, the weight decay is performed as + $W_{new} = W_{old} - W_{old} \times decay$. + When fixed_decay == False, the weight decay is performed as + $W_{new} = W_{old} - W_{old} \times decay \times lr$. Note that in this case, the + weight decay ratio decreases with learning rate (lr). 
+ rectify (boolean, optional): (default: True) If set as True, then perform the rectified + update similar to RAdam + degenerated_to_sgd (boolean, optional) (default:True) If set as True, then perform SGD update + when variance of gradient is high + reference: AdaBelief Optimizer, adapting stepsizes by the belief in observed gradients, NeurIPS 2020 + """ + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16, + weight_decay=0, amsgrad=False, weight_decouple=True, fixed_decay=False, rectify=True, + degenerated_to_sgd=True): + + # ------------------------------------------------------------------------------ + # Print modifications to default arguments + print(Fore.RED + 'Please check your arguments if you have upgraded adabelief-pytorch from version 0.0.5.') + print(Fore.RED + 'Modifications to default arguments:') + default_table = tabulate([ + ['adabelief-pytorch=0.0.5','1e-8','False','False'], + ['Current version (0.1.0)','1e-16','True','True']], + headers=['eps','weight_decouple','rectify']) + print(Fore.RED + default_table) + + print(Fore.RED +'For a complete table of recommended hyperparameters, see') + print(Fore.RED + 'https://github.com/juntang-zhuang/Adabelief-Optimizer') + + print(Style.RESET_ALL) + # ------------------------------------------------------------------------------ + + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + + self.degenerated_to_sgd = degenerated_to_sgd + if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict): + for param in params: + if 'betas' in param and (param['betas'][0] != betas[0] or param['betas'][1] != betas[1]): + param['buffer'] = [[None, None, None] for _ in range(10)] + + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, amsgrad=amsgrad, buffer=[[None, None, None] for _ in range(10)]) + super(AdaBelief, self).__init__(params, defaults) + + self.degenerated_to_sgd = degenerated_to_sgd + self.weight_decouple = weight_decouple + self.rectify = rectify + self.fixed_decay = fixed_decay + if self.weight_decouple: + print('Weight decoupling enabled in AdaBelief') + if self.fixed_decay: + print('Weight decay fixed') + if self.rectify: + print('Rectification enabled in AdaBelief') + if amsgrad: + print('AMSGrad enabled in AdaBelief') + + def __setstate__(self, state): + super(AdaBelief, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('amsgrad', False) + + def reset(self): + for group in self.param_groups: + for p in group['params']: + state = self.state[p] + amsgrad = group['amsgrad'] + + # State initialization + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data,memory_format=torch.preserve_format) \ + if version_higher else torch.zeros_like(p.data) + + # Exponential moving average of squared gradient values + state['exp_avg_var'] = torch.zeros_like(p.data,memory_format=torch.preserve_format) \ + if version_higher else torch.zeros_like(p.data) + + if amsgrad: + # Maintains max of all exp. moving avg. of sq. grad. 
values + state['max_exp_avg_var'] = torch.zeros_like(p.data,memory_format=torch.preserve_format) \ + if version_higher else torch.zeros_like(p.data) + + def step(self, closure=None): + """Performs a single optimization step. + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError( + 'AdaBelief does not support sparse gradients, please consider SparseAdam instead') + amsgrad = group['amsgrad'] + + state = self.state[p] + + beta1, beta2 = group['betas'] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data,memory_format=torch.preserve_format) \ + if version_higher else torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_var'] = torch.zeros_like(p.data,memory_format=torch.preserve_format) \ + if version_higher else torch.zeros_like(p.data) + if amsgrad: + # Maintains max of all exp. moving avg. of sq. grad. values + state['max_exp_avg_var'] = torch.zeros_like(p.data,memory_format=torch.preserve_format) \ + if version_higher else torch.zeros_like(p.data) + + # get current state variable + exp_avg, exp_avg_var = state['exp_avg'], state['exp_avg_var'] + + state['step'] += 1 + bias_correction1 = 1 - beta1 ** state['step'] + bias_correction2 = 1 - beta2 ** state['step'] + + # Update first and second moment running average + exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) + grad_residual = grad - exp_avg + exp_avg_var.mul_(beta2).addcmul_( grad_residual, grad_residual, value=1 - beta2) + + if amsgrad: + max_exp_avg_var = state['max_exp_avg_var'] + # Maintains the maximum of all 2nd moment running avg. till now + torch.max(max_exp_avg_var, exp_avg_var, out=max_exp_avg_var) + + # Use the max. for normalizing running avg. 
of gradient + denom = (max_exp_avg_var.add_(group['eps']).sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) + else: + denom = (exp_avg_var.add_(group['eps']).sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) + + # perform weight decay, check if decoupled weight decay + if self.weight_decouple: + if not self.fixed_decay: + p.data.mul_(1.0 - group['lr'] * group['weight_decay']) + else: + p.data.mul_(1.0 - group['weight_decay']) + else: + if group['weight_decay'] != 0: + grad.add_(p.data, alpha=group['weight_decay']) + + # update + if not self.rectify: + # Default update + step_size = group['lr'] / bias_correction1 + p.data.addcdiv_( exp_avg, denom, value=-step_size) + + else: # Rectified update, forked from RAdam + buffered = group['buffer'][int(state['step'] % 10)] + if state['step'] == buffered[0]: + N_sma, step_size = buffered[1], buffered[2] + else: + buffered[0] = state['step'] + beta2_t = beta2 ** state['step'] + N_sma_max = 2 / (1 - beta2) - 1 + N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) + buffered[1] = N_sma + + # more conservative since it's an approximated value + if N_sma >= 5: + step_size = math.sqrt( + (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / ( + N_sma_max - 2)) / (1 - beta1 ** state['step']) + elif self.degenerated_to_sgd: + step_size = 1.0 / (1 - beta1 ** state['step']) + else: + step_size = -1 + buffered[2] = step_size + + if N_sma >= 5: + denom = exp_avg_var.sqrt().add_(group['eps']) + p.data.addcdiv_(exp_avg, denom, value=-step_size * group['lr']) + elif step_size > 0: + p.data.add_( exp_avg, alpha=-step_size * group['lr']) + + return loss \ No newline at end of file diff --git a/over9000/adamod.py b/over9000/adamod.py new file mode 100644 index 0000000..b345560 --- /dev/null +++ b/over9000/adamod.py @@ -0,0 +1,98 @@ +# original repo https://github.com/lancopku/AdaMod/blob/master/adamod/adamod.py +import math +import torch +from torch.optim import Optimizer + +class AdaMod(Optimizer): + """Implements AdaMod algorithm with Decoupled Weight Decay (arxiv.org/abs/1711.05101) + It has been proposed in `Adaptive and Momental Bounds for Adaptive Learning Rate Methods`_. 
+ Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.9, 0.999)) + beta3 (float, optional): smoothing coefficient for adaptive learning rates (default: 0.9999) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + """ + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), beta3=0.999, + eps=1e-8, weight_decay=0): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + if not 0.0 <= beta3 < 1.0: + raise ValueError("Invalid beta3 parameter: {}".format(beta3)) + defaults = dict(lr=lr, betas=betas, beta3=beta3, eps=eps, + weight_decay=weight_decay) + super(AdaMod, self).__init__(params, defaults) + + def __setstate__(self, state): + super(AdaMod, self).__setstate__(state) + + def step(self, closure=None): + """Performs a single optimization step. + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError( + 'AdaMod does not support sparse gradients') + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p.data) + # Exponential moving average of actual learning rates + state['exp_avg_lr'] = torch.zeros_like(p.data) + + exp_avg, exp_avg_sq, exp_avg_lr = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_lr'] + beta1, beta2 = group['betas'] + + state['step'] += 1 + + # Decay the first and second moment running average coefficient + exp_avg.mul_(beta1).add_(1 - beta1, grad) + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + + denom = exp_avg_sq.sqrt().add_(group['eps']) + + bias_correction1 = 1 - beta1 ** state['step'] + bias_correction2 = 1 - beta2 ** state['step'] + step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 + + if group['weight_decay'] != 0: + p.data.add_(-group['weight_decay'] * group['lr'], p.data) + + # Applies momental bounds on actual learning rates + step_size = torch.full_like(denom, step_size) + step_size.div_(denom) + exp_avg_lr.mul_(group['beta3']).add_(1 - group['beta3'], step_size) + step_size = torch.min(step_size, exp_avg_lr) + step_size.mul_(exp_avg) + + p.data.add_(-step_size) + + return loss diff --git a/over9000/adan.py b/over9000/adan.py new file mode 100644 index 0000000..8206a92 --- /dev/null +++ b/over9000/adan.py @@ -0,0 +1,156 @@ +# https://github.com/raw/sail-sg/Adan/main/adan.py + +# Copyright 2022 Garena Online Private Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance 
with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import math +import torch +from torch.optim.optimizer import Optimizer +from timm.utils import * + + +class Adan(Optimizer): + """ + Implements a pytorch variant of Adan + + Adan was proposed in + Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models[J]. arXiv preprint arXiv:2208.06677, 2022. + https://arxiv.org/abs/2208.06677 + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining parameter groups. + lr (float, optional): learning rate. (default: 1e-3) + betas (Tuple[float, float, flot], optional): coefficients used for computing + running averages of gradient and its norm. (default: (0.98, 0.92, 0.99)) + eps (float, optional): term added to the denominator to improve + numerical stability. (default: 1e-8) + weight_decay (float, optional): decoupled weight decay (L2 penalty) (default: 0) + max_grad_norm (float, optional): value used to clip + global grad norm (default: 0.0 no clip) + no_prox (bool): how to perform the decoupled weight decay (default: False) + """ + + def __init__(self, params, lr=1e-3, betas=(0.98, 0.92, 0.99), eps=1e-8, + weight_decay=0.0, max_grad_norm=0.0, no_prox=False): + if not 0.0 <= max_grad_norm: + raise ValueError("Invalid Max grad norm: {}".format(max_grad_norm)) + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + if not 0.0 <= betas[2] < 1.0: + raise ValueError("Invalid beta parameter at index 2: {}".format(betas[2])) + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, + max_grad_norm=max_grad_norm, no_prox=no_prox) + super(Adan, self).__init__(params, defaults) + + def __setstate__(self, state): + super(Adan, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('no_prox', False) + + @torch.no_grad() + def restart_opt(self): + for group in self.param_groups: + group['step'] = 0 + for p in group['params']: + if p.requires_grad: + state = self.state[p] + # State initialization + + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p) + # Exponential moving average of gradient difference + state['exp_avg_diff'] = torch.zeros_like(p) + + @torch.no_grad() + def step(self): + """ + Performs a single optimization step. 
+ """ + if self.defaults['max_grad_norm'] > 0: + device = self.param_groups[0]['params'][0].device + global_grad_norm = torch.zeros(1, device=device) + + max_grad_norm = torch.tensor(self.defaults['max_grad_norm'], device=device) + for group in self.param_groups: + + for p in group['params']: + if p.grad is not None: + grad = p.grad + global_grad_norm.add_(grad.pow(2).sum()) + + global_grad_norm = torch.sqrt(global_grad_norm) + + clip_global_grad_norm = torch.clamp(max_grad_norm / (global_grad_norm + group['eps']), max=1.0) + else: + clip_global_grad_norm = 1.0 + + for group in self.param_groups: + beta1, beta2, beta3 = group['betas'] + # assume same step across group now to simplify things + # per parameter step can be easily support by making it tensor, or pass list into kernel + if 'step' in group: + group['step'] += 1 + else: + group['step'] = 1 + + bias_correction1 = 1.0 - beta1 ** group['step'] + + bias_correction2 = 1.0 - beta2 ** group['step'] + + bias_correction3 = 1.0 - beta3 ** group['step'] + + for p in group['params']: + if p.grad is None: + continue + + state = self.state[p] + if len(state) == 0: + state['exp_avg'] = torch.zeros_like(p) + state['exp_avg_sq'] = torch.zeros_like(p) + state['exp_avg_diff'] = torch.zeros_like(p) + + grad = p.grad.mul_(clip_global_grad_norm) + if 'pre_grad' not in state or group['step'] == 1: + state['pre_grad'] = grad + + copy_grad = grad.clone() + + exp_avg, exp_avg_sq, exp_avg_diff = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_diff'] + diff = grad - state['pre_grad'] + + update = grad + beta2 * diff + exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) # m_t + exp_avg_diff.mul_(beta2).add_(diff, alpha=1 - beta2) # diff_t + exp_avg_sq.mul_(beta3).addcmul_(update, update, value=1 - beta3) # n_t + + denom = ((exp_avg_sq).sqrt() / math.sqrt(bias_correction3)).add_(group['eps']) + update = ((exp_avg / bias_correction1 + beta2 * exp_avg_diff / bias_correction2)).div_(denom) + + if group['no_prox']: + p.data.mul_(1 - group['lr'] * group['weight_decay']) + p.add_(update, alpha=-group['lr']) + else: + p.add_(update, alpha=-group['lr']) + p.data.div_(1 + group['lr'] * group['weight_decay']) + + state['pre_grad'] = copy_grad diff --git a/over9000/apollo.py b/over9000/apollo.py new file mode 100644 index 0000000..2dd8656 --- /dev/null +++ b/over9000/apollo.py @@ -0,0 +1,115 @@ +# from https://github.com/raw/XuezheMax/apollo/master/optim/apollo.py +import torch +from torch.optim.optimizer import Optimizer + + +class Apollo(Optimizer): + r"""Implements Atom algorithm. 
+ + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float): learning rate + beta (float, optional): coefficient used for computing + running averages of gradient (default: 0.9) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-4) + warmup (int, optional): number of warmup steps (default: 0) + init_lr (float, optional): initial learning rate for warmup (default: 0.01) + weight_decay (float, optional): weight decay coefficient (default: 0) + + """ + + def __init__(self, params, lr, beta=0.9, eps=1e-4, warmup=100, init_lr=0.01, weight_decay=0): + if not 0.0 < lr: + raise ValueError("Invalid learning rate value: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= beta < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(beta)) + if not 0.0 <= weight_decay: + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) + if not 0.0 <= warmup: + raise ValueError("Invalid warmup updates: {}".format(warmup)) + if not 0.0 <= init_lr <= 1.0: + raise ValueError("Invalid initial learning rate: {}".format(init_lr)) + + defaults = dict(lr=lr, beta=beta, eps=eps, warmup=warmup, + init_lr=init_lr, base_lr=lr, weight_decay=weight_decay) + super(Apollo, self).__init__(params, defaults) + + def __setstate__(self, state): + super(Apollo, self).__setstate__(state) + + @torch.no_grad() + def step(self, closure=None): + """Performs a single optimization step. + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg_grad'] = torch.zeros_like(p, memory_format=torch.preserve_format) + # Exponential moving average of squared gradient values + state['approx_hessian'] = torch.zeros_like(p, memory_format=torch.preserve_format) + # Previous update direction + state['update'] = torch.zeros_like(p, memory_format=torch.preserve_format) + + # Calculate current lr + if state['step'] < group['warmup']: + curr_lr = (group['base_lr'] - group['init_lr']) * state['step'] / group['warmup'] + group['init_lr'] + else: + curr_lr = group['lr'] + + # Perform optimization step + grad = p.grad + if grad.is_sparse: + raise RuntimeError('Atom does not support sparse gradients.') + + # Perform step weight decay + if group['weight_decay'] != 0: + grad = grad.add(p, alpha=group['weight_decay']) + + beta = group['beta'] + exp_avg_grad = state['exp_avg_grad'] + B = state['approx_hessian'] + d_p = state['update'] + + state['step'] += 1 + bias_correction = 1 - beta ** state['step'] + alpha = (1 - beta) / bias_correction + + # Update the running average grad + delta_grad = grad - exp_avg_grad + exp_avg_grad.add_(delta_grad, alpha=alpha) + + denom = d_p.norm(p=4).add(group['eps']) + d_p.div_(denom) + v_sq = d_p.mul(d_p) + delta = delta_grad.div_(denom).mul_(d_p).sum().mul(-alpha) - B.mul(v_sq).sum() + + # Update B + B.addcmul_(v_sq, delta) + + # calc direction of parameter updates + denom = B.abs().clamp_(min=1) + d_p.copy_(exp_avg_grad.div(denom)) + + p.add_(d_p, alpha=-curr_lr) + + return loss diff --git a/over9000/diffgrad.py b/over9000/diffgrad.py new 
file mode 100644 index 0000000..8295de9 --- /dev/null +++ b/over9000/diffgrad.py @@ -0,0 +1,127 @@ + +import math +import torch +from torch.optim.optimizer import Optimizer +import numpy as np +import torch.nn as nn + +# Original source: https://github.com/shivram1987/diffGrad/blob/master/diffGrad.py + +# modifications: @lessw2020 +# https://github.com/lessw2020/Best-Deep-Learning-Optimizers/blob/master/diffgrad/diffgrad.py + +class DiffGrad(Optimizer): + r"""Implements diffGrad algorithm. It is modified from the pytorch implementation of Adam. + It has been proposed in `diffGrad: An Optimization Method for Convolutional Neural Networks`_. + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + amsgrad (boolean, optional): whether to use the AMSGrad variant of this + algorithm from the paper `On the Convergence of Adam and Beyond`_ + (default: False) + .. _diffGrad: An Optimization Method for Convolutional Neural Networks: + https://arxiv.org/abs/1909.11015 + .. _Adam\: A Method for Stochastic Optimization: + https://arxiv.org/abs/1412.6980 + .. _On the Convergence of Adam and Beyond: + https://openreview.net/forum?id=ryQu7f-RZ + """ + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, version=0, weight_decay=0): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + + + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) + + super().__init__(params, defaults) + + #save version + self.version = version + + def __setstate__(self, state): + super().__setstate__(state) + + def step(self, closure=None): + """Performs a single optimization step. + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. 
+ """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError('diffGrad does not support sparse gradients, please consider SparseAdam instead') + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p.data) + # Previous gradient + state['previous_grad'] = torch.zeros_like(p.data) + + exp_avg, exp_avg_sq, previous_grad = state['exp_avg'], state['exp_avg_sq'], state['previous_grad'] + beta1, beta2 = group['betas'] + + state['step'] += 1 + + if group['weight_decay'] != 0: + grad.add_(group['weight_decay'], p.data) + + # Decay the first and second moment running average coefficient + exp_avg.mul_(beta1).add_(1 - beta1, grad) + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + denom = exp_avg_sq.sqrt().add_(group['eps']) + + bias_correction1 = 1 - beta1 ** state['step'] + bias_correction2 = 1 - beta2 ** state['step'] + + # compute diffgrad coefficient (dfc) + + + if self.version==0: + diff = abs(previous_grad - grad) + elif self.version ==1: + diff = previous_grad-grad + elif self.version ==2: + diff = .5*abs(previous_grad - grad) + + if self.version==0 or self.version==1: + dfc = 1. / (1. + torch.exp(-diff)) + elif self.version==2: + dfc = 9. / (1. + torch.exp(-diff))-4 #DFC2 = 9/(1+e-(.5/g/)-4 #range .5,5 + + state['previous_grad'] = grad + + # update momentum with dfc + exp_avg1 = exp_avg * dfc + + step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 + + p.data.addcdiv_(-step_size, exp_avg1, denom) + + return loss \ No newline at end of file diff --git a/over9000/lamb.py b/over9000/lamb.py new file mode 100644 index 0000000..67adb67 --- /dev/null +++ b/over9000/lamb.py @@ -0,0 +1,129 @@ +"""Lamb optimizer.""" +# from https://github.com/cybertronai/pytorch-lamb/blob/master/pytorch_lamb/lamb.py + +import collections +import math + +import torch +from torch.optim import Optimizer + +try: + from tensorboardX import SummaryWriter + + def log_lamb_rs(optimizer: Optimizer, event_writer: SummaryWriter, token_count: int): + """Log a histogram of trust ratio scalars in across layers.""" + results = collections.defaultdict(list) + for group in optimizer.param_groups: + for p in group['params']: + state = optimizer.state[p] + for i in ('weight_norm', 'adam_norm', 'trust_ratio'): + if i in state: + results[i].append(state[i]) + + for k, v in results.items(): + event_writer.add_histogram(f'lamb/{k}', torch.tensor(v), token_count) +except ModuleNotFoundError as e: + print("To use this log_lamb_rs, please run 'pip install tensorboardx'. Also you must have Tensorboard running to see results") + +class Lamb(Optimizer): + r"""Implements Lamb algorithm. + + It has been proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes`_. 
+ + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + adam (bool, optional): always use trust ratio = 1, which turns this into + Adam. Useful for comparison purposes. + + .. _Large Batch Optimization for Deep Learning: Training BERT in 76 minutes: + https://arxiv.org/abs/1904.00962 + """ + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, + weight_decay=0, adam=False): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay) + self.adam = adam + super(Lamb, self).__init__(params, defaults) + + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError('Lamb does not support sparse gradients, consider SparseAdam instad.') + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p.data) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + + state['step'] += 1 + + # Decay the first and second moment running average coefficient + # m_t + exp_avg.mul_(beta1).add_(1 - beta1, grad) + # v_t + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + + # Paper v3 does not use debiasing. + # bias_correction1 = 1 - beta1 ** state['step'] + # bias_correction2 = 1 - beta2 ** state['step'] + # Apply bias to lr to avoid broadcast. 
+ step_size = group['lr'] # * math.sqrt(bias_correction2) / bias_correction1 + + weight_norm = p.data.pow(2).sum().sqrt().clamp(0, 10) + + adam_step = exp_avg / exp_avg_sq.sqrt().add(group['eps']) + if group['weight_decay'] != 0: + adam_step.add_(group['weight_decay'], p.data) + + adam_norm = adam_step.pow(2).sum().sqrt() + if weight_norm == 0 or adam_norm == 0: + trust_ratio = 1 + else: + trust_ratio = weight_norm / adam_norm + state['weight_norm'] = weight_norm + state['adam_norm'] = adam_norm + state['trust_ratio'] = trust_ratio + if self.adam: + trust_ratio = 1 + + p.data.add_(-step_size * trust_ratio, adam_step) + + return loss diff --git a/over9000/lookahead.py b/over9000/lookahead.py new file mode 100644 index 0000000..b54c518 --- /dev/null +++ b/over9000/lookahead.py @@ -0,0 +1,103 @@ +from torch.optim import Adam +import torch +from torch.optim import Optimizer +from collections import defaultdict + +class Lookahead(Optimizer): + ''' + PyTorch implementation of the lookahead wrapper. + Lookahead Optimizer: https://arxiv.org/abs/1907.08610 + ''' + def __init__(self, optimizer,alpha=0.5, k=6,pullback_momentum="none"): + ''' + :param optimizer:inner optimizer + :param k (int): number of lookahead steps + :param alpha(float): linear interpolation factor. 1.0 recovers the inner optimizer. + :param pullback_momentum (str): change to inner optimizer momentum on interpolation update + ''' + if not 0.0 <= alpha <= 1.0: + raise ValueError(f'Invalid slow update rate: {alpha}') + if not 1 <= k: + raise ValueError(f'Invalid lookahead steps: {k}') + self.optimizer = optimizer + self.param_groups = self.optimizer.param_groups + self.alpha = alpha + self.k = k + self.step_counter = 0 + assert pullback_momentum in ["reset", "pullback", "none"] + self.pullback_momentum = pullback_momentum + self.state = defaultdict(dict) + + # Cache the current optimizer parameters + for group in self.optimizer.param_groups: + for p in group['params']: + param_state = self.state[p] + param_state['cached_params'] = torch.zeros_like(p.data) + param_state['cached_params'].copy_(p.data) + + def __getstate__(self): + return { + 'state': self.state, + 'optimizer': self.optimizer, + 'alpha': self.alpha, + 'step_counter': self.step_counter, + 'k':self.k, + 'pullback_momentum': self.pullback_momentum + } + + def zero_grad(self): + self.optimizer.zero_grad() + + def state_dict(self): + return self.optimizer.state_dict() + + def load_state_dict(self, state_dict): + self.optimizer.load_state_dict(state_dict) + + def _backup_and_load_cache(self): + """Useful for performing evaluation on the slow weights (which typically generalize better) + """ + for group in self.optimizer.param_groups: + for p in group['params']: + param_state = self.state[p] + param_state['backup_params'] = torch.zeros_like(p.data) + param_state['backup_params'].copy_(p.data) + p.data.copy_(param_state['cached_params']) + + def _clear_and_load_backup(self): + for group in self.optimizer.param_groups: + for p in group['params']: + param_state = self.state[p] + p.data.copy_(param_state['backup_params']) + del param_state['backup_params'] + + def step(self, closure=None): + """Performs a single Lookahead optimization step. + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. 
+ """ + loss = self.optimizer.step(closure) + self.step_counter += 1 + + if self.step_counter >= self.k: + self.step_counter = 0 + # Lookahead and cache the current optimizer parameters + for group in self.optimizer.param_groups: + for p in group['params']: + param_state = self.state[p] + p.data.mul_(self.alpha).add_(1.0 - self.alpha, param_state['cached_params']) # crucial line + param_state['cached_params'].copy_(p.data) + if self.pullback_momentum == "pullback": + internal_momentum = self.optimizer.state[p]["momentum_buffer"] + self.optimizer.state[p]["momentum_buffer"] = internal_momentum.mul_(self.alpha).add_( + 1.0 - self.alpha, param_state["cached_mom"]) + param_state["cached_mom"] = self.optimizer.state[p]["momentum_buffer"] + elif self.pullback_momentum == "reset": + self.optimizer.state[p]["momentum_buffer"] = torch.zeros_like(p.data) + + return loss + +def LookaheadAdam(params, alpha=0.5, k=6, *args, **kwargs): + adam = Adam(params, *args, **kwargs) + return Lookahead(adam, alpha, k) \ No newline at end of file diff --git a/over9000/madam.py b/over9000/madam.py new file mode 100644 index 0000000..4212701 --- /dev/null +++ b/over9000/madam.py @@ -0,0 +1,47 @@ +# from here https://github.com/jxbz/madam/blob/master/pytorch/optim/madam.py +import torch +from torch.optim.optimizer import Optimizer, required + + +class Madam(Optimizer): + + def __init__(self, params, lr=0.01, p_scale=3.0, g_bound=10.0): + + self.p_scale = p_scale + self.g_bound = g_bound + defaults = dict(lr=lr) + super(Madam, self).__init__(params, defaults) + + def step(self, closure=None): + """Performs a single optimization step. + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + + state = self.state[p] + if len(state) == 0: + state['max'] = self.p_scale*(p*p).mean().sqrt().item() + state['step'] = 0 + state['exp_avg_sq'] = torch.zeros_like(p) + + state['step'] += 1 + bias_correction = 1 - 0.999 ** state['step'] + state['exp_avg_sq'] = 0.999 * state['exp_avg_sq'] + 0.001 * p.grad.data**2 + + g_normed = p.grad.data / (state['exp_avg_sq']/bias_correction).sqrt() + g_normed[torch.isnan(g_normed)] = 0 + g_normed.clamp_(-self.g_bound, self.g_bound) + + p.data *= torch.exp( -group['lr']*g_normed*torch.sign(p.data) ) + p.data.clamp_(-state['max'], state['max']) + + return loss diff --git a/over9000/madgrad.py b/over9000/madgrad.py new file mode 100644 index 0000000..bcbc90e --- /dev/null +++ b/over9000/madgrad.py @@ -0,0 +1,174 @@ +# from https://github.com/raw/facebookresearch/madgrad/master/madgrad/madgrad.py +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math +from typing import TYPE_CHECKING, Any, Callable, Optional + +import torch +import torch.optim + +if TYPE_CHECKING: + from torch.optim.optimizer import _params_t +else: + _params_t = Any + + +class MADGRAD(torch.optim.Optimizer): + """ + MADGRAD_: A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic + Optimization. + + .. _MADGRAD: https://arxiv.org/abs/2101.11075 + + MADGRAD is a general purpose optimizer that can be used in place of SGD or + Adam may converge faster and generalize better. Currently GPU-only. 
+ Typically, the same learning rate schedule that is used for SGD or Adam may + be used. The overall learning rate is not comparable to either method and + should be determined by a hyper-parameter sweep. + + MADGRAD requires less weight decay than other methods, often as little as + zero. Momentum values used for SGD or Adam's beta1 should work here also. + + On sparse problems both weight_decay and momentum should be set to 0. + + Arguments: + params (iterable): + Iterable of parameters to optimize or dicts defining parameter groups. + lr (float): + Learning rate (default: 1e-2). + momentum (float): + Momentum value in the range [0,1) (default: 0.9). + weight_decay (float): + Weight decay, i.e. a L2 penalty (default: 0). + eps (float): + Term added to the denominator outside of the root operation to improve numerical stability. (default: 1e-6). + """ + + def __init__( + self, params: _params_t, lr: float = 1e-2, momentum: float = 0.9, weight_decay: float = 0, eps: float = 1e-6, + ): + if momentum < 0 or momentum >= 1: + raise ValueError(f"Momentum {momentum} must be in the range [0,1]") + if lr <= 0: + raise ValueError(f"Learning rate {lr} must be positive") + if weight_decay < 0: + raise ValueError(f"Weight decay {weight_decay} must be non-negative") + if eps < 0: + raise ValueError(f"Eps must be non-negative") + + defaults = dict(lr=lr, eps=eps, momentum=momentum, weight_decay=weight_decay, k=0) + super().__init__(params, defaults) + + for group in self.param_groups: + for p in group["params"]: + state = self.state[p] + + state["grad_sum_sq"] = torch.zeros_like(p.data).detach() + state["s"] = torch.zeros_like(p.data).detach() + if momentum != 0: + state["x0"] = torch.clone(p.data).detach() + + @property + def supports_memory_efficient_fp16(self) -> bool: + return False + + @property + def supports_flat_params(self) -> bool: + return True + + def step(self, closure: Optional[Callable[[], float]] = None) -> Optional[float]: + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. 
+ """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + eps = group["eps"] + k = group["k"] + lr = group["lr"] + eps + decay = group["weight_decay"] + momentum = group["momentum"] + + ck = 1 - momentum + lamb = lr * math.pow(k + 1, 0.5) + + for p in group["params"]: + if p.grad is None: + continue + grad = p.grad.data + state = self.state[p] + + if momentum != 0.0 and grad.is_sparse: + raise RuntimeError("momentum != 0 is not compatible with sparse gradients") + + grad_sum_sq = state["grad_sum_sq"] + s = state["s"] + + # Apply weight decay + if decay != 0: + if grad.is_sparse: + raise RuntimeError("weight_decay option is not compatible with sparse gradients") + + grad.add_(p.data, alpha=decay) + + if grad.is_sparse: + grad = grad.coalesce() + grad_val = grad._values() + + p_masked = p.sparse_mask(grad) + grad_sum_sq_masked = grad_sum_sq.sparse_mask(grad) + s_masked = s.sparse_mask(grad) + + # Compute x_0 from other known quantities + rms_masked_vals = grad_sum_sq_masked._values().pow(1 / 3).add_(eps) + x0_masked_vals = p_masked._values().addcdiv(s_masked._values(), rms_masked_vals, value=1) + + # Dense + sparse op + grad_sq = grad * grad + grad_sum_sq.add_(grad_sq, alpha=lamb) + grad_sum_sq_masked.add_(grad_sq, alpha=lamb) + + rms_masked_vals = grad_sum_sq_masked._values().pow_(1 / 3).add_(eps) + + s.add_(grad, alpha=lamb) + s_masked._values().add_(grad_val, alpha=lamb) + + # update masked copy of p + p_kp1_masked_vals = x0_masked_vals.addcdiv(s_masked._values(), rms_masked_vals, value=-1) + # Copy updated masked p to dense p using an add operation + p_masked._values().add_(p_kp1_masked_vals, alpha=-1) + p.data.add_(p_masked, alpha=-1) + else: + if momentum == 0: + # Compute x_0 from other known quantities + rms = grad_sum_sq.pow(1 / 3).add_(eps) + x0 = p.data.addcdiv(s, rms, value=1) + else: + x0 = state["x0"] + + # Accumulate second moments + grad_sum_sq.addcmul_(grad, grad, value=lamb) + rms = grad_sum_sq.pow(1 / 3).add_(eps) + + # Update s + s.data.add_(grad, alpha=lamb) + + # Step + if momentum == 0: + p.data.copy_(x0.addcdiv(s, rms, value=-1)) + else: + z = x0.addcdiv(s, rms, value=-1) + + # p is a moving average of z + p.data.mul_(1 - ck).add_(z, alpha=ck) + + group["k"] = group["k"] + 1 + return loss diff --git a/over9000/novograd.py b/over9000/novograd.py new file mode 100644 index 0000000..5eea065 --- /dev/null +++ b/over9000/novograd.py @@ -0,0 +1,223 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from torch.optim import Optimizer +import math + +class AdamW(Optimizer): + """Implements AdamW algorithm. + + It has been proposed in `Adam: A Method for Stochastic Optimization`_. 
+ + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + amsgrad (boolean, optional): whether to use the AMSGrad variant of this + algorithm from the paper `On the Convergence of Adam and Beyond`_ + + Adam: A Method for Stochastic Optimization: + https://arxiv.org/abs/1412.6980 + On the Convergence of Adam and Beyond: + https://openreview.net/forum?id=ryQu7f-RZ + """ + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, + weight_decay=0, amsgrad=False): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, amsgrad=amsgrad) + super(AdamW, self).__init__(params, defaults) + + def __setstate__(self, state): + super(AdamW, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('amsgrad', False) + + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') + amsgrad = group['amsgrad'] + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p.data) + if amsgrad: + # Maintains max of all exp. moving avg. of sq. grad. values + state['max_exp_avg_sq'] = torch.zeros_like(p.data) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + if amsgrad: + max_exp_avg_sq = state['max_exp_avg_sq'] + beta1, beta2 = group['betas'] + + state['step'] += 1 + # Decay the first and second moment running average coefficient + exp_avg.mul_(beta1).add_(1 - beta1, grad) + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + if amsgrad: + # Maintains the maximum of all 2nd moment running avg. till now + torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) + # Use the max. for normalizing running avg. of gradient + denom = max_exp_avg_sq.sqrt().add_(group['eps']) + else: + denom = exp_avg_sq.sqrt().add_(group['eps']) + + bias_correction1 = 1 - beta1 ** state['step'] + bias_correction2 = 1 - beta2 ** state['step'] + step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 + p.data.add_(-step_size, torch.mul(p.data, group['weight_decay']).addcdiv_(1, exp_avg, denom) ) + + return loss + +class Novograd(Optimizer): + """ + Implements Novograd algorithm. 
+ + Args: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.95, 0)) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + grad_averaging: gradient averaging + amsgrad (boolean, optional): whether to use the AMSGrad variant of this + algorithm from the paper `On the Convergence of Adam and Beyond`_ + (default: False) + """ + + def __init__(self, params, lr=1e-3, betas=(0.95, 0), eps=1e-8, + weight_decay=0, grad_averaging=False, amsgrad=False): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, + grad_averaging=grad_averaging, + amsgrad=amsgrad) + + super(Novograd, self).__init__(params, defaults) + + def __setstate__(self, state): + super(Novograd, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('amsgrad', False) + + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError('Sparse gradients are not supported.') + amsgrad = group['amsgrad'] + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros([]).to(state['exp_avg'].device) + if amsgrad: + # Maintains max of all exp. moving avg. of sq. grad. values + state['max_exp_avg_sq'] = torch.zeros([]).to(state['exp_avg'].device) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + if amsgrad: + max_exp_avg_sq = state['max_exp_avg_sq'] + beta1, beta2 = group['betas'] + + state['step'] += 1 + + norm = torch.sum(torch.pow(grad, 2)) + + if exp_avg_sq == 0: + exp_avg_sq.copy_(norm) + else: + exp_avg_sq.mul_(beta2).add_(1 - beta2, norm) + + if amsgrad: + # Maintains the maximum of all 2nd moment running avg. till now + torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) + # Use the max. for normalizing running avg. 
of gradient + denom = max_exp_avg_sq.sqrt().add_(group['eps']) + else: + denom = exp_avg_sq.sqrt().add_(group['eps']) + + grad.div_(denom) + if group['weight_decay'] != 0: + grad.add_(group['weight_decay'], p.data) + if group['grad_averaging']: + grad.mul_(1 - beta1) + exp_avg.mul_(beta1).add_(grad) + + p.data.add_(-group['lr'], exp_avg) + + return loss \ No newline at end of file diff --git a/over9000/radam.py b/over9000/radam.py new file mode 100644 index 0000000..74d7e0c --- /dev/null +++ b/over9000/radam.py @@ -0,0 +1,209 @@ +# from https://github.com/LiyuanLucasLiu/RAdam/blob/master/radam.py + +import math +import torch +from torch.optim.optimizer import Optimizer, required + +class RAdam(Optimizer): + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) + self.buffer = [[None, None, None] for ind in range(10)] + super(RAdam, self).__init__(params, defaults) + + def __setstate__(self, state): + super(RAdam, self).__setstate__(state) + + def step(self, closure=None): + + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data.float() + if grad.is_sparse: + raise RuntimeError('RAdam does not support sparse gradients') + + p_data_fp32 = p.data.float() + + state = self.state[p] + + if len(state) == 0: + state['step'] = 0 + state['exp_avg'] = torch.zeros_like(p_data_fp32) + state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) + else: + state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) + state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + exp_avg.mul_(beta1).add_(1 - beta1, grad) + + state['step'] += 1 + buffered = self.buffer[int(state['step'] % 10)] + if state['step'] == buffered[0]: + N_sma, step_size = buffered[1], buffered[2] + else: + buffered[0] = state['step'] + beta2_t = beta2 ** state['step'] + N_sma_max = 2 / (1 - beta2) - 1 + N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) + buffered[1] = N_sma + + # more conservative since it's an approximated value + if N_sma >= 5: + step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) + else: + step_size = 1.0 / (1 - beta1 ** state['step']) + buffered[2] = step_size + + if group['weight_decay'] != 0: + p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) + + # more conservative since it's an approximated value + if N_sma >= 5: + denom = exp_avg_sq.sqrt().add_(group['eps']) + p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg, denom) + else: + p_data_fp32.add_(-step_size * group['lr'], exp_avg) + + p.data.copy_(p_data_fp32) + + return loss + +class PlainRAdam(Optimizer): + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) + + super(PlainRAdam, self).__init__(params, defaults) + + def __setstate__(self, state): + super(PlainRAdam, self).__setstate__(state) + + def step(self, closure=None): + + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data.float() + if grad.is_sparse: + raise 
RuntimeError('RAdam does not support sparse gradients') + + p_data_fp32 = p.data.float() + + state = self.state[p] + + if len(state) == 0: + state['step'] = 0 + state['exp_avg'] = torch.zeros_like(p_data_fp32) + state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) + else: + state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) + state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + exp_avg.mul_(beta1).add_(1 - beta1, grad) + + state['step'] += 1 + beta2_t = beta2 ** state['step'] + N_sma_max = 2 / (1 - beta2) - 1 + N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) + + if group['weight_decay'] != 0: + p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) + + # more conservative since it's an approximated value + if N_sma >= 5: + step_size = group['lr'] * math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) + denom = exp_avg_sq.sqrt().add_(group['eps']) + p_data_fp32.addcdiv_(-step_size, exp_avg, denom) + else: + step_size = group['lr'] / (1 - beta1 ** state['step']) + p_data_fp32.add_(-step_size, exp_avg) + + p.data.copy_(p_data_fp32) + + return loss + + +class AdamW(Optimizer): + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, warmup = 0): + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, warmup = warmup) + super(AdamW, self).__init__(params, defaults) + + def __setstate__(self, state): + super(AdamW, self).__setstate__(state) + + def step(self, closure=None): + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data.float() + if grad.is_sparse: + raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') + + p_data_fp32 = p.data.float() + + state = self.state[p] + + if len(state) == 0: + state['step'] = 0 + state['exp_avg'] = torch.zeros_like(p_data_fp32) + state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) + else: + state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) + state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + + state['step'] += 1 + + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + exp_avg.mul_(beta1).add_(1 - beta1, grad) + + denom = exp_avg_sq.sqrt().add_(group['eps']) + bias_correction1 = 1 - beta1 ** state['step'] + bias_correction2 = 1 - beta2 ** state['step'] + + if group['warmup'] > state['step']: + scheduled_lr = 1e-8 + state['step'] * group['lr'] / group['warmup'] + else: + scheduled_lr = group['lr'] + + step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 + + if group['weight_decay'] != 0: + p_data_fp32.add_(-group['weight_decay'] * scheduled_lr, p_data_fp32) + + p_data_fp32.addcdiv_(-step_size, exp_avg, denom) + + p.data.copy_(p_data_fp32) + + return loss diff --git a/over9000/ralamb.py b/over9000/ralamb.py new file mode 100644 index 0000000..f7036c8 --- /dev/null +++ b/over9000/ralamb.py @@ -0,0 +1,99 @@ +import torch, math +from torch.optim.optimizer import Optimizer + +# RAdam + LARS +class Ralamb(Optimizer): + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): + defaults = dict(lr=lr, 
betas=betas, eps=eps, weight_decay=weight_decay) + self.buffer = [[None, None, None] for ind in range(10)] + super(Ralamb, self).__init__(params, defaults) + + def __setstate__(self, state): + super(Ralamb, self).__setstate__(state) + + def step(self, closure=None): + + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data.float() + if grad.is_sparse: + raise RuntimeError('Ralamb does not support sparse gradients') + + p_data_fp32 = p.data.float() + + state = self.state[p] + + if len(state) == 0: + state['step'] = 0 + state['exp_avg'] = torch.zeros_like(p_data_fp32) + state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) + else: + state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) + state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + + # Decay the first and second moment running average coefficient + # m_t + exp_avg.mul_(beta1).add_(1 - beta1, grad) + # v_t + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + + state['step'] += 1 + buffered = self.buffer[int(state['step'] % 10)] + + if state['step'] == buffered[0]: + N_sma, radam_step_size = buffered[1], buffered[2] + else: + buffered[0] = state['step'] + beta2_t = beta2 ** state['step'] + N_sma_max = 2 / (1 - beta2) - 1 + N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) + buffered[1] = N_sma + + # more conservative since it's an approximated value + if N_sma >= 5: + radam_step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) + else: + radam_step_size = 1.0 / (1 - beta1 ** state['step']) + buffered[2] = radam_step_size + + if group['weight_decay'] != 0: + p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) + + # more conservative since it's an approximated value + radam_step = p_data_fp32.clone() + if N_sma >= 5: + denom = exp_avg_sq.sqrt().add_(group['eps']) + radam_step.addcdiv_(-radam_step_size * group['lr'], exp_avg, denom) + else: + radam_step.add_(-radam_step_size * group['lr'], exp_avg) + + radam_norm = radam_step.pow(2).sum().sqrt() + weight_norm = p.data.pow(2).sum().sqrt().clamp(0, 10) + if weight_norm == 0 or radam_norm == 0: + trust_ratio = 1 + else: + trust_ratio = weight_norm / radam_norm + + state['weight_norm'] = weight_norm + state['adam_norm'] = radam_norm + state['trust_ratio'] = trust_ratio + + if N_sma >= 5: + p_data_fp32.addcdiv_(-radam_step_size * group['lr'] * trust_ratio, exp_avg, denom) + else: + p_data_fp32.add_(-radam_step_size * group['lr'] * trust_ratio, exp_avg) + + p.data.copy_(p_data_fp32) + + return loss diff --git a/over9000/ranger.py b/over9000/ranger.py new file mode 100644 index 0000000..b6c8218 --- /dev/null +++ b/over9000/ranger.py @@ -0,0 +1,12 @@ + +import math +import torch +from torch.optim.optimizer import Optimizer, required +import itertools as it +from .lookahead import * +from .radam import * + +def Ranger(params, alpha=0.5, k=6, *args, **kwargs): + radam = RAdam(params, *args, **kwargs) + return Lookahead(radam, alpha, k) + diff --git a/over9000/rangerlars.py b/over9000/rangerlars.py new file mode 100644 index 0000000..3fcb39e --- /dev/null +++ b/over9000/rangerlars.py @@ -0,0 +1,14 @@ +import torch, math +from torch.optim.optimizer import Optimizer +import itertools as it +from .lookahead import * +from .ralamb import * + +# 
RAdam + LARS + LookAHead + + # Lookahead implementation from https://github.com/lonePatient/lookahead_pytorch/blob/master/optimizer.py + # RAdam + LARS implementation from https://gist.github.com/redknightlois/c4023d393eb8f92bb44b2ab582d7ec20 + + def RangerLars(params, alpha=0.5, k=6, *args, **kwargs): + ralamb = Ralamb(params, *args, **kwargs) + return Lookahead(ralamb, alpha, k) diff --git a/schedulers/__init__.py b/schedulers/__init__.py index 0ee3c44..a9a0ac5 100644 --- a/schedulers/__init__.py +++ b/schedulers/__init__.py @@ -1,13 +1,21 @@ import torch from torch.optim.lr_scheduler import ( - CosineAnnealingWarmRestarts, CosineAnnealingLR, + CosineAnnealingWarmRestarts, + CosineAnnealingLR, MultiStepLR, LambdaLR, OneCycleLR, ConstantLR, - #ReduceLROnPlateau, - #LinearLR, - #ExponentialLR + ReduceLROnPlateau, + LinearLR, + ExponentialLR, + MultiplicativeLR, + StepLR, + PolynomialLR, + ChainedScheduler, + SequentialLR, + CyclicLR, + LRScheduler ) # polylr from .polylr import PolyLR,PolyLR_WWP @@ -26,11 +34,32 @@ 'lambdalr' : LambdaLR, 'onecyclelr' : OneCycleLR, 'constantlr' : ConstantLR, - #'linearlr' : LinearLR, - #'exponentiallr' : ExponentialLR + 'linearlr' : LinearLR, + 'exponentiallr' : ExponentialLR, + 'reducelronplateau':ReduceLROnPlateau, + 'multiplicativelr':MultiplicativeLR, + 'steplr':StepLR, + 'polynomiallr':PolynomialLR, + 'chainedscheduler':ChainedScheduler, + 'sequentiallr':SequentialLR, + 'cycliclr':CyclicLR, } -# schedulers -def get_scheduler(name, optimizer, **kwargs): + + +def get_scheduler(name :str, optimizer :torch.optim.Optimizer, **kwargs) -> LRScheduler: + """ + This function returns the scheduler given its name + + @param name: name of the scheduler + @param optimizer: optimizer whose learning rate will be scheduled + @param **kwargs: named parameters for the scheduler + + @type name:str + @type optimizer: torch.optim.Optimizer + + @return: torch.optim.lr_scheduler.LRScheduler + + """ name = name.lower() if name not in scheduler_mapping.keys(): raise ValueError(f'scheduler {name} is not implemented!!!') @@ -38,19 +67,37 @@ def get_scheduler(name, optimizer, **kwargs): return scheduler_mapping[name](optimizer=optimizer, **kwargs) class AutoScheduler: - + """ + This class automates stepping of the scheduler: the user calls step() on every iteration and the wrapped scheduler is only stepped when required + """ def __init__( self, - name, - optimizer, + name :str, + optimizer :torch.optim.Optimizer, data_loader = None, - total_epochs = None, - iters_per_epoch = None, - mode = 'per_epoch', #per_epoch #per_iter + total_epochs :int = None, + iters_per_epoch :int|None = None, + mode = 'per_epoch', **kwargs ): - assert isinstance(total_epochs,int) + """ + This constructor builds the scheduler and configures how often it is stepped + + @param name: name of the scheduler + @param optimizer: optimizer whose learning rate will be scheduled + @param data_loader: (optional) used to infer iters_per_epoch + @param total_epochs: total number of training epochs + @param iters_per_epoch: (optional) number of iterations per epoch, used when data_loader is not given + @param **kwargs: named parameters for the scheduler + + @type name:str + @type optimizer: torch.optim.Optimizer + + + """ + + assert isinstance(total_epochs,int), "the total_epochs must be an int" if data_loader is not None: self.iters_per_epoch = len(data_loader) elif isinstance(iters_per_epoch,int): @@ -71,17 +118,25 @@ def __init__( self.scheduler = get_scheduler(name=name,optimizer=optimizer,**kwargs) def set_mode(self,mode): + """ + sets the stepping mode + @param mode: must be either 'per_epoch' or 'per_iter' + """ assert mode in ['per_epoch','per_iter'] self.mode = mode
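+ # The mode chosen here determines the stepping granularity computed in set_stepsize(): + # 'per_epoch' advances the wrapped scheduler once every iters_per_epoch calls to step(), + # while 'per_iter' advances it on every call. + # Rough usage sketch (the names `loader` and the StepLR kwargs are placeholders, not part of this patch): + # sched = AutoScheduler('steplr', optimizer, data_loader=loader, total_epochs=10, mode='per_epoch', step_size=30) + # calling sched.step() once per training iteration is then intended to step StepLR once per epoch.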
self.set_stepsize() def set_stepsize(self): + """define stepsize""" if self.mode == 'per_epoch': self.stepsize = self.iters_per_epoch else: self.stepsize = 1 def step(self): + """ + done in each iteration, steps only when iter_counter == self.stepsize + """ self.iter_counter += 1 if self.iter_counter > self.total_iters: diff --git a/schedulers/polylr.py b/schedulers/polylr.py index 6ec413f..c5afbf1 100644 --- a/schedulers/polylr.py +++ b/schedulers/polylr.py @@ -1,5 +1,4 @@ -import torch -from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, MultiStepLR,LambdaLR +from torch.optim.lr_scheduler import LambdaLR # polylr class PolyLR_WWP(LambdaLR): From 07c8b691e126f1e63e6b32d299fb4860371e3d58 Mon Sep 17 00:00:00 2001 From: MohammadHasanZahweh <108551680+MohammadHasanZahweh@users.noreply.github.com> Date: Mon, 25 Dec 2023 13:44:06 +0200 Subject: [PATCH 02/20] Delete over9000 directory --- over9000/__init__.py | 14 --- over9000/adabelief.py | 222 ---------------------------------------- over9000/adamod.py | 98 ------------------ over9000/adan.py | 156 ---------------------------- over9000/apollo.py | 115 --------------------- over9000/diffgrad.py | 127 ----------------------- over9000/lamb.py | 129 ------------------------ over9000/lookahead.py | 103 ------------------- over9000/madam.py | 47 --------- over9000/madgrad.py | 174 -------------------------------- over9000/novograd.py | 223 ----------------------------------------- over9000/radam.py | 209 -------------------------------------- over9000/ralamb.py | 99 ------------------ over9000/ranger.py | 12 --- over9000/rangerlars.py | 14 --- 15 files changed, 1742 deletions(-) delete mode 100644 over9000/__init__.py delete mode 100644 over9000/adabelief.py delete mode 100644 over9000/adamod.py delete mode 100644 over9000/adan.py delete mode 100644 over9000/apollo.py delete mode 100644 over9000/diffgrad.py delete mode 100644 over9000/lamb.py delete mode 100644 over9000/lookahead.py delete mode 100644 over9000/madam.py delete mode 100644 over9000/madgrad.py delete mode 100644 over9000/novograd.py delete mode 100644 over9000/radam.py delete mode 100644 over9000/ralamb.py delete mode 100644 over9000/ranger.py delete mode 100644 over9000/rangerlars.py diff --git a/over9000/__init__.py b/over9000/__init__.py deleted file mode 100644 index 7af485f..0000000 --- a/over9000/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -from .radam import PlainRAdam,RAdam -from .novograd import Novograd -from .ranger import Ranger -from .ralamb import Ralamb -from .rangerlars import RangerLars -from .lookahead import Lookahead,LookaheadAdam -#from .lamb import Lamb -from .diffgrad import DiffGrad -from .adamod import AdaMod -from .madam import Madam -from .apollo import Apollo -from .adabelief import AdaBelief -from .madgrad import MADGRAD -from .adan import Adan \ No newline at end of file diff --git a/over9000/adabelief.py b/over9000/adabelief.py deleted file mode 100644 index 2f817d6..0000000 --- a/over9000/adabelief.py +++ /dev/null @@ -1,222 +0,0 @@ -# from https://github.com/raw/juntang-zhuang/Adabelief-Optimizer/master/PyTorch_Experiments/AdaBelief.py -import math -import torch -from torch.optim.optimizer import Optimizer -from tabulate import tabulate -from colorama import Fore, Back, Style - -version_higher = ( torch.__version__ >= "1.5.0" ) - -class AdaBelief(Optimizer): - r"""Implements AdaBelief algorithm. 
Modified from Adam in PyTorch - Arguments: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - lr (float, optional): learning rate (default: 1e-3) - betas (Tuple[float, float], optional): coefficients used for computing - running averages of gradient and its square (default: (0.9, 0.999)) - eps (float, optional): term added to the denominator to improve - numerical stability (default: 1e-16) - weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - amsgrad (boolean, optional): whether to use the AMSGrad variant of this - algorithm from the paper `On the Convergence of Adam and Beyond`_ - (default: False) - weight_decouple (boolean, optional): ( default: True) If set as True, then - the optimizer uses decoupled weight decay as in AdamW - fixed_decay (boolean, optional): (default: False) This is used when weight_decouple - is set as True. - When fixed_decay == True, the weight decay is performed as - $W_{new} = W_{old} - W_{old} \times decay$. - When fixed_decay == False, the weight decay is performed as - $W_{new} = W_{old} - W_{old} \times decay \times lr$. Note that in this case, the - weight decay ratio decreases with learning rate (lr). - rectify (boolean, optional): (default: True) If set as True, then perform the rectified - update similar to RAdam - degenerated_to_sgd (boolean, optional) (default:True) If set as True, then perform SGD update - when variance of gradient is high - reference: AdaBelief Optimizer, adapting stepsizes by the belief in observed gradients, NeurIPS 2020 - """ - - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16, - weight_decay=0, amsgrad=False, weight_decouple=True, fixed_decay=False, rectify=True, - degenerated_to_sgd=True): - - # ------------------------------------------------------------------------------ - # Print modifications to default arguments - print(Fore.RED + 'Please check your arguments if you have upgraded adabelief-pytorch from version 0.0.5.') - print(Fore.RED + 'Modifications to default arguments:') - default_table = tabulate([ - ['adabelief-pytorch=0.0.5','1e-8','False','False'], - ['Current version (0.1.0)','1e-16','True','True']], - headers=['eps','weight_decouple','rectify']) - print(Fore.RED + default_table) - - print(Fore.RED +'For a complete table of recommended hyperparameters, see') - print(Fore.RED + 'https://github.com/juntang-zhuang/Adabelief-Optimizer') - - print(Style.RESET_ALL) - # ------------------------------------------------------------------------------ - - if not 0.0 <= lr: - raise ValueError("Invalid learning rate: {}".format(lr)) - if not 0.0 <= eps: - raise ValueError("Invalid epsilon value: {}".format(eps)) - if not 0.0 <= betas[0] < 1.0: - raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) - if not 0.0 <= betas[1] < 1.0: - raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) - - self.degenerated_to_sgd = degenerated_to_sgd - if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict): - for param in params: - if 'betas' in param and (param['betas'][0] != betas[0] or param['betas'][1] != betas[1]): - param['buffer'] = [[None, None, None] for _ in range(10)] - - defaults = dict(lr=lr, betas=betas, eps=eps, - weight_decay=weight_decay, amsgrad=amsgrad, buffer=[[None, None, None] for _ in range(10)]) - super(AdaBelief, self).__init__(params, defaults) - - self.degenerated_to_sgd = degenerated_to_sgd - self.weight_decouple = weight_decouple - self.rectify = rectify - 
self.fixed_decay = fixed_decay - if self.weight_decouple: - print('Weight decoupling enabled in AdaBelief') - if self.fixed_decay: - print('Weight decay fixed') - if self.rectify: - print('Rectification enabled in AdaBelief') - if amsgrad: - print('AMSGrad enabled in AdaBelief') - - def __setstate__(self, state): - super(AdaBelief, self).__setstate__(state) - for group in self.param_groups: - group.setdefault('amsgrad', False) - - def reset(self): - for group in self.param_groups: - for p in group['params']: - state = self.state[p] - amsgrad = group['amsgrad'] - - # State initialization - state['step'] = 0 - # Exponential moving average of gradient values - state['exp_avg'] = torch.zeros_like(p.data,memory_format=torch.preserve_format) \ - if version_higher else torch.zeros_like(p.data) - - # Exponential moving average of squared gradient values - state['exp_avg_var'] = torch.zeros_like(p.data,memory_format=torch.preserve_format) \ - if version_higher else torch.zeros_like(p.data) - - if amsgrad: - # Maintains max of all exp. moving avg. of sq. grad. values - state['max_exp_avg_var'] = torch.zeros_like(p.data,memory_format=torch.preserve_format) \ - if version_higher else torch.zeros_like(p.data) - - def step(self, closure=None): - """Performs a single optimization step. - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data - if grad.is_sparse: - raise RuntimeError( - 'AdaBelief does not support sparse gradients, please consider SparseAdam instead') - amsgrad = group['amsgrad'] - - state = self.state[p] - - beta1, beta2 = group['betas'] - - # State initialization - if len(state) == 0: - state['step'] = 0 - # Exponential moving average of gradient values - state['exp_avg'] = torch.zeros_like(p.data,memory_format=torch.preserve_format) \ - if version_higher else torch.zeros_like(p.data) - # Exponential moving average of squared gradient values - state['exp_avg_var'] = torch.zeros_like(p.data,memory_format=torch.preserve_format) \ - if version_higher else torch.zeros_like(p.data) - if amsgrad: - # Maintains max of all exp. moving avg. of sq. grad. values - state['max_exp_avg_var'] = torch.zeros_like(p.data,memory_format=torch.preserve_format) \ - if version_higher else torch.zeros_like(p.data) - - # get current state variable - exp_avg, exp_avg_var = state['exp_avg'], state['exp_avg_var'] - - state['step'] += 1 - bias_correction1 = 1 - beta1 ** state['step'] - bias_correction2 = 1 - beta2 ** state['step'] - - # Update first and second moment running average - exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) - grad_residual = grad - exp_avg - exp_avg_var.mul_(beta2).addcmul_( grad_residual, grad_residual, value=1 - beta2) - - if amsgrad: - max_exp_avg_var = state['max_exp_avg_var'] - # Maintains the maximum of all 2nd moment running avg. till now - torch.max(max_exp_avg_var, exp_avg_var, out=max_exp_avg_var) - - # Use the max. for normalizing running avg. 
of gradient - denom = (max_exp_avg_var.add_(group['eps']).sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) - else: - denom = (exp_avg_var.add_(group['eps']).sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) - - # perform weight decay, check if decoupled weight decay - if self.weight_decouple: - if not self.fixed_decay: - p.data.mul_(1.0 - group['lr'] * group['weight_decay']) - else: - p.data.mul_(1.0 - group['weight_decay']) - else: - if group['weight_decay'] != 0: - grad.add_(p.data, alpha=group['weight_decay']) - - # update - if not self.rectify: - # Default update - step_size = group['lr'] / bias_correction1 - p.data.addcdiv_( exp_avg, denom, value=-step_size) - - else: # Rectified update, forked from RAdam - buffered = group['buffer'][int(state['step'] % 10)] - if state['step'] == buffered[0]: - N_sma, step_size = buffered[1], buffered[2] - else: - buffered[0] = state['step'] - beta2_t = beta2 ** state['step'] - N_sma_max = 2 / (1 - beta2) - 1 - N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) - buffered[1] = N_sma - - # more conservative since it's an approximated value - if N_sma >= 5: - step_size = math.sqrt( - (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / ( - N_sma_max - 2)) / (1 - beta1 ** state['step']) - elif self.degenerated_to_sgd: - step_size = 1.0 / (1 - beta1 ** state['step']) - else: - step_size = -1 - buffered[2] = step_size - - if N_sma >= 5: - denom = exp_avg_var.sqrt().add_(group['eps']) - p.data.addcdiv_(exp_avg, denom, value=-step_size * group['lr']) - elif step_size > 0: - p.data.add_( exp_avg, alpha=-step_size * group['lr']) - - return loss \ No newline at end of file diff --git a/over9000/adamod.py b/over9000/adamod.py deleted file mode 100644 index b345560..0000000 --- a/over9000/adamod.py +++ /dev/null @@ -1,98 +0,0 @@ -# original repo https://github.com/lancopku/AdaMod/blob/master/adamod/adamod.py -import math -import torch -from torch.optim import Optimizer - -class AdaMod(Optimizer): - """Implements AdaMod algorithm with Decoupled Weight Decay (arxiv.org/abs/1711.05101) - It has been proposed in `Adaptive and Momental Bounds for Adaptive Learning Rate Methods`_. 
- Arguments: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - lr (float, optional): learning rate (default: 1e-3) - betas (Tuple[float, float], optional): coefficients used for computing - running averages of gradient and its square (default: (0.9, 0.999)) - beta3 (float, optional): smoothing coefficient for adaptive learning rates (default: 0.9999) - eps (float, optional): term added to the denominator to improve - numerical stability (default: 1e-8) - weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - """ - - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), beta3=0.999, - eps=1e-8, weight_decay=0): - if not 0.0 <= lr: - raise ValueError("Invalid learning rate: {}".format(lr)) - if not 0.0 <= eps: - raise ValueError("Invalid epsilon value: {}".format(eps)) - if not 0.0 <= betas[0] < 1.0: - raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) - if not 0.0 <= betas[1] < 1.0: - raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) - if not 0.0 <= beta3 < 1.0: - raise ValueError("Invalid beta3 parameter: {}".format(beta3)) - defaults = dict(lr=lr, betas=betas, beta3=beta3, eps=eps, - weight_decay=weight_decay) - super(AdaMod, self).__init__(params, defaults) - - def __setstate__(self, state): - super(AdaMod, self).__setstate__(state) - - def step(self, closure=None): - """Performs a single optimization step. - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data - if grad.is_sparse: - raise RuntimeError( - 'AdaMod does not support sparse gradients') - - state = self.state[p] - - # State initialization - if len(state) == 0: - state['step'] = 0 - # Exponential moving average of gradient values - state['exp_avg'] = torch.zeros_like(p.data) - # Exponential moving average of squared gradient values - state['exp_avg_sq'] = torch.zeros_like(p.data) - # Exponential moving average of actual learning rates - state['exp_avg_lr'] = torch.zeros_like(p.data) - - exp_avg, exp_avg_sq, exp_avg_lr = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_lr'] - beta1, beta2 = group['betas'] - - state['step'] += 1 - - # Decay the first and second moment running average coefficient - exp_avg.mul_(beta1).add_(1 - beta1, grad) - exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) - - denom = exp_avg_sq.sqrt().add_(group['eps']) - - bias_correction1 = 1 - beta1 ** state['step'] - bias_correction2 = 1 - beta2 ** state['step'] - step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 - - if group['weight_decay'] != 0: - p.data.add_(-group['weight_decay'] * group['lr'], p.data) - - # Applies momental bounds on actual learning rates - step_size = torch.full_like(denom, step_size) - step_size.div_(denom) - exp_avg_lr.mul_(group['beta3']).add_(1 - group['beta3'], step_size) - step_size = torch.min(step_size, exp_avg_lr) - step_size.mul_(exp_avg) - - p.data.add_(-step_size) - - return loss diff --git a/over9000/adan.py b/over9000/adan.py deleted file mode 100644 index 8206a92..0000000 --- a/over9000/adan.py +++ /dev/null @@ -1,156 +0,0 @@ -# https://github.com/raw/sail-sg/Adan/main/adan.py - -# Copyright 2022 Garena Online Private Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance 
with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import math -import torch -from torch.optim.optimizer import Optimizer -from timm.utils import * - - -class Adan(Optimizer): - """ - Implements a pytorch variant of Adan - - Adan was proposed in - Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models[J]. arXiv preprint arXiv:2208.06677, 2022. - https://arxiv.org/abs/2208.06677 - Arguments: - params (iterable): iterable of parameters to optimize or dicts defining parameter groups. - lr (float, optional): learning rate. (default: 1e-3) - betas (Tuple[float, float, flot], optional): coefficients used for computing - running averages of gradient and its norm. (default: (0.98, 0.92, 0.99)) - eps (float, optional): term added to the denominator to improve - numerical stability. (default: 1e-8) - weight_decay (float, optional): decoupled weight decay (L2 penalty) (default: 0) - max_grad_norm (float, optional): value used to clip - global grad norm (default: 0.0 no clip) - no_prox (bool): how to perform the decoupled weight decay (default: False) - """ - - def __init__(self, params, lr=1e-3, betas=(0.98, 0.92, 0.99), eps=1e-8, - weight_decay=0.0, max_grad_norm=0.0, no_prox=False): - if not 0.0 <= max_grad_norm: - raise ValueError("Invalid Max grad norm: {}".format(max_grad_norm)) - if not 0.0 <= lr: - raise ValueError("Invalid learning rate: {}".format(lr)) - if not 0.0 <= eps: - raise ValueError("Invalid epsilon value: {}".format(eps)) - if not 0.0 <= betas[0] < 1.0: - raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) - if not 0.0 <= betas[1] < 1.0: - raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) - if not 0.0 <= betas[2] < 1.0: - raise ValueError("Invalid beta parameter at index 2: {}".format(betas[2])) - defaults = dict(lr=lr, betas=betas, eps=eps, - weight_decay=weight_decay, - max_grad_norm=max_grad_norm, no_prox=no_prox) - super(Adan, self).__init__(params, defaults) - - def __setstate__(self, state): - super(Adan, self).__setstate__(state) - for group in self.param_groups: - group.setdefault('no_prox', False) - - @torch.no_grad() - def restart_opt(self): - for group in self.param_groups: - group['step'] = 0 - for p in group['params']: - if p.requires_grad: - state = self.state[p] - # State initialization - - # Exponential moving average of gradient values - state['exp_avg'] = torch.zeros_like(p) - # Exponential moving average of squared gradient values - state['exp_avg_sq'] = torch.zeros_like(p) - # Exponential moving average of gradient difference - state['exp_avg_diff'] = torch.zeros_like(p) - - @torch.no_grad() - def step(self): - """ - Performs a single optimization step. 
- """ - if self.defaults['max_grad_norm'] > 0: - device = self.param_groups[0]['params'][0].device - global_grad_norm = torch.zeros(1, device=device) - - max_grad_norm = torch.tensor(self.defaults['max_grad_norm'], device=device) - for group in self.param_groups: - - for p in group['params']: - if p.grad is not None: - grad = p.grad - global_grad_norm.add_(grad.pow(2).sum()) - - global_grad_norm = torch.sqrt(global_grad_norm) - - clip_global_grad_norm = torch.clamp(max_grad_norm / (global_grad_norm + group['eps']), max=1.0) - else: - clip_global_grad_norm = 1.0 - - for group in self.param_groups: - beta1, beta2, beta3 = group['betas'] - # assume same step across group now to simplify things - # per parameter step can be easily support by making it tensor, or pass list into kernel - if 'step' in group: - group['step'] += 1 - else: - group['step'] = 1 - - bias_correction1 = 1.0 - beta1 ** group['step'] - - bias_correction2 = 1.0 - beta2 ** group['step'] - - bias_correction3 = 1.0 - beta3 ** group['step'] - - for p in group['params']: - if p.grad is None: - continue - - state = self.state[p] - if len(state) == 0: - state['exp_avg'] = torch.zeros_like(p) - state['exp_avg_sq'] = torch.zeros_like(p) - state['exp_avg_diff'] = torch.zeros_like(p) - - grad = p.grad.mul_(clip_global_grad_norm) - if 'pre_grad' not in state or group['step'] == 1: - state['pre_grad'] = grad - - copy_grad = grad.clone() - - exp_avg, exp_avg_sq, exp_avg_diff = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_diff'] - diff = grad - state['pre_grad'] - - update = grad + beta2 * diff - exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) # m_t - exp_avg_diff.mul_(beta2).add_(diff, alpha=1 - beta2) # diff_t - exp_avg_sq.mul_(beta3).addcmul_(update, update, value=1 - beta3) # n_t - - denom = ((exp_avg_sq).sqrt() / math.sqrt(bias_correction3)).add_(group['eps']) - update = ((exp_avg / bias_correction1 + beta2 * exp_avg_diff / bias_correction2)).div_(denom) - - if group['no_prox']: - p.data.mul_(1 - group['lr'] * group['weight_decay']) - p.add_(update, alpha=-group['lr']) - else: - p.add_(update, alpha=-group['lr']) - p.data.div_(1 + group['lr'] * group['weight_decay']) - - state['pre_grad'] = copy_grad diff --git a/over9000/apollo.py b/over9000/apollo.py deleted file mode 100644 index 2dd8656..0000000 --- a/over9000/apollo.py +++ /dev/null @@ -1,115 +0,0 @@ -# from https://github.com/raw/XuezheMax/apollo/master/optim/apollo.py -import torch -from torch.optim.optimizer import Optimizer - - -class Apollo(Optimizer): - r"""Implements Atom algorithm. 
- - Arguments: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - lr (float): learning rate - beta (float, optional): coefficient used for computing - running averages of gradient (default: 0.9) - eps (float, optional): term added to the denominator to improve - numerical stability (default: 1e-4) - warmup (int, optional): number of warmup steps (default: 0) - init_lr (float, optional): initial learning rate for warmup (default: 0.01) - weight_decay (float, optional): weight decay coefficient (default: 0) - - """ - - def __init__(self, params, lr, beta=0.9, eps=1e-4, warmup=100, init_lr=0.01, weight_decay=0): - if not 0.0 < lr: - raise ValueError("Invalid learning rate value: {}".format(lr)) - if not 0.0 <= eps: - raise ValueError("Invalid epsilon value: {}".format(eps)) - if not 0.0 <= beta < 1.0: - raise ValueError("Invalid beta parameter at index 0: {}".format(beta)) - if not 0.0 <= weight_decay: - raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) - if not 0.0 <= warmup: - raise ValueError("Invalid warmup updates: {}".format(warmup)) - if not 0.0 <= init_lr <= 1.0: - raise ValueError("Invalid initial learning rate: {}".format(init_lr)) - - defaults = dict(lr=lr, beta=beta, eps=eps, warmup=warmup, - init_lr=init_lr, base_lr=lr, weight_decay=weight_decay) - super(Apollo, self).__init__(params, defaults) - - def __setstate__(self, state): - super(Apollo, self).__setstate__(state) - - @torch.no_grad() - def step(self, closure=None): - """Performs a single optimization step. - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - with torch.enable_grad(): - loss = closure() - - for group in self.param_groups: - for p in group['params']: - if p.grad is None: - continue - - state = self.state[p] - - # State initialization - if len(state) == 0: - state['step'] = 0 - # Exponential moving average of gradient values - state['exp_avg_grad'] = torch.zeros_like(p, memory_format=torch.preserve_format) - # Exponential moving average of squared gradient values - state['approx_hessian'] = torch.zeros_like(p, memory_format=torch.preserve_format) - # Previous update direction - state['update'] = torch.zeros_like(p, memory_format=torch.preserve_format) - - # Calculate current lr - if state['step'] < group['warmup']: - curr_lr = (group['base_lr'] - group['init_lr']) * state['step'] / group['warmup'] + group['init_lr'] - else: - curr_lr = group['lr'] - - # Perform optimization step - grad = p.grad - if grad.is_sparse: - raise RuntimeError('Atom does not support sparse gradients.') - - # Perform step weight decay - if group['weight_decay'] != 0: - grad = grad.add(p, alpha=group['weight_decay']) - - beta = group['beta'] - exp_avg_grad = state['exp_avg_grad'] - B = state['approx_hessian'] - d_p = state['update'] - - state['step'] += 1 - bias_correction = 1 - beta ** state['step'] - alpha = (1 - beta) / bias_correction - - # Update the running average grad - delta_grad = grad - exp_avg_grad - exp_avg_grad.add_(delta_grad, alpha=alpha) - - denom = d_p.norm(p=4).add(group['eps']) - d_p.div_(denom) - v_sq = d_p.mul(d_p) - delta = delta_grad.div_(denom).mul_(d_p).sum().mul(-alpha) - B.mul(v_sq).sum() - - # Update B - B.addcmul_(v_sq, delta) - - # calc direction of parameter updates - denom = B.abs().clamp_(min=1) - d_p.copy_(exp_avg_grad.div(denom)) - - p.add_(d_p, alpha=-curr_lr) - - return loss diff --git a/over9000/diffgrad.py b/over9000/diffgrad.py 
deleted file mode 100644 index 8295de9..0000000 --- a/over9000/diffgrad.py +++ /dev/null @@ -1,127 +0,0 @@ - -import math -import torch -from torch.optim.optimizer import Optimizer -import numpy as np -import torch.nn as nn - -# Original source: https://github.com/shivram1987/diffGrad/blob/master/diffGrad.py - -# modifications: @lessw2020 -# https://github.com/lessw2020/Best-Deep-Learning-Optimizers/blob/master/diffgrad/diffgrad.py - -class DiffGrad(Optimizer): - r"""Implements diffGrad algorithm. It is modified from the pytorch implementation of Adam. - It has been proposed in `diffGrad: An Optimization Method for Convolutional Neural Networks`_. - Arguments: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - lr (float, optional): learning rate (default: 1e-3) - betas (Tuple[float, float], optional): coefficients used for computing - running averages of gradient and its square (default: (0.9, 0.999)) - eps (float, optional): term added to the denominator to improve - numerical stability (default: 1e-8) - weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - amsgrad (boolean, optional): whether to use the AMSGrad variant of this - algorithm from the paper `On the Convergence of Adam and Beyond`_ - (default: False) - .. _diffGrad: An Optimization Method for Convolutional Neural Networks: - https://arxiv.org/abs/1909.11015 - .. _Adam\: A Method for Stochastic Optimization: - https://arxiv.org/abs/1412.6980 - .. _On the Convergence of Adam and Beyond: - https://openreview.net/forum?id=ryQu7f-RZ - """ - - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, version=0, weight_decay=0): - if not 0.0 <= lr: - raise ValueError("Invalid learning rate: {}".format(lr)) - if not 0.0 <= eps: - raise ValueError("Invalid epsilon value: {}".format(eps)) - if not 0.0 <= betas[0] < 1.0: - raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) - if not 0.0 <= betas[1] < 1.0: - raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) - - - defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) - - super().__init__(params, defaults) - - #save version - self.version = version - - def __setstate__(self, state): - super().__setstate__(state) - - def step(self, closure=None): - """Performs a single optimization step. - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. 
- """ - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data - if grad.is_sparse: - raise RuntimeError('diffGrad does not support sparse gradients, please consider SparseAdam instead') - - state = self.state[p] - - # State initialization - if len(state) == 0: - state['step'] = 0 - # Exponential moving average of gradient values - state['exp_avg'] = torch.zeros_like(p.data) - # Exponential moving average of squared gradient values - state['exp_avg_sq'] = torch.zeros_like(p.data) - # Previous gradient - state['previous_grad'] = torch.zeros_like(p.data) - - exp_avg, exp_avg_sq, previous_grad = state['exp_avg'], state['exp_avg_sq'], state['previous_grad'] - beta1, beta2 = group['betas'] - - state['step'] += 1 - - if group['weight_decay'] != 0: - grad.add_(group['weight_decay'], p.data) - - # Decay the first and second moment running average coefficient - exp_avg.mul_(beta1).add_(1 - beta1, grad) - exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) - denom = exp_avg_sq.sqrt().add_(group['eps']) - - bias_correction1 = 1 - beta1 ** state['step'] - bias_correction2 = 1 - beta2 ** state['step'] - - # compute diffgrad coefficient (dfc) - - - if self.version==0: - diff = abs(previous_grad - grad) - elif self.version ==1: - diff = previous_grad-grad - elif self.version ==2: - diff = .5*abs(previous_grad - grad) - - if self.version==0 or self.version==1: - dfc = 1. / (1. + torch.exp(-diff)) - elif self.version==2: - dfc = 9. / (1. + torch.exp(-diff))-4 #DFC2 = 9/(1+e-(.5/g/)-4 #range .5,5 - - state['previous_grad'] = grad - - # update momentum with dfc - exp_avg1 = exp_avg * dfc - - step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 - - p.data.addcdiv_(-step_size, exp_avg1, denom) - - return loss \ No newline at end of file diff --git a/over9000/lamb.py b/over9000/lamb.py deleted file mode 100644 index 67adb67..0000000 --- a/over9000/lamb.py +++ /dev/null @@ -1,129 +0,0 @@ -"""Lamb optimizer.""" -# from https://github.com/cybertronai/pytorch-lamb/blob/master/pytorch_lamb/lamb.py - -import collections -import math - -import torch -from torch.optim import Optimizer - -try: - from tensorboardX import SummaryWriter - - def log_lamb_rs(optimizer: Optimizer, event_writer: SummaryWriter, token_count: int): - """Log a histogram of trust ratio scalars in across layers.""" - results = collections.defaultdict(list) - for group in optimizer.param_groups: - for p in group['params']: - state = optimizer.state[p] - for i in ('weight_norm', 'adam_norm', 'trust_ratio'): - if i in state: - results[i].append(state[i]) - - for k, v in results.items(): - event_writer.add_histogram(f'lamb/{k}', torch.tensor(v), token_count) -except ModuleNotFoundError as e: - print("To use this log_lamb_rs, please run 'pip install tensorboardx'. Also you must have Tensorboard running to see results") - -class Lamb(Optimizer): - r"""Implements Lamb algorithm. - - It has been proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes`_. 
- - Arguments: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - lr (float, optional): learning rate (default: 1e-3) - betas (Tuple[float, float], optional): coefficients used for computing - running averages of gradient and its square (default: (0.9, 0.999)) - eps (float, optional): term added to the denominator to improve - numerical stability (default: 1e-8) - weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - adam (bool, optional): always use trust ratio = 1, which turns this into - Adam. Useful for comparison purposes. - - .. _Large Batch Optimization for Deep Learning: Training BERT in 76 minutes: - https://arxiv.org/abs/1904.00962 - """ - - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, - weight_decay=0, adam=False): - if not 0.0 <= lr: - raise ValueError("Invalid learning rate: {}".format(lr)) - if not 0.0 <= eps: - raise ValueError("Invalid epsilon value: {}".format(eps)) - if not 0.0 <= betas[0] < 1.0: - raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) - if not 0.0 <= betas[1] < 1.0: - raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) - defaults = dict(lr=lr, betas=betas, eps=eps, - weight_decay=weight_decay) - self.adam = adam - super(Lamb, self).__init__(params, defaults) - - def step(self, closure=None): - """Performs a single optimization step. - - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data - if grad.is_sparse: - raise RuntimeError('Lamb does not support sparse gradients, consider SparseAdam instad.') - - state = self.state[p] - - # State initialization - if len(state) == 0: - state['step'] = 0 - # Exponential moving average of gradient values - state['exp_avg'] = torch.zeros_like(p.data) - # Exponential moving average of squared gradient values - state['exp_avg_sq'] = torch.zeros_like(p.data) - - exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] - beta1, beta2 = group['betas'] - - state['step'] += 1 - - # Decay the first and second moment running average coefficient - # m_t - exp_avg.mul_(beta1).add_(1 - beta1, grad) - # v_t - exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) - - # Paper v3 does not use debiasing. - # bias_correction1 = 1 - beta1 ** state['step'] - # bias_correction2 = 1 - beta2 ** state['step'] - # Apply bias to lr to avoid broadcast. 
- step_size = group['lr'] # * math.sqrt(bias_correction2) / bias_correction1 - - weight_norm = p.data.pow(2).sum().sqrt().clamp(0, 10) - - adam_step = exp_avg / exp_avg_sq.sqrt().add(group['eps']) - if group['weight_decay'] != 0: - adam_step.add_(group['weight_decay'], p.data) - - adam_norm = adam_step.pow(2).sum().sqrt() - if weight_norm == 0 or adam_norm == 0: - trust_ratio = 1 - else: - trust_ratio = weight_norm / adam_norm - state['weight_norm'] = weight_norm - state['adam_norm'] = adam_norm - state['trust_ratio'] = trust_ratio - if self.adam: - trust_ratio = 1 - - p.data.add_(-step_size * trust_ratio, adam_step) - - return loss diff --git a/over9000/lookahead.py b/over9000/lookahead.py deleted file mode 100644 index b54c518..0000000 --- a/over9000/lookahead.py +++ /dev/null @@ -1,103 +0,0 @@ -from torch.optim import Adam -import torch -from torch.optim import Optimizer -from collections import defaultdict - -class Lookahead(Optimizer): - ''' - PyTorch implementation of the lookahead wrapper. - Lookahead Optimizer: https://arxiv.org/abs/1907.08610 - ''' - def __init__(self, optimizer,alpha=0.5, k=6,pullback_momentum="none"): - ''' - :param optimizer:inner optimizer - :param k (int): number of lookahead steps - :param alpha(float): linear interpolation factor. 1.0 recovers the inner optimizer. - :param pullback_momentum (str): change to inner optimizer momentum on interpolation update - ''' - if not 0.0 <= alpha <= 1.0: - raise ValueError(f'Invalid slow update rate: {alpha}') - if not 1 <= k: - raise ValueError(f'Invalid lookahead steps: {k}') - self.optimizer = optimizer - self.param_groups = self.optimizer.param_groups - self.alpha = alpha - self.k = k - self.step_counter = 0 - assert pullback_momentum in ["reset", "pullback", "none"] - self.pullback_momentum = pullback_momentum - self.state = defaultdict(dict) - - # Cache the current optimizer parameters - for group in self.optimizer.param_groups: - for p in group['params']: - param_state = self.state[p] - param_state['cached_params'] = torch.zeros_like(p.data) - param_state['cached_params'].copy_(p.data) - - def __getstate__(self): - return { - 'state': self.state, - 'optimizer': self.optimizer, - 'alpha': self.alpha, - 'step_counter': self.step_counter, - 'k':self.k, - 'pullback_momentum': self.pullback_momentum - } - - def zero_grad(self): - self.optimizer.zero_grad() - - def state_dict(self): - return self.optimizer.state_dict() - - def load_state_dict(self, state_dict): - self.optimizer.load_state_dict(state_dict) - - def _backup_and_load_cache(self): - """Useful for performing evaluation on the slow weights (which typically generalize better) - """ - for group in self.optimizer.param_groups: - for p in group['params']: - param_state = self.state[p] - param_state['backup_params'] = torch.zeros_like(p.data) - param_state['backup_params'].copy_(p.data) - p.data.copy_(param_state['cached_params']) - - def _clear_and_load_backup(self): - for group in self.optimizer.param_groups: - for p in group['params']: - param_state = self.state[p] - p.data.copy_(param_state['backup_params']) - del param_state['backup_params'] - - def step(self, closure=None): - """Performs a single Lookahead optimization step. - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. 
- """ - loss = self.optimizer.step(closure) - self.step_counter += 1 - - if self.step_counter >= self.k: - self.step_counter = 0 - # Lookahead and cache the current optimizer parameters - for group in self.optimizer.param_groups: - for p in group['params']: - param_state = self.state[p] - p.data.mul_(self.alpha).add_(1.0 - self.alpha, param_state['cached_params']) # crucial line - param_state['cached_params'].copy_(p.data) - if self.pullback_momentum == "pullback": - internal_momentum = self.optimizer.state[p]["momentum_buffer"] - self.optimizer.state[p]["momentum_buffer"] = internal_momentum.mul_(self.alpha).add_( - 1.0 - self.alpha, param_state["cached_mom"]) - param_state["cached_mom"] = self.optimizer.state[p]["momentum_buffer"] - elif self.pullback_momentum == "reset": - self.optimizer.state[p]["momentum_buffer"] = torch.zeros_like(p.data) - - return loss - -def LookaheadAdam(params, alpha=0.5, k=6, *args, **kwargs): - adam = Adam(params, *args, **kwargs) - return Lookahead(adam, alpha, k) \ No newline at end of file diff --git a/over9000/madam.py b/over9000/madam.py deleted file mode 100644 index 4212701..0000000 --- a/over9000/madam.py +++ /dev/null @@ -1,47 +0,0 @@ -# from here https://github.com/jxbz/madam/blob/master/pytorch/optim/madam.py -import torch -from torch.optim.optimizer import Optimizer, required - - -class Madam(Optimizer): - - def __init__(self, params, lr=0.01, p_scale=3.0, g_bound=10.0): - - self.p_scale = p_scale - self.g_bound = g_bound - defaults = dict(lr=lr) - super(Madam, self).__init__(params, defaults) - - def step(self, closure=None): - """Performs a single optimization step. - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - for p in group['params']: - if p.grad is None: - continue - - state = self.state[p] - if len(state) == 0: - state['max'] = self.p_scale*(p*p).mean().sqrt().item() - state['step'] = 0 - state['exp_avg_sq'] = torch.zeros_like(p) - - state['step'] += 1 - bias_correction = 1 - 0.999 ** state['step'] - state['exp_avg_sq'] = 0.999 * state['exp_avg_sq'] + 0.001 * p.grad.data**2 - - g_normed = p.grad.data / (state['exp_avg_sq']/bias_correction).sqrt() - g_normed[torch.isnan(g_normed)] = 0 - g_normed.clamp_(-self.g_bound, self.g_bound) - - p.data *= torch.exp( -group['lr']*g_normed*torch.sign(p.data) ) - p.data.clamp_(-state['max'], state['max']) - - return loss diff --git a/over9000/madgrad.py b/over9000/madgrad.py deleted file mode 100644 index bcbc90e..0000000 --- a/over9000/madgrad.py +++ /dev/null @@ -1,174 +0,0 @@ -# from https://github.com/raw/facebookresearch/madgrad/master/madgrad/madgrad.py -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import math -from typing import TYPE_CHECKING, Any, Callable, Optional - -import torch -import torch.optim - -if TYPE_CHECKING: - from torch.optim.optimizer import _params_t -else: - _params_t = Any - - -class MADGRAD(torch.optim.Optimizer): - """ - MADGRAD_: A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic - Optimization. - - .. _MADGRAD: https://arxiv.org/abs/2101.11075 - - MADGRAD is a general purpose optimizer that can be used in place of SGD or - Adam may converge faster and generalize better. Currently GPU-only. 
- Typically, the same learning rate schedule that is used for SGD or Adam may - be used. The overall learning rate is not comparable to either method and - should be determined by a hyper-parameter sweep. - - MADGRAD requires less weight decay than other methods, often as little as - zero. Momentum values used for SGD or Adam's beta1 should work here also. - - On sparse problems both weight_decay and momentum should be set to 0. - - Arguments: - params (iterable): - Iterable of parameters to optimize or dicts defining parameter groups. - lr (float): - Learning rate (default: 1e-2). - momentum (float): - Momentum value in the range [0,1) (default: 0.9). - weight_decay (float): - Weight decay, i.e. a L2 penalty (default: 0). - eps (float): - Term added to the denominator outside of the root operation to improve numerical stability. (default: 1e-6). - """ - - def __init__( - self, params: _params_t, lr: float = 1e-2, momentum: float = 0.9, weight_decay: float = 0, eps: float = 1e-6, - ): - if momentum < 0 or momentum >= 1: - raise ValueError(f"Momentum {momentum} must be in the range [0,1]") - if lr <= 0: - raise ValueError(f"Learning rate {lr} must be positive") - if weight_decay < 0: - raise ValueError(f"Weight decay {weight_decay} must be non-negative") - if eps < 0: - raise ValueError(f"Eps must be non-negative") - - defaults = dict(lr=lr, eps=eps, momentum=momentum, weight_decay=weight_decay, k=0) - super().__init__(params, defaults) - - for group in self.param_groups: - for p in group["params"]: - state = self.state[p] - - state["grad_sum_sq"] = torch.zeros_like(p.data).detach() - state["s"] = torch.zeros_like(p.data).detach() - if momentum != 0: - state["x0"] = torch.clone(p.data).detach() - - @property - def supports_memory_efficient_fp16(self) -> bool: - return False - - @property - def supports_flat_params(self) -> bool: - return True - - def step(self, closure: Optional[Callable[[], float]] = None) -> Optional[float]: - """Performs a single optimization step. - - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. 
- """ - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - eps = group["eps"] - k = group["k"] - lr = group["lr"] + eps - decay = group["weight_decay"] - momentum = group["momentum"] - - ck = 1 - momentum - lamb = lr * math.pow(k + 1, 0.5) - - for p in group["params"]: - if p.grad is None: - continue - grad = p.grad.data - state = self.state[p] - - if momentum != 0.0 and grad.is_sparse: - raise RuntimeError("momentum != 0 is not compatible with sparse gradients") - - grad_sum_sq = state["grad_sum_sq"] - s = state["s"] - - # Apply weight decay - if decay != 0: - if grad.is_sparse: - raise RuntimeError("weight_decay option is not compatible with sparse gradients") - - grad.add_(p.data, alpha=decay) - - if grad.is_sparse: - grad = grad.coalesce() - grad_val = grad._values() - - p_masked = p.sparse_mask(grad) - grad_sum_sq_masked = grad_sum_sq.sparse_mask(grad) - s_masked = s.sparse_mask(grad) - - # Compute x_0 from other known quantities - rms_masked_vals = grad_sum_sq_masked._values().pow(1 / 3).add_(eps) - x0_masked_vals = p_masked._values().addcdiv(s_masked._values(), rms_masked_vals, value=1) - - # Dense + sparse op - grad_sq = grad * grad - grad_sum_sq.add_(grad_sq, alpha=lamb) - grad_sum_sq_masked.add_(grad_sq, alpha=lamb) - - rms_masked_vals = grad_sum_sq_masked._values().pow_(1 / 3).add_(eps) - - s.add_(grad, alpha=lamb) - s_masked._values().add_(grad_val, alpha=lamb) - - # update masked copy of p - p_kp1_masked_vals = x0_masked_vals.addcdiv(s_masked._values(), rms_masked_vals, value=-1) - # Copy updated masked p to dense p using an add operation - p_masked._values().add_(p_kp1_masked_vals, alpha=-1) - p.data.add_(p_masked, alpha=-1) - else: - if momentum == 0: - # Compute x_0 from other known quantities - rms = grad_sum_sq.pow(1 / 3).add_(eps) - x0 = p.data.addcdiv(s, rms, value=1) - else: - x0 = state["x0"] - - # Accumulate second moments - grad_sum_sq.addcmul_(grad, grad, value=lamb) - rms = grad_sum_sq.pow(1 / 3).add_(eps) - - # Update s - s.data.add_(grad, alpha=lamb) - - # Step - if momentum == 0: - p.data.copy_(x0.addcdiv(s, rms, value=-1)) - else: - z = x0.addcdiv(s, rms, value=-1) - - # p is a moving average of z - p.data.mul_(1 - ck).add_(z, alpha=ck) - - group["k"] = group["k"] + 1 - return loss diff --git a/over9000/novograd.py b/over9000/novograd.py deleted file mode 100644 index 5eea065..0000000 --- a/over9000/novograd.py +++ /dev/null @@ -1,223 +0,0 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -from torch.optim import Optimizer -import math - -class AdamW(Optimizer): - """Implements AdamW algorithm. - - It has been proposed in `Adam: A Method for Stochastic Optimization`_. 
- - Arguments: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - lr (float, optional): learning rate (default: 1e-3) - betas (Tuple[float, float], optional): coefficients used for computing - running averages of gradient and its square (default: (0.9, 0.999)) - eps (float, optional): term added to the denominator to improve - numerical stability (default: 1e-8) - weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - amsgrad (boolean, optional): whether to use the AMSGrad variant of this - algorithm from the paper `On the Convergence of Adam and Beyond`_ - - Adam: A Method for Stochastic Optimization: - https://arxiv.org/abs/1412.6980 - On the Convergence of Adam and Beyond: - https://openreview.net/forum?id=ryQu7f-RZ - """ - - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, - weight_decay=0, amsgrad=False): - if not 0.0 <= lr: - raise ValueError("Invalid learning rate: {}".format(lr)) - if not 0.0 <= eps: - raise ValueError("Invalid epsilon value: {}".format(eps)) - if not 0.0 <= betas[0] < 1.0: - raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) - if not 0.0 <= betas[1] < 1.0: - raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) - defaults = dict(lr=lr, betas=betas, eps=eps, - weight_decay=weight_decay, amsgrad=amsgrad) - super(AdamW, self).__init__(params, defaults) - - def __setstate__(self, state): - super(AdamW, self).__setstate__(state) - for group in self.param_groups: - group.setdefault('amsgrad', False) - - def step(self, closure=None): - """Performs a single optimization step. - - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data - if grad.is_sparse: - raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') - amsgrad = group['amsgrad'] - - state = self.state[p] - - # State initialization - if len(state) == 0: - state['step'] = 0 - # Exponential moving average of gradient values - state['exp_avg'] = torch.zeros_like(p.data) - # Exponential moving average of squared gradient values - state['exp_avg_sq'] = torch.zeros_like(p.data) - if amsgrad: - # Maintains max of all exp. moving avg. of sq. grad. values - state['max_exp_avg_sq'] = torch.zeros_like(p.data) - - exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] - if amsgrad: - max_exp_avg_sq = state['max_exp_avg_sq'] - beta1, beta2 = group['betas'] - - state['step'] += 1 - # Decay the first and second moment running average coefficient - exp_avg.mul_(beta1).add_(1 - beta1, grad) - exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) - if amsgrad: - # Maintains the maximum of all 2nd moment running avg. till now - torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) - # Use the max. for normalizing running avg. of gradient - denom = max_exp_avg_sq.sqrt().add_(group['eps']) - else: - denom = exp_avg_sq.sqrt().add_(group['eps']) - - bias_correction1 = 1 - beta1 ** state['step'] - bias_correction2 = 1 - beta2 ** state['step'] - step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 - p.data.add_(-step_size, torch.mul(p.data, group['weight_decay']).addcdiv_(1, exp_avg, denom) ) - - return loss - -class Novograd(Optimizer): - """ - Implements Novograd algorithm. 
- - Args: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - lr (float, optional): learning rate (default: 1e-3) - betas (Tuple[float, float], optional): coefficients used for computing - running averages of gradient and its square (default: (0.95, 0)) - eps (float, optional): term added to the denominator to improve - numerical stability (default: 1e-8) - weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - grad_averaging: gradient averaging - amsgrad (boolean, optional): whether to use the AMSGrad variant of this - algorithm from the paper `On the Convergence of Adam and Beyond`_ - (default: False) - """ - - def __init__(self, params, lr=1e-3, betas=(0.95, 0), eps=1e-8, - weight_decay=0, grad_averaging=False, amsgrad=False): - if not 0.0 <= lr: - raise ValueError("Invalid learning rate: {}".format(lr)) - if not 0.0 <= eps: - raise ValueError("Invalid epsilon value: {}".format(eps)) - if not 0.0 <= betas[0] < 1.0: - raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) - if not 0.0 <= betas[1] < 1.0: - raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) - defaults = dict(lr=lr, betas=betas, eps=eps, - weight_decay=weight_decay, - grad_averaging=grad_averaging, - amsgrad=amsgrad) - - super(Novograd, self).__init__(params, defaults) - - def __setstate__(self, state): - super(Novograd, self).__setstate__(state) - for group in self.param_groups: - group.setdefault('amsgrad', False) - - def step(self, closure=None): - """Performs a single optimization step. - - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data - if grad.is_sparse: - raise RuntimeError('Sparse gradients are not supported.') - amsgrad = group['amsgrad'] - - state = self.state[p] - - # State initialization - if len(state) == 0: - state['step'] = 0 - # Exponential moving average of gradient values - state['exp_avg'] = torch.zeros_like(p.data) - # Exponential moving average of squared gradient values - state['exp_avg_sq'] = torch.zeros([]).to(state['exp_avg'].device) - if amsgrad: - # Maintains max of all exp. moving avg. of sq. grad. values - state['max_exp_avg_sq'] = torch.zeros([]).to(state['exp_avg'].device) - - exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] - if amsgrad: - max_exp_avg_sq = state['max_exp_avg_sq'] - beta1, beta2 = group['betas'] - - state['step'] += 1 - - norm = torch.sum(torch.pow(grad, 2)) - - if exp_avg_sq == 0: - exp_avg_sq.copy_(norm) - else: - exp_avg_sq.mul_(beta2).add_(1 - beta2, norm) - - if amsgrad: - # Maintains the maximum of all 2nd moment running avg. till now - torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) - # Use the max. for normalizing running avg. 
of gradient - denom = max_exp_avg_sq.sqrt().add_(group['eps']) - else: - denom = exp_avg_sq.sqrt().add_(group['eps']) - - grad.div_(denom) - if group['weight_decay'] != 0: - grad.add_(group['weight_decay'], p.data) - if group['grad_averaging']: - grad.mul_(1 - beta1) - exp_avg.mul_(beta1).add_(grad) - - p.data.add_(-group['lr'], exp_avg) - - return loss \ No newline at end of file diff --git a/over9000/radam.py b/over9000/radam.py deleted file mode 100644 index 74d7e0c..0000000 --- a/over9000/radam.py +++ /dev/null @@ -1,209 +0,0 @@ -# from https://github.com/LiyuanLucasLiu/RAdam/blob/master/radam.py - -import math -import torch -from torch.optim.optimizer import Optimizer, required - -class RAdam(Optimizer): - - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): - defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) - self.buffer = [[None, None, None] for ind in range(10)] - super(RAdam, self).__init__(params, defaults) - - def __setstate__(self, state): - super(RAdam, self).__setstate__(state) - - def step(self, closure=None): - - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data.float() - if grad.is_sparse: - raise RuntimeError('RAdam does not support sparse gradients') - - p_data_fp32 = p.data.float() - - state = self.state[p] - - if len(state) == 0: - state['step'] = 0 - state['exp_avg'] = torch.zeros_like(p_data_fp32) - state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) - else: - state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) - state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) - - exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] - beta1, beta2 = group['betas'] - - exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) - exp_avg.mul_(beta1).add_(1 - beta1, grad) - - state['step'] += 1 - buffered = self.buffer[int(state['step'] % 10)] - if state['step'] == buffered[0]: - N_sma, step_size = buffered[1], buffered[2] - else: - buffered[0] = state['step'] - beta2_t = beta2 ** state['step'] - N_sma_max = 2 / (1 - beta2) - 1 - N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) - buffered[1] = N_sma - - # more conservative since it's an approximated value - if N_sma >= 5: - step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) - else: - step_size = 1.0 / (1 - beta1 ** state['step']) - buffered[2] = step_size - - if group['weight_decay'] != 0: - p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) - - # more conservative since it's an approximated value - if N_sma >= 5: - denom = exp_avg_sq.sqrt().add_(group['eps']) - p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg, denom) - else: - p_data_fp32.add_(-step_size * group['lr'], exp_avg) - - p.data.copy_(p_data_fp32) - - return loss - -class PlainRAdam(Optimizer): - - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): - defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) - - super(PlainRAdam, self).__init__(params, defaults) - - def __setstate__(self, state): - super(PlainRAdam, self).__setstate__(state) - - def step(self, closure=None): - - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data.float() - if grad.is_sparse: - raise 
RuntimeError('RAdam does not support sparse gradients') - - p_data_fp32 = p.data.float() - - state = self.state[p] - - if len(state) == 0: - state['step'] = 0 - state['exp_avg'] = torch.zeros_like(p_data_fp32) - state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) - else: - state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) - state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) - - exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] - beta1, beta2 = group['betas'] - - exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) - exp_avg.mul_(beta1).add_(1 - beta1, grad) - - state['step'] += 1 - beta2_t = beta2 ** state['step'] - N_sma_max = 2 / (1 - beta2) - 1 - N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) - - if group['weight_decay'] != 0: - p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) - - # more conservative since it's an approximated value - if N_sma >= 5: - step_size = group['lr'] * math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) - denom = exp_avg_sq.sqrt().add_(group['eps']) - p_data_fp32.addcdiv_(-step_size, exp_avg, denom) - else: - step_size = group['lr'] / (1 - beta1 ** state['step']) - p_data_fp32.add_(-step_size, exp_avg) - - p.data.copy_(p_data_fp32) - - return loss - - -class AdamW(Optimizer): - - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, warmup = 0): - defaults = dict(lr=lr, betas=betas, eps=eps, - weight_decay=weight_decay, warmup = warmup) - super(AdamW, self).__init__(params, defaults) - - def __setstate__(self, state): - super(AdamW, self).__setstate__(state) - - def step(self, closure=None): - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data.float() - if grad.is_sparse: - raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') - - p_data_fp32 = p.data.float() - - state = self.state[p] - - if len(state) == 0: - state['step'] = 0 - state['exp_avg'] = torch.zeros_like(p_data_fp32) - state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) - else: - state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) - state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) - - exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] - beta1, beta2 = group['betas'] - - state['step'] += 1 - - exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) - exp_avg.mul_(beta1).add_(1 - beta1, grad) - - denom = exp_avg_sq.sqrt().add_(group['eps']) - bias_correction1 = 1 - beta1 ** state['step'] - bias_correction2 = 1 - beta2 ** state['step'] - - if group['warmup'] > state['step']: - scheduled_lr = 1e-8 + state['step'] * group['lr'] / group['warmup'] - else: - scheduled_lr = group['lr'] - - step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 - - if group['weight_decay'] != 0: - p_data_fp32.add_(-group['weight_decay'] * scheduled_lr, p_data_fp32) - - p_data_fp32.addcdiv_(-step_size, exp_avg, denom) - - p.data.copy_(p_data_fp32) - - return loss diff --git a/over9000/ralamb.py b/over9000/ralamb.py deleted file mode 100644 index f7036c8..0000000 --- a/over9000/ralamb.py +++ /dev/null @@ -1,99 +0,0 @@ -import torch, math -from torch.optim.optimizer import Optimizer - -# RAdam + LARS -class Ralamb(Optimizer): - - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): - defaults = dict(lr=lr, 
betas=betas, eps=eps, weight_decay=weight_decay) - self.buffer = [[None, None, None] for ind in range(10)] - super(Ralamb, self).__init__(params, defaults) - - def __setstate__(self, state): - super(Ralamb, self).__setstate__(state) - - def step(self, closure=None): - - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data.float() - if grad.is_sparse: - raise RuntimeError('Ralamb does not support sparse gradients') - - p_data_fp32 = p.data.float() - - state = self.state[p] - - if len(state) == 0: - state['step'] = 0 - state['exp_avg'] = torch.zeros_like(p_data_fp32) - state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) - else: - state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) - state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) - - exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] - beta1, beta2 = group['betas'] - - # Decay the first and second moment running average coefficient - # m_t - exp_avg.mul_(beta1).add_(1 - beta1, grad) - # v_t - exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) - - state['step'] += 1 - buffered = self.buffer[int(state['step'] % 10)] - - if state['step'] == buffered[0]: - N_sma, radam_step_size = buffered[1], buffered[2] - else: - buffered[0] = state['step'] - beta2_t = beta2 ** state['step'] - N_sma_max = 2 / (1 - beta2) - 1 - N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) - buffered[1] = N_sma - - # more conservative since it's an approximated value - if N_sma >= 5: - radam_step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) - else: - radam_step_size = 1.0 / (1 - beta1 ** state['step']) - buffered[2] = radam_step_size - - if group['weight_decay'] != 0: - p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) - - # more conservative since it's an approximated value - radam_step = p_data_fp32.clone() - if N_sma >= 5: - denom = exp_avg_sq.sqrt().add_(group['eps']) - radam_step.addcdiv_(-radam_step_size * group['lr'], exp_avg, denom) - else: - radam_step.add_(-radam_step_size * group['lr'], exp_avg) - - radam_norm = radam_step.pow(2).sum().sqrt() - weight_norm = p.data.pow(2).sum().sqrt().clamp(0, 10) - if weight_norm == 0 or radam_norm == 0: - trust_ratio = 1 - else: - trust_ratio = weight_norm / radam_norm - - state['weight_norm'] = weight_norm - state['adam_norm'] = radam_norm - state['trust_ratio'] = trust_ratio - - if N_sma >= 5: - p_data_fp32.addcdiv_(-radam_step_size * group['lr'] * trust_ratio, exp_avg, denom) - else: - p_data_fp32.add_(-radam_step_size * group['lr'] * trust_ratio, exp_avg) - - p.data.copy_(p_data_fp32) - - return loss diff --git a/over9000/ranger.py b/over9000/ranger.py deleted file mode 100644 index b6c8218..0000000 --- a/over9000/ranger.py +++ /dev/null @@ -1,12 +0,0 @@ - -import math -import torch -from torch.optim.optimizer import Optimizer, required -import itertools as it -from .lookahead import * -from .radam import * - -def Ranger(params, alpha=0.5, k=6, *args, **kwargs): - radam = RAdam(params, *args, **kwargs) - return Lookahead(radam, alpha, k) - diff --git a/over9000/rangerlars.py b/over9000/rangerlars.py deleted file mode 100644 index 3fcb39e..0000000 --- a/over9000/rangerlars.py +++ /dev/null @@ -1,14 +0,0 @@ -import torch, math -from torch.optim.optimizer import Optimizer -import itertools as it -from .lookahead import * -from .ralamb import 
* - -# RAdam + LARS + LookAHead - -# Lookahead implementation from https://github.com/lonePatient/lookahead_pytorch/blob/master/optimizer.py -# RAdam + LARS implementation from https://gist.github.com/redknightlois/c4023d393eb8f92bb44b2ab582d7ec20 - -def RangerLars(params, alpha=0.5, k=6, *args, **kwargs): - ralamb = Ralamb(params, *args, **kwargs) - return Lookahead(ralamb, alpha, k) From 755757680754c3c04180f12cb8b506ad9df3014d Mon Sep 17 00:00:00 2001 From: MohammadHasanZahweh <108551680+MohammadHasanZahweh@users.noreply.github.com> Date: Mon, 25 Dec 2023 13:44:27 +0200 Subject: [PATCH 03/20] Delete optimizers/over9000 directory --- optimizers/over9000/__init__.py | 14 -- optimizers/over9000/adabelief.py | 222 ----------------------------- optimizers/over9000/adamod.py | 98 ------------- optimizers/over9000/adan.py | 156 --------------------- optimizers/over9000/apollo.py | 115 --------------- optimizers/over9000/diffgrad.py | 127 ----------------- optimizers/over9000/lamb.py | 129 ----------------- optimizers/over9000/lookahead.py | 97 ------------- optimizers/over9000/losses.py | 226 ------------------------------ optimizers/over9000/madam.py | 47 ------- optimizers/over9000/madgrad.py | 174 ----------------------- optimizers/over9000/meters.py | 47 ------- optimizers/over9000/metrics.py | 92 ------------ optimizers/over9000/mixup.py | 59 -------- optimizers/over9000/novograd.py | 223 ----------------------------- optimizers/over9000/optimizers.py | 98 ------------- optimizers/over9000/radam.py | 209 --------------------------- optimizers/over9000/ralamb.py | 99 ------------- optimizers/over9000/ranger.py | 12 -- optimizers/over9000/rangerlars.py | 14 -- optimizers/over9000/schedulers.py | 32 ----- 21 files changed, 2290 deletions(-) delete mode 100644 optimizers/over9000/__init__.py delete mode 100644 optimizers/over9000/adabelief.py delete mode 100644 optimizers/over9000/adamod.py delete mode 100644 optimizers/over9000/adan.py delete mode 100644 optimizers/over9000/apollo.py delete mode 100644 optimizers/over9000/diffgrad.py delete mode 100644 optimizers/over9000/lamb.py delete mode 100644 optimizers/over9000/lookahead.py delete mode 100644 optimizers/over9000/losses.py delete mode 100644 optimizers/over9000/madam.py delete mode 100644 optimizers/over9000/madgrad.py delete mode 100644 optimizers/over9000/meters.py delete mode 100644 optimizers/over9000/metrics.py delete mode 100644 optimizers/over9000/mixup.py delete mode 100644 optimizers/over9000/novograd.py delete mode 100644 optimizers/over9000/optimizers.py delete mode 100644 optimizers/over9000/radam.py delete mode 100644 optimizers/over9000/ralamb.py delete mode 100644 optimizers/over9000/ranger.py delete mode 100644 optimizers/over9000/rangerlars.py delete mode 100644 optimizers/over9000/schedulers.py diff --git a/optimizers/over9000/__init__.py b/optimizers/over9000/__init__.py deleted file mode 100644 index 7af485f..0000000 --- a/optimizers/over9000/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -from .radam import PlainRAdam,RAdam -from .novograd import Novograd -from .ranger import Ranger -from .ralamb import Ralamb -from .rangerlars import RangerLars -from .lookahead import Lookahead,LookaheadAdam -#from .lamb import Lamb -from .diffgrad import DiffGrad -from .adamod import AdaMod -from .madam import Madam -from .apollo import Apollo -from .adabelief import AdaBelief -from .madgrad import MADGRAD -from .adan import Adan \ No newline at end of file diff --git a/optimizers/over9000/adabelief.py 
b/optimizers/over9000/adabelief.py deleted file mode 100644 index 2f817d6..0000000 --- a/optimizers/over9000/adabelief.py +++ /dev/null @@ -1,222 +0,0 @@ -# from https://github.com/raw/juntang-zhuang/Adabelief-Optimizer/master/PyTorch_Experiments/AdaBelief.py -import math -import torch -from torch.optim.optimizer import Optimizer -from tabulate import tabulate -from colorama import Fore, Back, Style - -version_higher = ( torch.__version__ >= "1.5.0" ) - -class AdaBelief(Optimizer): - r"""Implements AdaBelief algorithm. Modified from Adam in PyTorch - Arguments: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - lr (float, optional): learning rate (default: 1e-3) - betas (Tuple[float, float], optional): coefficients used for computing - running averages of gradient and its square (default: (0.9, 0.999)) - eps (float, optional): term added to the denominator to improve - numerical stability (default: 1e-16) - weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - amsgrad (boolean, optional): whether to use the AMSGrad variant of this - algorithm from the paper `On the Convergence of Adam and Beyond`_ - (default: False) - weight_decouple (boolean, optional): ( default: True) If set as True, then - the optimizer uses decoupled weight decay as in AdamW - fixed_decay (boolean, optional): (default: False) This is used when weight_decouple - is set as True. - When fixed_decay == True, the weight decay is performed as - $W_{new} = W_{old} - W_{old} \times decay$. - When fixed_decay == False, the weight decay is performed as - $W_{new} = W_{old} - W_{old} \times decay \times lr$. Note that in this case, the - weight decay ratio decreases with learning rate (lr). - rectify (boolean, optional): (default: True) If set as True, then perform the rectified - update similar to RAdam - degenerated_to_sgd (boolean, optional) (default:True) If set as True, then perform SGD update - when variance of gradient is high - reference: AdaBelief Optimizer, adapting stepsizes by the belief in observed gradients, NeurIPS 2020 - """ - - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16, - weight_decay=0, amsgrad=False, weight_decouple=True, fixed_decay=False, rectify=True, - degenerated_to_sgd=True): - - # ------------------------------------------------------------------------------ - # Print modifications to default arguments - print(Fore.RED + 'Please check your arguments if you have upgraded adabelief-pytorch from version 0.0.5.') - print(Fore.RED + 'Modifications to default arguments:') - default_table = tabulate([ - ['adabelief-pytorch=0.0.5','1e-8','False','False'], - ['Current version (0.1.0)','1e-16','True','True']], - headers=['eps','weight_decouple','rectify']) - print(Fore.RED + default_table) - - print(Fore.RED +'For a complete table of recommended hyperparameters, see') - print(Fore.RED + 'https://github.com/juntang-zhuang/Adabelief-Optimizer') - - print(Style.RESET_ALL) - # ------------------------------------------------------------------------------ - - if not 0.0 <= lr: - raise ValueError("Invalid learning rate: {}".format(lr)) - if not 0.0 <= eps: - raise ValueError("Invalid epsilon value: {}".format(eps)) - if not 0.0 <= betas[0] < 1.0: - raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) - if not 0.0 <= betas[1] < 1.0: - raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) - - self.degenerated_to_sgd = degenerated_to_sgd - if isinstance(params, (list, tuple)) and 
len(params) > 0 and isinstance(params[0], dict): - for param in params: - if 'betas' in param and (param['betas'][0] != betas[0] or param['betas'][1] != betas[1]): - param['buffer'] = [[None, None, None] for _ in range(10)] - - defaults = dict(lr=lr, betas=betas, eps=eps, - weight_decay=weight_decay, amsgrad=amsgrad, buffer=[[None, None, None] for _ in range(10)]) - super(AdaBelief, self).__init__(params, defaults) - - self.degenerated_to_sgd = degenerated_to_sgd - self.weight_decouple = weight_decouple - self.rectify = rectify - self.fixed_decay = fixed_decay - if self.weight_decouple: - print('Weight decoupling enabled in AdaBelief') - if self.fixed_decay: - print('Weight decay fixed') - if self.rectify: - print('Rectification enabled in AdaBelief') - if amsgrad: - print('AMSGrad enabled in AdaBelief') - - def __setstate__(self, state): - super(AdaBelief, self).__setstate__(state) - for group in self.param_groups: - group.setdefault('amsgrad', False) - - def reset(self): - for group in self.param_groups: - for p in group['params']: - state = self.state[p] - amsgrad = group['amsgrad'] - - # State initialization - state['step'] = 0 - # Exponential moving average of gradient values - state['exp_avg'] = torch.zeros_like(p.data,memory_format=torch.preserve_format) \ - if version_higher else torch.zeros_like(p.data) - - # Exponential moving average of squared gradient values - state['exp_avg_var'] = torch.zeros_like(p.data,memory_format=torch.preserve_format) \ - if version_higher else torch.zeros_like(p.data) - - if amsgrad: - # Maintains max of all exp. moving avg. of sq. grad. values - state['max_exp_avg_var'] = torch.zeros_like(p.data,memory_format=torch.preserve_format) \ - if version_higher else torch.zeros_like(p.data) - - def step(self, closure=None): - """Performs a single optimization step. - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data - if grad.is_sparse: - raise RuntimeError( - 'AdaBelief does not support sparse gradients, please consider SparseAdam instead') - amsgrad = group['amsgrad'] - - state = self.state[p] - - beta1, beta2 = group['betas'] - - # State initialization - if len(state) == 0: - state['step'] = 0 - # Exponential moving average of gradient values - state['exp_avg'] = torch.zeros_like(p.data,memory_format=torch.preserve_format) \ - if version_higher else torch.zeros_like(p.data) - # Exponential moving average of squared gradient values - state['exp_avg_var'] = torch.zeros_like(p.data,memory_format=torch.preserve_format) \ - if version_higher else torch.zeros_like(p.data) - if amsgrad: - # Maintains max of all exp. moving avg. of sq. grad. values - state['max_exp_avg_var'] = torch.zeros_like(p.data,memory_format=torch.preserve_format) \ - if version_higher else torch.zeros_like(p.data) - - # get current state variable - exp_avg, exp_avg_var = state['exp_avg'], state['exp_avg_var'] - - state['step'] += 1 - bias_correction1 = 1 - beta1 ** state['step'] - bias_correction2 = 1 - beta2 ** state['step'] - - # Update first and second moment running average - exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) - grad_residual = grad - exp_avg - exp_avg_var.mul_(beta2).addcmul_( grad_residual, grad_residual, value=1 - beta2) - - if amsgrad: - max_exp_avg_var = state['max_exp_avg_var'] - # Maintains the maximum of all 2nd moment running avg. 
till now - torch.max(max_exp_avg_var, exp_avg_var, out=max_exp_avg_var) - - # Use the max. for normalizing running avg. of gradient - denom = (max_exp_avg_var.add_(group['eps']).sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) - else: - denom = (exp_avg_var.add_(group['eps']).sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) - - # perform weight decay, check if decoupled weight decay - if self.weight_decouple: - if not self.fixed_decay: - p.data.mul_(1.0 - group['lr'] * group['weight_decay']) - else: - p.data.mul_(1.0 - group['weight_decay']) - else: - if group['weight_decay'] != 0: - grad.add_(p.data, alpha=group['weight_decay']) - - # update - if not self.rectify: - # Default update - step_size = group['lr'] / bias_correction1 - p.data.addcdiv_( exp_avg, denom, value=-step_size) - - else: # Rectified update, forked from RAdam - buffered = group['buffer'][int(state['step'] % 10)] - if state['step'] == buffered[0]: - N_sma, step_size = buffered[1], buffered[2] - else: - buffered[0] = state['step'] - beta2_t = beta2 ** state['step'] - N_sma_max = 2 / (1 - beta2) - 1 - N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) - buffered[1] = N_sma - - # more conservative since it's an approximated value - if N_sma >= 5: - step_size = math.sqrt( - (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / ( - N_sma_max - 2)) / (1 - beta1 ** state['step']) - elif self.degenerated_to_sgd: - step_size = 1.0 / (1 - beta1 ** state['step']) - else: - step_size = -1 - buffered[2] = step_size - - if N_sma >= 5: - denom = exp_avg_var.sqrt().add_(group['eps']) - p.data.addcdiv_(exp_avg, denom, value=-step_size * group['lr']) - elif step_size > 0: - p.data.add_( exp_avg, alpha=-step_size * group['lr']) - - return loss \ No newline at end of file diff --git a/optimizers/over9000/adamod.py b/optimizers/over9000/adamod.py deleted file mode 100644 index b345560..0000000 --- a/optimizers/over9000/adamod.py +++ /dev/null @@ -1,98 +0,0 @@ -# original repo https://github.com/lancopku/AdaMod/blob/master/adamod/adamod.py -import math -import torch -from torch.optim import Optimizer - -class AdaMod(Optimizer): - """Implements AdaMod algorithm with Decoupled Weight Decay (arxiv.org/abs/1711.05101) - It has been proposed in `Adaptive and Momental Bounds for Adaptive Learning Rate Methods`_. 
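The AdaMod step in this deleted file applies a "momental bound": the element-wise Adam step size is smoothed with an exponential moving average of previous step sizes and then clipped by it. A minimal, self-contained sketch of just that clamp, under the assumption that the step sizes are already formed as a tensor; the variable names are illustrative, not taken from the file:

import torch

def momental_bound(step_size, exp_avg_lr, beta3=0.999):
    # Smooth the element-wise step sizes with an EMA, then take the minimum, so no
    # single update can exceed the (slowly moving) average of past step sizes.
    exp_avg_lr.mul_(beta3).add_(step_size, alpha=1 - beta3)
    return torch.min(step_size, exp_avg_lr)

lr_now = torch.tensor([0.10, 0.001])   # a spiky step and a small step
lr_ema = torch.tensor([0.01, 0.01])    # running average of earlier steps
print(momental_bound(lr_now, lr_ema))  # approximately tensor([0.0101, 0.0010])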
- Arguments: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - lr (float, optional): learning rate (default: 1e-3) - betas (Tuple[float, float], optional): coefficients used for computing - running averages of gradient and its square (default: (0.9, 0.999)) - beta3 (float, optional): smoothing coefficient for adaptive learning rates (default: 0.9999) - eps (float, optional): term added to the denominator to improve - numerical stability (default: 1e-8) - weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - """ - - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), beta3=0.999, - eps=1e-8, weight_decay=0): - if not 0.0 <= lr: - raise ValueError("Invalid learning rate: {}".format(lr)) - if not 0.0 <= eps: - raise ValueError("Invalid epsilon value: {}".format(eps)) - if not 0.0 <= betas[0] < 1.0: - raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) - if not 0.0 <= betas[1] < 1.0: - raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) - if not 0.0 <= beta3 < 1.0: - raise ValueError("Invalid beta3 parameter: {}".format(beta3)) - defaults = dict(lr=lr, betas=betas, beta3=beta3, eps=eps, - weight_decay=weight_decay) - super(AdaMod, self).__init__(params, defaults) - - def __setstate__(self, state): - super(AdaMod, self).__setstate__(state) - - def step(self, closure=None): - """Performs a single optimization step. - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data - if grad.is_sparse: - raise RuntimeError( - 'AdaMod does not support sparse gradients') - - state = self.state[p] - - # State initialization - if len(state) == 0: - state['step'] = 0 - # Exponential moving average of gradient values - state['exp_avg'] = torch.zeros_like(p.data) - # Exponential moving average of squared gradient values - state['exp_avg_sq'] = torch.zeros_like(p.data) - # Exponential moving average of actual learning rates - state['exp_avg_lr'] = torch.zeros_like(p.data) - - exp_avg, exp_avg_sq, exp_avg_lr = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_lr'] - beta1, beta2 = group['betas'] - - state['step'] += 1 - - # Decay the first and second moment running average coefficient - exp_avg.mul_(beta1).add_(1 - beta1, grad) - exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) - - denom = exp_avg_sq.sqrt().add_(group['eps']) - - bias_correction1 = 1 - beta1 ** state['step'] - bias_correction2 = 1 - beta2 ** state['step'] - step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 - - if group['weight_decay'] != 0: - p.data.add_(-group['weight_decay'] * group['lr'], p.data) - - # Applies momental bounds on actual learning rates - step_size = torch.full_like(denom, step_size) - step_size.div_(denom) - exp_avg_lr.mul_(group['beta3']).add_(1 - group['beta3'], step_size) - step_size = torch.min(step_size, exp_avg_lr) - step_size.mul_(exp_avg) - - p.data.add_(-step_size) - - return loss diff --git a/optimizers/over9000/adan.py b/optimizers/over9000/adan.py deleted file mode 100644 index 8206a92..0000000 --- a/optimizers/over9000/adan.py +++ /dev/null @@ -1,156 +0,0 @@ -# https://github.com/raw/sail-sg/Adan/main/adan.py - -# Copyright 2022 Garena Online Private Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not 
use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import math -import torch -from torch.optim.optimizer import Optimizer -from timm.utils import * - - -class Adan(Optimizer): - """ - Implements a pytorch variant of Adan - - Adan was proposed in - Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models[J]. arXiv preprint arXiv:2208.06677, 2022. - https://arxiv.org/abs/2208.06677 - Arguments: - params (iterable): iterable of parameters to optimize or dicts defining parameter groups. - lr (float, optional): learning rate. (default: 1e-3) - betas (Tuple[float, float, flot], optional): coefficients used for computing - running averages of gradient and its norm. (default: (0.98, 0.92, 0.99)) - eps (float, optional): term added to the denominator to improve - numerical stability. (default: 1e-8) - weight_decay (float, optional): decoupled weight decay (L2 penalty) (default: 0) - max_grad_norm (float, optional): value used to clip - global grad norm (default: 0.0 no clip) - no_prox (bool): how to perform the decoupled weight decay (default: False) - """ - - def __init__(self, params, lr=1e-3, betas=(0.98, 0.92, 0.99), eps=1e-8, - weight_decay=0.0, max_grad_norm=0.0, no_prox=False): - if not 0.0 <= max_grad_norm: - raise ValueError("Invalid Max grad norm: {}".format(max_grad_norm)) - if not 0.0 <= lr: - raise ValueError("Invalid learning rate: {}".format(lr)) - if not 0.0 <= eps: - raise ValueError("Invalid epsilon value: {}".format(eps)) - if not 0.0 <= betas[0] < 1.0: - raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) - if not 0.0 <= betas[1] < 1.0: - raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) - if not 0.0 <= betas[2] < 1.0: - raise ValueError("Invalid beta parameter at index 2: {}".format(betas[2])) - defaults = dict(lr=lr, betas=betas, eps=eps, - weight_decay=weight_decay, - max_grad_norm=max_grad_norm, no_prox=no_prox) - super(Adan, self).__init__(params, defaults) - - def __setstate__(self, state): - super(Adan, self).__setstate__(state) - for group in self.param_groups: - group.setdefault('no_prox', False) - - @torch.no_grad() - def restart_opt(self): - for group in self.param_groups: - group['step'] = 0 - for p in group['params']: - if p.requires_grad: - state = self.state[p] - # State initialization - - # Exponential moving average of gradient values - state['exp_avg'] = torch.zeros_like(p) - # Exponential moving average of squared gradient values - state['exp_avg_sq'] = torch.zeros_like(p) - # Exponential moving average of gradient difference - state['exp_avg_diff'] = torch.zeros_like(p) - - @torch.no_grad() - def step(self): - """ - Performs a single optimization step. 
- """ - if self.defaults['max_grad_norm'] > 0: - device = self.param_groups[0]['params'][0].device - global_grad_norm = torch.zeros(1, device=device) - - max_grad_norm = torch.tensor(self.defaults['max_grad_norm'], device=device) - for group in self.param_groups: - - for p in group['params']: - if p.grad is not None: - grad = p.grad - global_grad_norm.add_(grad.pow(2).sum()) - - global_grad_norm = torch.sqrt(global_grad_norm) - - clip_global_grad_norm = torch.clamp(max_grad_norm / (global_grad_norm + group['eps']), max=1.0) - else: - clip_global_grad_norm = 1.0 - - for group in self.param_groups: - beta1, beta2, beta3 = group['betas'] - # assume same step across group now to simplify things - # per parameter step can be easily support by making it tensor, or pass list into kernel - if 'step' in group: - group['step'] += 1 - else: - group['step'] = 1 - - bias_correction1 = 1.0 - beta1 ** group['step'] - - bias_correction2 = 1.0 - beta2 ** group['step'] - - bias_correction3 = 1.0 - beta3 ** group['step'] - - for p in group['params']: - if p.grad is None: - continue - - state = self.state[p] - if len(state) == 0: - state['exp_avg'] = torch.zeros_like(p) - state['exp_avg_sq'] = torch.zeros_like(p) - state['exp_avg_diff'] = torch.zeros_like(p) - - grad = p.grad.mul_(clip_global_grad_norm) - if 'pre_grad' not in state or group['step'] == 1: - state['pre_grad'] = grad - - copy_grad = grad.clone() - - exp_avg, exp_avg_sq, exp_avg_diff = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_diff'] - diff = grad - state['pre_grad'] - - update = grad + beta2 * diff - exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) # m_t - exp_avg_diff.mul_(beta2).add_(diff, alpha=1 - beta2) # diff_t - exp_avg_sq.mul_(beta3).addcmul_(update, update, value=1 - beta3) # n_t - - denom = ((exp_avg_sq).sqrt() / math.sqrt(bias_correction3)).add_(group['eps']) - update = ((exp_avg / bias_correction1 + beta2 * exp_avg_diff / bias_correction2)).div_(denom) - - if group['no_prox']: - p.data.mul_(1 - group['lr'] * group['weight_decay']) - p.add_(update, alpha=-group['lr']) - else: - p.add_(update, alpha=-group['lr']) - p.data.div_(1 + group['lr'] * group['weight_decay']) - - state['pre_grad'] = copy_grad diff --git a/optimizers/over9000/apollo.py b/optimizers/over9000/apollo.py deleted file mode 100644 index 2dd8656..0000000 --- a/optimizers/over9000/apollo.py +++ /dev/null @@ -1,115 +0,0 @@ -# from https://github.com/raw/XuezheMax/apollo/master/optim/apollo.py -import torch -from torch.optim.optimizer import Optimizer - - -class Apollo(Optimizer): - r"""Implements Atom algorithm. 
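As a reading aid for the Adan step removed above, here is a compact, hypothetical restatement of its update for a single dense tensor; gradient clipping and the no_prox branch are omitted, and the names are illustrative rather than taken from the file:

import torch

def adan_step(p, grad, prev_grad, exp_avg, exp_avg_diff, exp_avg_sq, step,
              lr=1e-3, betas=(0.98, 0.92, 0.99), eps=1e-8, weight_decay=0.0):
    beta1, beta2, beta3 = betas
    diff = grad - prev_grad
    exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)                   # m_t: EMA of gradients
    exp_avg_diff.mul_(beta2).add_(diff, alpha=1 - beta2)              # d_t: EMA of gradient differences
    update = grad + beta2 * diff
    exp_avg_sq.mul_(beta3).addcmul_(update, update, value=1 - beta3)  # n_t: EMA of squared combined gradient
    bc1, bc2, bc3 = 1 - beta1 ** step, 1 - beta2 ** step, 1 - beta3 ** step
    denom = (exp_avg_sq / bc3).sqrt().add_(eps)
    p.add_((exp_avg / bc1 + beta2 * exp_avg_diff / bc2) / denom, alpha=-lr)
    p.div_(1 + lr * weight_decay)                                     # proximal (decoupled) weight decay
    return p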
- - Arguments: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - lr (float): learning rate - beta (float, optional): coefficient used for computing - running averages of gradient (default: 0.9) - eps (float, optional): term added to the denominator to improve - numerical stability (default: 1e-4) - warmup (int, optional): number of warmup steps (default: 0) - init_lr (float, optional): initial learning rate for warmup (default: 0.01) - weight_decay (float, optional): weight decay coefficient (default: 0) - - """ - - def __init__(self, params, lr, beta=0.9, eps=1e-4, warmup=100, init_lr=0.01, weight_decay=0): - if not 0.0 < lr: - raise ValueError("Invalid learning rate value: {}".format(lr)) - if not 0.0 <= eps: - raise ValueError("Invalid epsilon value: {}".format(eps)) - if not 0.0 <= beta < 1.0: - raise ValueError("Invalid beta parameter at index 0: {}".format(beta)) - if not 0.0 <= weight_decay: - raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) - if not 0.0 <= warmup: - raise ValueError("Invalid warmup updates: {}".format(warmup)) - if not 0.0 <= init_lr <= 1.0: - raise ValueError("Invalid initial learning rate: {}".format(init_lr)) - - defaults = dict(lr=lr, beta=beta, eps=eps, warmup=warmup, - init_lr=init_lr, base_lr=lr, weight_decay=weight_decay) - super(Apollo, self).__init__(params, defaults) - - def __setstate__(self, state): - super(Apollo, self).__setstate__(state) - - @torch.no_grad() - def step(self, closure=None): - """Performs a single optimization step. - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - with torch.enable_grad(): - loss = closure() - - for group in self.param_groups: - for p in group['params']: - if p.grad is None: - continue - - state = self.state[p] - - # State initialization - if len(state) == 0: - state['step'] = 0 - # Exponential moving average of gradient values - state['exp_avg_grad'] = torch.zeros_like(p, memory_format=torch.preserve_format) - # Exponential moving average of squared gradient values - state['approx_hessian'] = torch.zeros_like(p, memory_format=torch.preserve_format) - # Previous update direction - state['update'] = torch.zeros_like(p, memory_format=torch.preserve_format) - - # Calculate current lr - if state['step'] < group['warmup']: - curr_lr = (group['base_lr'] - group['init_lr']) * state['step'] / group['warmup'] + group['init_lr'] - else: - curr_lr = group['lr'] - - # Perform optimization step - grad = p.grad - if grad.is_sparse: - raise RuntimeError('Atom does not support sparse gradients.') - - # Perform step weight decay - if group['weight_decay'] != 0: - grad = grad.add(p, alpha=group['weight_decay']) - - beta = group['beta'] - exp_avg_grad = state['exp_avg_grad'] - B = state['approx_hessian'] - d_p = state['update'] - - state['step'] += 1 - bias_correction = 1 - beta ** state['step'] - alpha = (1 - beta) / bias_correction - - # Update the running average grad - delta_grad = grad - exp_avg_grad - exp_avg_grad.add_(delta_grad, alpha=alpha) - - denom = d_p.norm(p=4).add(group['eps']) - d_p.div_(denom) - v_sq = d_p.mul(d_p) - delta = delta_grad.div_(denom).mul_(d_p).sum().mul(-alpha) - B.mul(v_sq).sum() - - # Update B - B.addcmul_(v_sq, delta) - - # calc direction of parameter updates - denom = B.abs().clamp_(min=1) - d_p.copy_(exp_avg_grad.div(denom)) - - p.add_(d_p, alpha=-curr_lr) - - return loss diff --git a/optimizers/over9000/diffgrad.py 
b/optimizers/over9000/diffgrad.py deleted file mode 100644 index 8295de9..0000000 --- a/optimizers/over9000/diffgrad.py +++ /dev/null @@ -1,127 +0,0 @@ - -import math -import torch -from torch.optim.optimizer import Optimizer -import numpy as np -import torch.nn as nn - -# Original source: https://github.com/shivram1987/diffGrad/blob/master/diffGrad.py - -# modifications: @lessw2020 -# https://github.com/lessw2020/Best-Deep-Learning-Optimizers/blob/master/diffgrad/diffgrad.py - -class DiffGrad(Optimizer): - r"""Implements diffGrad algorithm. It is modified from the pytorch implementation of Adam. - It has been proposed in `diffGrad: An Optimization Method for Convolutional Neural Networks`_. - Arguments: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - lr (float, optional): learning rate (default: 1e-3) - betas (Tuple[float, float], optional): coefficients used for computing - running averages of gradient and its square (default: (0.9, 0.999)) - eps (float, optional): term added to the denominator to improve - numerical stability (default: 1e-8) - weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - amsgrad (boolean, optional): whether to use the AMSGrad variant of this - algorithm from the paper `On the Convergence of Adam and Beyond`_ - (default: False) - .. _diffGrad: An Optimization Method for Convolutional Neural Networks: - https://arxiv.org/abs/1909.11015 - .. _Adam\: A Method for Stochastic Optimization: - https://arxiv.org/abs/1412.6980 - .. _On the Convergence of Adam and Beyond: - https://openreview.net/forum?id=ryQu7f-RZ - """ - - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, version=0, weight_decay=0): - if not 0.0 <= lr: - raise ValueError("Invalid learning rate: {}".format(lr)) - if not 0.0 <= eps: - raise ValueError("Invalid epsilon value: {}".format(eps)) - if not 0.0 <= betas[0] < 1.0: - raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) - if not 0.0 <= betas[1] < 1.0: - raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) - - - defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) - - super().__init__(params, defaults) - - #save version - self.version = version - - def __setstate__(self, state): - super().__setstate__(state) - - def step(self, closure=None): - """Performs a single optimization step. - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. 
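The warmup handling inside the Apollo step removed above reduces to a simple linear ramp from init_lr to the base learning rate. A small, hypothetical helper showing the same arithmetic (names are illustrative):

def apollo_warmup_lr(step, base_lr, warmup=100, init_lr=0.01):
    # Linear warmup from init_lr to base_lr over the first `warmup` steps,
    # after which the base learning rate is used unchanged.
    if step < warmup:
        return (base_lr - init_lr) * step / warmup + init_lr
    return base_lr

assert apollo_warmup_lr(0, base_lr=0.1) == 0.01
assert apollo_warmup_lr(100, base_lr=0.1) == 0.1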
- """ - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data - if grad.is_sparse: - raise RuntimeError('diffGrad does not support sparse gradients, please consider SparseAdam instead') - - state = self.state[p] - - # State initialization - if len(state) == 0: - state['step'] = 0 - # Exponential moving average of gradient values - state['exp_avg'] = torch.zeros_like(p.data) - # Exponential moving average of squared gradient values - state['exp_avg_sq'] = torch.zeros_like(p.data) - # Previous gradient - state['previous_grad'] = torch.zeros_like(p.data) - - exp_avg, exp_avg_sq, previous_grad = state['exp_avg'], state['exp_avg_sq'], state['previous_grad'] - beta1, beta2 = group['betas'] - - state['step'] += 1 - - if group['weight_decay'] != 0: - grad.add_(group['weight_decay'], p.data) - - # Decay the first and second moment running average coefficient - exp_avg.mul_(beta1).add_(1 - beta1, grad) - exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) - denom = exp_avg_sq.sqrt().add_(group['eps']) - - bias_correction1 = 1 - beta1 ** state['step'] - bias_correction2 = 1 - beta2 ** state['step'] - - # compute diffgrad coefficient (dfc) - - - if self.version==0: - diff = abs(previous_grad - grad) - elif self.version ==1: - diff = previous_grad-grad - elif self.version ==2: - diff = .5*abs(previous_grad - grad) - - if self.version==0 or self.version==1: - dfc = 1. / (1. + torch.exp(-diff)) - elif self.version==2: - dfc = 9. / (1. + torch.exp(-diff))-4 #DFC2 = 9/(1+e-(.5/g/)-4 #range .5,5 - - state['previous_grad'] = grad - - # update momentum with dfc - exp_avg1 = exp_avg * dfc - - step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 - - p.data.addcdiv_(-step_size, exp_avg1, denom) - - return loss \ No newline at end of file diff --git a/optimizers/over9000/lamb.py b/optimizers/over9000/lamb.py deleted file mode 100644 index 67adb67..0000000 --- a/optimizers/over9000/lamb.py +++ /dev/null @@ -1,129 +0,0 @@ -"""Lamb optimizer.""" -# from https://github.com/cybertronai/pytorch-lamb/blob/master/pytorch_lamb/lamb.py - -import collections -import math - -import torch -from torch.optim import Optimizer - -try: - from tensorboardX import SummaryWriter - - def log_lamb_rs(optimizer: Optimizer, event_writer: SummaryWriter, token_count: int): - """Log a histogram of trust ratio scalars in across layers.""" - results = collections.defaultdict(list) - for group in optimizer.param_groups: - for p in group['params']: - state = optimizer.state[p] - for i in ('weight_norm', 'adam_norm', 'trust_ratio'): - if i in state: - results[i].append(state[i]) - - for k, v in results.items(): - event_writer.add_histogram(f'lamb/{k}', torch.tensor(v), token_count) -except ModuleNotFoundError as e: - print("To use this log_lamb_rs, please run 'pip install tensorboardx'. Also you must have Tensorboard running to see results") - -class Lamb(Optimizer): - r"""Implements Lamb algorithm. - - It has been proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes`_. 
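The friction coefficient computed in the diffGrad step removed above (its version 0) is simply a sigmoid of the absolute change in the gradient: close to 1 where the gradient is still changing, and about 0.5 where it has become nearly constant, which damps the momentum term near a flat region. A minimal, self-contained sketch (names are illustrative):

import torch

def diffgrad_coefficient(previous_grad, grad):
    # Friction term: sigmoid of the absolute gradient change.
    diff = (previous_grad - grad).abs()
    return 1.0 / (1.0 + torch.exp(-diff))

dfc = diffgrad_coefficient(torch.tensor([0.0, 1.0]), torch.tensor([0.0, -1.0]))
print(dfc)  # approximately tensor([0.5000, 0.8808])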
- - Arguments: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - lr (float, optional): learning rate (default: 1e-3) - betas (Tuple[float, float], optional): coefficients used for computing - running averages of gradient and its square (default: (0.9, 0.999)) - eps (float, optional): term added to the denominator to improve - numerical stability (default: 1e-8) - weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - adam (bool, optional): always use trust ratio = 1, which turns this into - Adam. Useful for comparison purposes. - - .. _Large Batch Optimization for Deep Learning: Training BERT in 76 minutes: - https://arxiv.org/abs/1904.00962 - """ - - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, - weight_decay=0, adam=False): - if not 0.0 <= lr: - raise ValueError("Invalid learning rate: {}".format(lr)) - if not 0.0 <= eps: - raise ValueError("Invalid epsilon value: {}".format(eps)) - if not 0.0 <= betas[0] < 1.0: - raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) - if not 0.0 <= betas[1] < 1.0: - raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) - defaults = dict(lr=lr, betas=betas, eps=eps, - weight_decay=weight_decay) - self.adam = adam - super(Lamb, self).__init__(params, defaults) - - def step(self, closure=None): - """Performs a single optimization step. - - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data - if grad.is_sparse: - raise RuntimeError('Lamb does not support sparse gradients, consider SparseAdam instad.') - - state = self.state[p] - - # State initialization - if len(state) == 0: - state['step'] = 0 - # Exponential moving average of gradient values - state['exp_avg'] = torch.zeros_like(p.data) - # Exponential moving average of squared gradient values - state['exp_avg_sq'] = torch.zeros_like(p.data) - - exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] - beta1, beta2 = group['betas'] - - state['step'] += 1 - - # Decay the first and second moment running average coefficient - # m_t - exp_avg.mul_(beta1).add_(1 - beta1, grad) - # v_t - exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) - - # Paper v3 does not use debiasing. - # bias_correction1 = 1 - beta1 ** state['step'] - # bias_correction2 = 1 - beta2 ** state['step'] - # Apply bias to lr to avoid broadcast. 
- step_size = group['lr'] # * math.sqrt(bias_correction2) / bias_correction1 - - weight_norm = p.data.pow(2).sum().sqrt().clamp(0, 10) - - adam_step = exp_avg / exp_avg_sq.sqrt().add(group['eps']) - if group['weight_decay'] != 0: - adam_step.add_(group['weight_decay'], p.data) - - adam_norm = adam_step.pow(2).sum().sqrt() - if weight_norm == 0 or adam_norm == 0: - trust_ratio = 1 - else: - trust_ratio = weight_norm / adam_norm - state['weight_norm'] = weight_norm - state['adam_norm'] = adam_norm - state['trust_ratio'] = trust_ratio - if self.adam: - trust_ratio = 1 - - p.data.add_(-step_size * trust_ratio, adam_step) - - return loss diff --git a/optimizers/over9000/lookahead.py b/optimizers/over9000/lookahead.py deleted file mode 100644 index bbecaf3..0000000 --- a/optimizers/over9000/lookahead.py +++ /dev/null @@ -1,97 +0,0 @@ -# Lookahead implementation from https://github.com/rwightman/pytorch-image-models/blob/master/timm/optim/lookahead.py - -""" Lookahead Optimizer Wrapper. -Implementation modified from: https://github.com/alphadl/lookahead.pytorch -Paper: `Lookahead Optimizer: k steps forward, 1 step back` - https://arxiv.org/abs/1907.08610 -""" -import torch -from torch.optim.optimizer import Optimizer -from torch.optim import Adam -from collections import defaultdict - -class Lookahead(Optimizer): - def __init__(self, base_optimizer, alpha=0.5, k=6): - if not 0.0 <= alpha <= 1.0: - raise ValueError(f'Invalid slow update rate: {alpha}') - if not 1 <= k: - raise ValueError(f'Invalid lookahead steps: {k}') - defaults = dict(lookahead_alpha=alpha, lookahead_k=k, lookahead_step=0) - self.base_optimizer = base_optimizer - self.param_groups = self.base_optimizer.param_groups - self.defaults = base_optimizer.defaults - self.defaults.update(defaults) - self.state = defaultdict(dict) - # manually add our defaults to the param groups - for name, default in defaults.items(): - for group in self.param_groups: - group.setdefault(name, default) - - def update_slow(self, group): - for fast_p in group["params"]: - if fast_p.grad is None: - continue - param_state = self.state[fast_p] - if 'slow_buffer' not in param_state: - param_state['slow_buffer'] = torch.empty_like(fast_p.data) - param_state['slow_buffer'].copy_(fast_p.data) - slow = param_state['slow_buffer'] - slow.add_(group['lookahead_alpha'], fast_p.data - slow) - fast_p.data.copy_(slow) - - def sync_lookahead(self): - for group in self.param_groups: - self.update_slow(group) - - def step(self, closure=None): - # print(self.k) - #assert id(self.param_groups) == id(self.base_optimizer.param_groups) - loss = self.base_optimizer.step(closure) - for group in self.param_groups: - group['lookahead_step'] += 1 - if group['lookahead_step'] % group['lookahead_k'] == 0: - self.update_slow(group) - return loss - - def state_dict(self): - fast_state_dict = self.base_optimizer.state_dict() - slow_state = { - (id(k) if isinstance(k, torch.Tensor) else k): v - for k, v in self.state.items() - } - fast_state = fast_state_dict['state'] - param_groups = fast_state_dict['param_groups'] - return { - 'state': fast_state, - 'slow_state': slow_state, - 'param_groups': param_groups, - } - - def load_state_dict(self, state_dict): - fast_state_dict = { - 'state': state_dict['state'], - 'param_groups': state_dict['param_groups'], - } - self.base_optimizer.load_state_dict(fast_state_dict) - - # We want to restore the slow state, but share param_groups reference - # with base_optimizer. 
This is a bit redundant but least code - slow_state_new = False - if 'slow_state' not in state_dict: - print('Loading state_dict from optimizer without Lookahead applied.') - state_dict['slow_state'] = defaultdict(dict) - slow_state_new = True - slow_state_dict = { - 'state': state_dict['slow_state'], - 'param_groups': state_dict['param_groups'], # this is pointless but saves code - } - super(Lookahead, self).load_state_dict(slow_state_dict) - self.param_groups = self.base_optimizer.param_groups # make both ref same container - if slow_state_new: - # reapply defaults to catch missing lookahead specific ones - for name, default in self.defaults.items(): - for group in self.param_groups: - group.setdefault(name, default) - -def LookaheadAdam(params, alpha=0.5, k=6, *args, **kwargs): - adam = Adam(params, *args, **kwargs) - return Lookahead(adam, alpha, k) diff --git a/optimizers/over9000/losses.py b/optimizers/over9000/losses.py deleted file mode 100644 index a87dad1..0000000 --- a/optimizers/over9000/losses.py +++ /dev/null @@ -1,226 +0,0 @@ - -import torch.nn as nn -import torch -import torch.nn.functional as F - - -class FocalLoss(nn.Module): - __name__='focal_loss' - def __init__(self, alpha=1, gamma=2, logits=True, reduction = 'mean'): - super(FocalLoss, self).__init__() - assert reduction in ['sum','mean','none'] - self.alpha = alpha - self.gamma = gamma - self.logits = logits - self.reduction = reduction - - def forward(self, inputs, targets): - if self.logits: - BCE_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduce=False) - else: - BCE_loss = F.binary_cross_entropy(inputs, targets, reduce=False) - - pt = torch.exp(-BCE_loss) - F_loss = self.alpha * (1-pt)**self.gamma * BCE_loss - - if self.reduction == 'mean': - return F_loss.mean() - elif self.reduction == 'sum': - return F_loss.sum() - return F_loss - -class BCEWithLogitsLoss(nn.BCEWithLogitsLoss): - def __init__(self, **kwargs): - if 'pos_weight' in kwargs.keys(): - kwargs['pos_weight'] = torch.tensor(kwargs['pos_weight'],requires_grad=False) - super().__init__(**kwargs) - -def get_cls_loss(**kwargs): - cls_loss_name = kwargs['name'] - params = kwargs['params'] - if cls_loss_name == 'bce': - loss_fnc = BCEWithLogitsLoss - elif cls_loss_name =='focal': - loss_fnc = FocalLoss - else: - raise NotImplementedError(f'Loss {cls_loss_name} not implemented!!!') - - return cls_loss_name,loss_fnc(**params) - -def get_reg_loss(**kwargs): - reg_loss_name = kwargs['name'] - params = kwargs['params'] - - if reg_loss_name == 'l1': - loss_fnc = nn.L1Loss - elif reg_loss_name in ['mse','rmse']: - loss_fnc = nn.MSELoss - else: - raise NotImplementedError(f'Loss {reg_loss_name} not implemented!!!') - - return reg_loss_name,loss_fnc(**params) - -class DualClsRegLoss(nn.Module): - - def __init__( - self, - cls_loss_kwargs, - reg_loss_kwargs, - eliminate_neg_cls_targets=False, - w1=1.0, - w2=1.0 - ): - super().__init__() - - self.cls_criterion_name,self.cls_criterion = get_cls_loss(**cls_loss_kwargs) - self.reg_criterion_name,self.reg_criterion = get_reg_loss(**reg_loss_kwargs) - self.enct = eliminate_neg_cls_targets - self.register_buffer('w1',torch.tensor(w1)) - self.register_buffer('w2',torch.tensor(w2)) - - def register_losses(self,reg_loss,cls_loss): - self.registered_losses = { - self.reg_criterion_name : reg_loss, - self.cls_criterion_name : cls_loss - } - - def get_registered_losses(self): - return self.registered_losses - - def forward(self,cls_gt,cls_pred,reg_gt,reg_pred): - - - loss = 0.0 - - cls_loss = 
self.cls_criterion(cls_pred,cls_gt) - loss = loss + self.w1 * cls_loss - - apply_reg = True - reg_loss = 0.0 - - if self.enct: - mask = reg_gt != 0 - apply_reg = mask.any().item() - reg_pred = reg_pred[mask] - reg_gt = reg_gt[mask] - - if apply_reg: - - reg_loss = self.reg_criterion(reg_pred,reg_gt) - - if self.reg_criterion_name == 'rmse': - reg_loss = torch.sqrt(reg_loss) - - loss = loss + self.w2 * reg_loss - - self.register_losses(reg_loss=reg_loss.item() if apply_reg else reg_loss,cls_loss=cls_loss.item()) - return loss - - -class TripleClsLoss(nn.Module): - - def __init__( - self, - cls_loss_kwargs, - base_loss_kwargs, - shift_loss_kwargs, - eliminate_neg_cls_targets=False, - w1=1.0, - w2=1.0, - w3=1.0 - ): - super().__init__() - - self.cls_criterion_name,self.cls_criterion = get_cls_loss(**cls_loss_kwargs) - self.base_criterion = nn.CrossEntropyLoss(**base_loss_kwargs) - self.shift_criterion = nn.CrossEntropyLoss(**shift_loss_kwargs) - - self.enct = eliminate_neg_cls_targets - self.register_buffer('w1',torch.tensor(w1)) - self.register_buffer('w2',torch.tensor(w2)) - self.register_buffer('w3',torch.tensor(w3)) - - def register_losses(self,base_loss,shift_loss,cls_loss): - self.registered_losses = { - 'base_ce_loss' : base_loss, - 'shift_ce_loss' : shift_loss, - self.cls_criterion_name : cls_loss - } - - def get_registered_losses(self): - return self.registered_losses - - def forward(self,cls_gt,cls_pred,base_gt,base_pred,shift_gt,shift_pred): - - - loss = 0.0 - - cls_loss = self.cls_criterion(cls_pred,cls_gt) - loss = loss + self.w1 * cls_loss - - apply_reg = True - base_loss = 0.0 - shift_loss = 0.0 - - if self.enct: - mask = cls_gt != 0 - mask = mask[:,0] - apply_reg = mask.any().item() - - base_pred = base_pred[mask,:] - base_gt = base_gt[mask] - - shift_pred = shift_pred[mask,:] - shift_gt = shift_gt[mask] - - if apply_reg: - - base_loss = self.base_criterion(base_pred,base_gt) - shift_loss = self.shift_criterion(shift_pred,shift_gt) - - loss = loss + self.w2 * base_loss + self.w3 * shift_loss - - self.register_losses( - base_loss=base_loss.item() if apply_reg else base_loss, - shift_loss=shift_loss.item() if apply_reg else shift_loss, - cls_loss=cls_loss.item() - ) - - return loss - - - - - - - -if __name__ == '__main__': - - cls_gt = torch.tensor([1,0,1,0,1,1,0,0,1,1,1,0,0]).float().view(-1,1) - cls_pred = torch.tensor([1.22,-5,0,4,-0.4,2,-3,1,3,4,1.5,0.5,-5]).view(-1,1) - - reg_gt = torch.tensor([0,0,1,2,3,4,5,6,7,8,9,10,11]).float().view(-1,1) - reg_pred = torch.tensor([0,0,1,2,3,4,5,6,7,8,9,10,11]).view(-1,1) * 0.8 - print(torch.sigmoid(cls_pred)) - - loss_dict = dict( - cls = dict( - name = 'bce', - params = dict( - pos_weight=1.0, - reduction = 'mean') - ), - - reg = dict( - name = 'rmse', - params = dict(reduction='mean') - ) - ) - - criterion = DualClsRegLoss(cls_loss_kwargs=loss_dict['cls'],reg_loss_kwargs=loss_dict['reg'],eliminate_neg_cls_targets=True) - print(criterion) - - with torch.no_grad(): - loss = criterion(cls_gt,cls_pred,reg_gt,reg_pred) - print(loss) - print(criterion.get_registered_losses()) \ No newline at end of file diff --git a/optimizers/over9000/madam.py b/optimizers/over9000/madam.py deleted file mode 100644 index 4212701..0000000 --- a/optimizers/over9000/madam.py +++ /dev/null @@ -1,47 +0,0 @@ -# from here https://github.com/jxbz/madam/blob/master/pytorch/optim/madam.py -import torch -from torch.optim.optimizer import Optimizer, required - - -class Madam(Optimizer): - - def __init__(self, params, lr=0.01, p_scale=3.0, g_bound=10.0): - - 
self.p_scale = p_scale - self.g_bound = g_bound - defaults = dict(lr=lr) - super(Madam, self).__init__(params, defaults) - - def step(self, closure=None): - """Performs a single optimization step. - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - for p in group['params']: - if p.grad is None: - continue - - state = self.state[p] - if len(state) == 0: - state['max'] = self.p_scale*(p*p).mean().sqrt().item() - state['step'] = 0 - state['exp_avg_sq'] = torch.zeros_like(p) - - state['step'] += 1 - bias_correction = 1 - 0.999 ** state['step'] - state['exp_avg_sq'] = 0.999 * state['exp_avg_sq'] + 0.001 * p.grad.data**2 - - g_normed = p.grad.data / (state['exp_avg_sq']/bias_correction).sqrt() - g_normed[torch.isnan(g_normed)] = 0 - g_normed.clamp_(-self.g_bound, self.g_bound) - - p.data *= torch.exp( -group['lr']*g_normed*torch.sign(p.data) ) - p.data.clamp_(-state['max'], state['max']) - - return loss diff --git a/optimizers/over9000/madgrad.py b/optimizers/over9000/madgrad.py deleted file mode 100644 index bcbc90e..0000000 --- a/optimizers/over9000/madgrad.py +++ /dev/null @@ -1,174 +0,0 @@ -# from https://github.com/raw/facebookresearch/madgrad/master/madgrad/madgrad.py -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import math -from typing import TYPE_CHECKING, Any, Callable, Optional - -import torch -import torch.optim - -if TYPE_CHECKING: - from torch.optim.optimizer import _params_t -else: - _params_t = Any - - -class MADGRAD(torch.optim.Optimizer): - """ - MADGRAD_: A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic - Optimization. - - .. _MADGRAD: https://arxiv.org/abs/2101.11075 - - MADGRAD is a general purpose optimizer that can be used in place of SGD or - Adam may converge faster and generalize better. Currently GPU-only. - Typically, the same learning rate schedule that is used for SGD or Adam may - be used. The overall learning rate is not comparable to either method and - should be determined by a hyper-parameter sweep. - - MADGRAD requires less weight decay than other methods, often as little as - zero. Momentum values used for SGD or Adam's beta1 should work here also. - - On sparse problems both weight_decay and momentum should be set to 0. - - Arguments: - params (iterable): - Iterable of parameters to optimize or dicts defining parameter groups. - lr (float): - Learning rate (default: 1e-2). - momentum (float): - Momentum value in the range [0,1) (default: 0.9). - weight_decay (float): - Weight decay, i.e. a L2 penalty (default: 0). - eps (float): - Term added to the denominator outside of the root operation to improve numerical stability. (default: 1e-6). 
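A condensed, hypothetical restatement of the Madam update removed above, for a single dense tensor; the p_scale range clamp on the weights is omitted, and the names are illustrative rather than taken from the file:

import torch

def madam_step(p, grad, exp_avg_sq, step, lr=0.01, g_bound=10.0):
    exp_avg_sq.mul_(0.999).add_(grad * grad, alpha=0.001)     # EMA of squared gradients
    bias_correction = 1 - 0.999 ** step
    g_normed = grad / (exp_avg_sq / bias_correction).sqrt()   # RMS-normalised gradient
    g_normed[torch.isnan(g_normed)] = 0
    g_normed.clamp_(-g_bound, g_bound)
    # Multiplicative update: each weight is shrunk or grown in log space.
    p.mul_(torch.exp(-lr * g_normed * torch.sign(p)))
    return p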
- """ - - def __init__( - self, params: _params_t, lr: float = 1e-2, momentum: float = 0.9, weight_decay: float = 0, eps: float = 1e-6, - ): - if momentum < 0 or momentum >= 1: - raise ValueError(f"Momentum {momentum} must be in the range [0,1]") - if lr <= 0: - raise ValueError(f"Learning rate {lr} must be positive") - if weight_decay < 0: - raise ValueError(f"Weight decay {weight_decay} must be non-negative") - if eps < 0: - raise ValueError(f"Eps must be non-negative") - - defaults = dict(lr=lr, eps=eps, momentum=momentum, weight_decay=weight_decay, k=0) - super().__init__(params, defaults) - - for group in self.param_groups: - for p in group["params"]: - state = self.state[p] - - state["grad_sum_sq"] = torch.zeros_like(p.data).detach() - state["s"] = torch.zeros_like(p.data).detach() - if momentum != 0: - state["x0"] = torch.clone(p.data).detach() - - @property - def supports_memory_efficient_fp16(self) -> bool: - return False - - @property - def supports_flat_params(self) -> bool: - return True - - def step(self, closure: Optional[Callable[[], float]] = None) -> Optional[float]: - """Performs a single optimization step. - - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - eps = group["eps"] - k = group["k"] - lr = group["lr"] + eps - decay = group["weight_decay"] - momentum = group["momentum"] - - ck = 1 - momentum - lamb = lr * math.pow(k + 1, 0.5) - - for p in group["params"]: - if p.grad is None: - continue - grad = p.grad.data - state = self.state[p] - - if momentum != 0.0 and grad.is_sparse: - raise RuntimeError("momentum != 0 is not compatible with sparse gradients") - - grad_sum_sq = state["grad_sum_sq"] - s = state["s"] - - # Apply weight decay - if decay != 0: - if grad.is_sparse: - raise RuntimeError("weight_decay option is not compatible with sparse gradients") - - grad.add_(p.data, alpha=decay) - - if grad.is_sparse: - grad = grad.coalesce() - grad_val = grad._values() - - p_masked = p.sparse_mask(grad) - grad_sum_sq_masked = grad_sum_sq.sparse_mask(grad) - s_masked = s.sparse_mask(grad) - - # Compute x_0 from other known quantities - rms_masked_vals = grad_sum_sq_masked._values().pow(1 / 3).add_(eps) - x0_masked_vals = p_masked._values().addcdiv(s_masked._values(), rms_masked_vals, value=1) - - # Dense + sparse op - grad_sq = grad * grad - grad_sum_sq.add_(grad_sq, alpha=lamb) - grad_sum_sq_masked.add_(grad_sq, alpha=lamb) - - rms_masked_vals = grad_sum_sq_masked._values().pow_(1 / 3).add_(eps) - - s.add_(grad, alpha=lamb) - s_masked._values().add_(grad_val, alpha=lamb) - - # update masked copy of p - p_kp1_masked_vals = x0_masked_vals.addcdiv(s_masked._values(), rms_masked_vals, value=-1) - # Copy updated masked p to dense p using an add operation - p_masked._values().add_(p_kp1_masked_vals, alpha=-1) - p.data.add_(p_masked, alpha=-1) - else: - if momentum == 0: - # Compute x_0 from other known quantities - rms = grad_sum_sq.pow(1 / 3).add_(eps) - x0 = p.data.addcdiv(s, rms, value=1) - else: - x0 = state["x0"] - - # Accumulate second moments - grad_sum_sq.addcmul_(grad, grad, value=lamb) - rms = grad_sum_sq.pow(1 / 3).add_(eps) - - # Update s - s.data.add_(grad, alpha=lamb) - - # Step - if momentum == 0: - p.data.copy_(x0.addcdiv(s, rms, value=-1)) - else: - z = x0.addcdiv(s, rms, value=-1) - - # p is a moving average of z - p.data.mul_(1 - ck).add_(z, alpha=ck) - - group["k"] = group["k"] + 1 - return loss diff 
--git a/optimizers/over9000/meters.py b/optimizers/over9000/meters.py deleted file mode 100644 index d5e4129..0000000 --- a/optimizers/over9000/meters.py +++ /dev/null @@ -1,47 +0,0 @@ -import numpy as np -import torch - -# meters -class Meter(object): - def reset(self): - pass - - def update(self, value): - pass - - def get_update(self): - pass - - def set_name(self, name): - self.name = f'{name}_{self.kind}' - - -class AverageMeter(Meter): - def __init__(self): - super(AverageMeter, self).__init__() - self.reset() - self.kind = 'avg' - - def reset(self): - self.value = 0.0 - self.average = 0.0 - self.count = 0.0 - - def update(self, value): - self.count += 1 - self.value = value - self.average = ((self.average * (self.count - 1)) + self.value) / float(self.count) - - def update_all(self, values): - l = len(values) - self.sum = np.sum(values) - self.count += l - self.average = ((self.average * (self.count - l)) + self.sum) / float(self.count) - - def get_update(self): - return self.average - - -def meter_dict(channel_names): - d = {channel_name : AverageMeter() for channel_name in channel_names} - return d \ No newline at end of file diff --git a/optimizers/over9000/metrics.py b/optimizers/over9000/metrics.py deleted file mode 100644 index 1964703..0000000 --- a/optimizers/over9000/metrics.py +++ /dev/null @@ -1,92 +0,0 @@ -import torch -import torch.nn as nn -#import pandas as pd - - -def _threshold(x, threshold=0.5): - if threshold is not None: - return (x >= threshold).type(x.dtype) - else: - return x - -class RMSE_METER(nn.Module): - def __init__(self): - self.rmse_values = None - - @torch.no_grad() - def __call__(self,y_pred,y_gt): - rmse_values = ((y_pred - y_gt) ** 2).view(-1) - if self.rmse_values is None: - self.rmse_values = rmse_values - else: - self.rmse_values = torch.cat([self.rmse_values,rmse_values],dim=0) - - self.rmse_avg = torch.sqrt(self.rmse_values.mean()).item() - - return self.rmse_avg - -class MicroScores: - def __init__(self,threshold = 0.5,ignore_index=-100): - self.thresh = threshold - self.eps = 1e-8 - self.ignore_index = ignore_index - self.reset() - - def reset(self): - self.tp = 0. - self.fp = 0. - self.fn = 0. - self.tn = 0. 
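The AverageMeter being removed here keeps a running mean without storing past values; its recurrence in isolation (a minimal sketch, independent of the deleted class):

# avg_n = (avg_{n-1} * (n - 1) + x_n) / n
values = [2.0, 4.0, 6.0]
avg, count = 0.0, 0
for x in values:
    count += 1
    avg = (avg * (count - 1) + x) / count
print(avg)  # 4.0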
- - def get_scores(self): - eps = self.eps - tp, fp, tn, fn = self.tp, self.fp, self.tn, self.fn - - recall =(tp + eps) / (tp + fn + eps) - precision = (tp + eps) / (tp + fp + eps) - - iou = (tp + eps) / (tp + fp + fn + eps) - f1_score = ((2 * recall * precision) + eps) / (recall + precision + eps) - - return { - 'recall' : recall.mean().item(), - 'precision' : precision.mean().item(), - 'iou' : iou.mean().item(), - 'f1' : f1_score.mean().item() - } - - @torch.no_grad() - def __call__(self, y_pr,y_gt,thresh_gt=False): - - #b,c,h,w = y_pr.shape - y_pr = _threshold(y_pr,self.thresh) - if thresh_gt: - y_gt = _threshold(y_gt,self.thresh) - - if self.ignore_index is not None: - Y_nindex = (y_gt != self.ignore_index).type(y_gt.dtype) - y_gt = y_gt * Y_nindex - y_pr = y_pr * Y_nindex - - #y_gt = y_gt.view(b,c,-1).float() # >> B,C,HxW - #y_pr = y_pr.view(b,c,-1).float() # >> B,C,HxW - - tp = torch.sum(y_pr * y_gt) - tn = torch.sum((1.0 - y_pr) * (1.0 - y_gt)) - fp = torch.sum(y_pr) - tp - fn = torch.sum(y_gt) - tp - - self.tp += tp - self.fp += fp - self.fn += fn - self.tn += tn - - tp, fp, tn, fn = self.tp, self.fp, self.tn, self.fn - - return self.get_scores() - - - -if __name__ == '__main__': - pass - \ No newline at end of file diff --git a/optimizers/over9000/mixup.py b/optimizers/over9000/mixup.py deleted file mode 100644 index 0069189..0000000 --- a/optimizers/over9000/mixup.py +++ /dev/null @@ -1,59 +0,0 @@ -import torch -import torch.nn as nn -from torch.distributions import Beta - -class Mixup(nn.Module): - def __init__(self, mix_beta,ignore_index=-100): - - super(Mixup, self).__init__() - self.beta_distribution = Beta(mix_beta, mix_beta) - self.ignore_index = ignore_index - - def forward(self, X, Y, Y_mask=None,weight=None): - - bs = X.shape[0] - n_dims = len(X.shape) - perm = torch.randperm(bs) - coeffs = self.beta_distribution.rsample(torch.Size((bs,))).to(X.device) - - #BxC - if n_dims == 2: - X = coeffs.view(-1, 1) * X + (1 - coeffs.view(-1, 1)) * X[perm] - - #BxCxW - elif n_dims == 3: - X = coeffs.view(-1, 1, 1) * X + (1 - coeffs.view(-1, 1, 1)) * X[perm] - - #BxCxHxW - elif n_dims == 4: - X = coeffs.view(-1, 1, 1, 1) * X + (1 - coeffs.view(-1, 1, 1, 1)) * X[perm] - - #BxTxCxHxW - else: - X = coeffs.view(-1, 1, 1, 1, 1) * X + (1 - coeffs.view(-1, 1, 1, 1, 1)) * X[perm] - - - if self.ignore_index is not None: - Y_index = Y == self.ignore_index - Y_index = Y_index.type(Y.dtype) - #import matplotlib.pyplot as plt - #plt.imshow(Y_index[0,0].cpu()) - - #BxCxHxW for segmentation / change it for other purposes - Y = coeffs.view(-1, 1, 1, 1) * Y + (1 - coeffs.view(-1, 1, 1, 1)) * Y[perm] - if self.ignore_index is not None: - Y = self.ignore_index * Y_index + Y * (1 - Y_index) - #X = torch.nan_to_num(X,nan=0.0) - - ret = [X,Y] - - if Y_mask is not None: - Y_mask = Y_mask | Y_mask[perm] - ret.append(Y_mask) - - if weight is not None: - weight = coeffs.view(-1) * weight + (1 - coeffs.view(-1)) * weight[perm] - ret.append(weight) - - - return ret \ No newline at end of file diff --git a/optimizers/over9000/novograd.py b/optimizers/over9000/novograd.py deleted file mode 100644 index 5eea065..0000000 --- a/optimizers/over9000/novograd.py +++ /dev/null @@ -1,223 +0,0 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
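The Mixup module above blends every sample with a randomly permuted partner using Beta-distributed coefficients. A minimal sketch of that interpolation (hypothetical helper, without the ignore_index handling of the deleted class):

import torch
from torch.distributions import Beta

def mixup_batch(X, Y, alpha=0.4):
    bs = X.shape[0]
    lam = Beta(alpha, alpha).sample((bs,)).to(X.device)   # one coefficient per sample
    perm = torch.randperm(bs)
    lam_x = lam.view(-1, *([1] * (X.dim() - 1)))          # broadcast over feature dims
    lam_y = lam.view(-1, *([1] * (Y.dim() - 1)))
    return lam_x * X + (1 - lam_x) * X[perm], lam_y * Y + (1 - lam_y) * Y[perm]

X, Y = torch.randn(8, 3, 32, 32), torch.rand(8, 1, 32, 32)
X_mix, Y_mix = mixup_batch(X, Y)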
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -from torch.optim import Optimizer -import math - -class AdamW(Optimizer): - """Implements AdamW algorithm. - - It has been proposed in `Adam: A Method for Stochastic Optimization`_. - - Arguments: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - lr (float, optional): learning rate (default: 1e-3) - betas (Tuple[float, float], optional): coefficients used for computing - running averages of gradient and its square (default: (0.9, 0.999)) - eps (float, optional): term added to the denominator to improve - numerical stability (default: 1e-8) - weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - amsgrad (boolean, optional): whether to use the AMSGrad variant of this - algorithm from the paper `On the Convergence of Adam and Beyond`_ - - Adam: A Method for Stochastic Optimization: - https://arxiv.org/abs/1412.6980 - On the Convergence of Adam and Beyond: - https://openreview.net/forum?id=ryQu7f-RZ - """ - - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, - weight_decay=0, amsgrad=False): - if not 0.0 <= lr: - raise ValueError("Invalid learning rate: {}".format(lr)) - if not 0.0 <= eps: - raise ValueError("Invalid epsilon value: {}".format(eps)) - if not 0.0 <= betas[0] < 1.0: - raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) - if not 0.0 <= betas[1] < 1.0: - raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) - defaults = dict(lr=lr, betas=betas, eps=eps, - weight_decay=weight_decay, amsgrad=amsgrad) - super(AdamW, self).__init__(params, defaults) - - def __setstate__(self, state): - super(AdamW, self).__setstate__(state) - for group in self.param_groups: - group.setdefault('amsgrad', False) - - def step(self, closure=None): - """Performs a single optimization step. - - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data - if grad.is_sparse: - raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') - amsgrad = group['amsgrad'] - - state = self.state[p] - - # State initialization - if len(state) == 0: - state['step'] = 0 - # Exponential moving average of gradient values - state['exp_avg'] = torch.zeros_like(p.data) - # Exponential moving average of squared gradient values - state['exp_avg_sq'] = torch.zeros_like(p.data) - if amsgrad: - # Maintains max of all exp. moving avg. of sq. grad. values - state['max_exp_avg_sq'] = torch.zeros_like(p.data) - - exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] - if amsgrad: - max_exp_avg_sq = state['max_exp_avg_sq'] - beta1, beta2 = group['betas'] - - state['step'] += 1 - # Decay the first and second moment running average coefficient - exp_avg.mul_(beta1).add_(1 - beta1, grad) - exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) - if amsgrad: - # Maintains the maximum of all 2nd moment running avg. 
till now - torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) - # Use the max. for normalizing running avg. of gradient - denom = max_exp_avg_sq.sqrt().add_(group['eps']) - else: - denom = exp_avg_sq.sqrt().add_(group['eps']) - - bias_correction1 = 1 - beta1 ** state['step'] - bias_correction2 = 1 - beta2 ** state['step'] - step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 - p.data.add_(-step_size, torch.mul(p.data, group['weight_decay']).addcdiv_(1, exp_avg, denom) ) - - return loss - -class Novograd(Optimizer): - """ - Implements Novograd algorithm. - - Args: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - lr (float, optional): learning rate (default: 1e-3) - betas (Tuple[float, float], optional): coefficients used for computing - running averages of gradient and its square (default: (0.95, 0)) - eps (float, optional): term added to the denominator to improve - numerical stability (default: 1e-8) - weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - grad_averaging: gradient averaging - amsgrad (boolean, optional): whether to use the AMSGrad variant of this - algorithm from the paper `On the Convergence of Adam and Beyond`_ - (default: False) - """ - - def __init__(self, params, lr=1e-3, betas=(0.95, 0), eps=1e-8, - weight_decay=0, grad_averaging=False, amsgrad=False): - if not 0.0 <= lr: - raise ValueError("Invalid learning rate: {}".format(lr)) - if not 0.0 <= eps: - raise ValueError("Invalid epsilon value: {}".format(eps)) - if not 0.0 <= betas[0] < 1.0: - raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) - if not 0.0 <= betas[1] < 1.0: - raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) - defaults = dict(lr=lr, betas=betas, eps=eps, - weight_decay=weight_decay, - grad_averaging=grad_averaging, - amsgrad=amsgrad) - - super(Novograd, self).__init__(params, defaults) - - def __setstate__(self, state): - super(Novograd, self).__setstate__(state) - for group in self.param_groups: - group.setdefault('amsgrad', False) - - def step(self, closure=None): - """Performs a single optimization step. - - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data - if grad.is_sparse: - raise RuntimeError('Sparse gradients are not supported.') - amsgrad = group['amsgrad'] - - state = self.state[p] - - # State initialization - if len(state) == 0: - state['step'] = 0 - # Exponential moving average of gradient values - state['exp_avg'] = torch.zeros_like(p.data) - # Exponential moving average of squared gradient values - state['exp_avg_sq'] = torch.zeros([]).to(state['exp_avg'].device) - if amsgrad: - # Maintains max of all exp. moving avg. of sq. grad. values - state['max_exp_avg_sq'] = torch.zeros([]).to(state['exp_avg'].device) - - exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] - if amsgrad: - max_exp_avg_sq = state['max_exp_avg_sq'] - beta1, beta2 = group['betas'] - - state['step'] += 1 - - norm = torch.sum(torch.pow(grad, 2)) - - if exp_avg_sq == 0: - exp_avg_sq.copy_(norm) - else: - exp_avg_sq.mul_(beta2).add_(1 - beta2, norm) - - if amsgrad: - # Maintains the maximum of all 2nd moment running avg. till now - torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) - # Use the max. for normalizing running avg. 
of gradient - denom = max_exp_avg_sq.sqrt().add_(group['eps']) - else: - denom = exp_avg_sq.sqrt().add_(group['eps']) - - grad.div_(denom) - if group['weight_decay'] != 0: - grad.add_(group['weight_decay'], p.data) - if group['grad_averaging']: - grad.mul_(1 - beta1) - exp_avg.mul_(beta1).add_(grad) - - p.data.add_(-group['lr'], exp_avg) - - return loss \ No newline at end of file diff --git a/optimizers/over9000/optimizers.py b/optimizers/over9000/optimizers.py deleted file mode 100644 index d0ec97f..0000000 --- a/optimizers/over9000/optimizers.py +++ /dev/null @@ -1,98 +0,0 @@ -import torch -import math -from torch.optim import Adam,RMSprop,Optimizer,SGD -#AdamW -class AdamW(Optimizer): - """Implements AdamW algorithm. - It has been proposed in `Fixing Weight Decay Regularization in Adam`_. - Arguments: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - lr (float, optional): learning rate (default: 1e-3) - betas (Tuple[float, float], optional): coefficients used for computing - running averages of gradient and its square (default: (0.9, 0.999)) - eps (float, optional): term added to the denominator to improve - numerical stability (default: 1e-8) - weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - .. Fixing Weight Decay Regularization in Adam: - https://arxiv.org/abs/1711.05101 - """ - - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, - weight_decay=0): - defaults = dict(lr=lr, betas=betas, eps=eps, - weight_decay=weight_decay) - super(AdamW, self).__init__(params, defaults) - - def step(self, closure=None): - """Performs a single optimization step. - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data - if grad.is_sparse: - raise RuntimeError('AdamW does not support sparse gradients, please consider SparseAdam instead') - - state = self.state[p] - - # State initialization - if len(state) == 0: - state['step'] = 0 - # Exponential moving average of gradient values - state['exp_avg'] = torch.zeros_like(p.data) - # Exponential moving average of squared gradient values - state['exp_avg_sq'] = torch.zeros_like(p.data) - - exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] - beta1, beta2 = group['betas'] - - state['step'] += 1 - - # according to the paper, this penalty should come after the bias correction - # if group['weight_decay'] != 0: - # grad = grad.add(group['weight_decay'], p.data) - - # Decay the first and second moment running average coefficient - exp_avg.mul_(beta1).add_(1 - beta1, grad) - exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) - - denom = exp_avg_sq.sqrt().add_(group['eps']) - - bias_correction1 = 1 - beta1 ** state['step'] - bias_correction2 = 1 - beta2 ** state['step'] - step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 - - # w = w - wd * lr * w - if group['weight_decay'] != 0: - p.data.add_(-group['weight_decay'] * group['lr'], p.data) - - # w = w - lr * w.grad - p.data.addcdiv_(-step_size, exp_avg, denom) - - # w = w - wd * lr * w - lr * w.grad - # See http://www.fast.ai/2018/07/02/adam-weight-decay/ - - return loss - -#optimizers -def get_optimizer(name ,params,**kwargs): - name = name.lower() - if(name == 'adam'): - return Adam(params,**kwargs) - elif(name == 'adamw'): - return AdamW(params,**kwargs) - elif(name 
== 'rmsprop'): - return RMSprop(params,**kwargs) - elif(name == 'sgd'): - return SGD(params,**kwargs) - else: - raise ValueError('Optimizer {} not an option'.format(name)) \ No newline at end of file diff --git a/optimizers/over9000/radam.py b/optimizers/over9000/radam.py deleted file mode 100644 index 74d7e0c..0000000 --- a/optimizers/over9000/radam.py +++ /dev/null @@ -1,209 +0,0 @@ -# from https://github.com/LiyuanLucasLiu/RAdam/blob/master/radam.py - -import math -import torch -from torch.optim.optimizer import Optimizer, required - -class RAdam(Optimizer): - - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): - defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) - self.buffer = [[None, None, None] for ind in range(10)] - super(RAdam, self).__init__(params, defaults) - - def __setstate__(self, state): - super(RAdam, self).__setstate__(state) - - def step(self, closure=None): - - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data.float() - if grad.is_sparse: - raise RuntimeError('RAdam does not support sparse gradients') - - p_data_fp32 = p.data.float() - - state = self.state[p] - - if len(state) == 0: - state['step'] = 0 - state['exp_avg'] = torch.zeros_like(p_data_fp32) - state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) - else: - state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) - state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) - - exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] - beta1, beta2 = group['betas'] - - exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) - exp_avg.mul_(beta1).add_(1 - beta1, grad) - - state['step'] += 1 - buffered = self.buffer[int(state['step'] % 10)] - if state['step'] == buffered[0]: - N_sma, step_size = buffered[1], buffered[2] - else: - buffered[0] = state['step'] - beta2_t = beta2 ** state['step'] - N_sma_max = 2 / (1 - beta2) - 1 - N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) - buffered[1] = N_sma - - # more conservative since it's an approximated value - if N_sma >= 5: - step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) - else: - step_size = 1.0 / (1 - beta1 ** state['step']) - buffered[2] = step_size - - if group['weight_decay'] != 0: - p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) - - # more conservative since it's an approximated value - if N_sma >= 5: - denom = exp_avg_sq.sqrt().add_(group['eps']) - p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg, denom) - else: - p_data_fp32.add_(-step_size * group['lr'], exp_avg) - - p.data.copy_(p_data_fp32) - - return loss - -class PlainRAdam(Optimizer): - - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): - defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) - - super(PlainRAdam, self).__init__(params, defaults) - - def __setstate__(self, state): - super(PlainRAdam, self).__setstate__(state) - - def step(self, closure=None): - - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data.float() - if grad.is_sparse: - raise RuntimeError('RAdam does not support sparse gradients') - - p_data_fp32 = p.data.float() - - state = self.state[p] - - if len(state) == 0: - state['step'] 
= 0 - state['exp_avg'] = torch.zeros_like(p_data_fp32) - state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) - else: - state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) - state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) - - exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] - beta1, beta2 = group['betas'] - - exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) - exp_avg.mul_(beta1).add_(1 - beta1, grad) - - state['step'] += 1 - beta2_t = beta2 ** state['step'] - N_sma_max = 2 / (1 - beta2) - 1 - N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) - - if group['weight_decay'] != 0: - p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) - - # more conservative since it's an approximated value - if N_sma >= 5: - step_size = group['lr'] * math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) - denom = exp_avg_sq.sqrt().add_(group['eps']) - p_data_fp32.addcdiv_(-step_size, exp_avg, denom) - else: - step_size = group['lr'] / (1 - beta1 ** state['step']) - p_data_fp32.add_(-step_size, exp_avg) - - p.data.copy_(p_data_fp32) - - return loss - - -class AdamW(Optimizer): - - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, warmup = 0): - defaults = dict(lr=lr, betas=betas, eps=eps, - weight_decay=weight_decay, warmup = warmup) - super(AdamW, self).__init__(params, defaults) - - def __setstate__(self, state): - super(AdamW, self).__setstate__(state) - - def step(self, closure=None): - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data.float() - if grad.is_sparse: - raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') - - p_data_fp32 = p.data.float() - - state = self.state[p] - - if len(state) == 0: - state['step'] = 0 - state['exp_avg'] = torch.zeros_like(p_data_fp32) - state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) - else: - state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) - state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) - - exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] - beta1, beta2 = group['betas'] - - state['step'] += 1 - - exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) - exp_avg.mul_(beta1).add_(1 - beta1, grad) - - denom = exp_avg_sq.sqrt().add_(group['eps']) - bias_correction1 = 1 - beta1 ** state['step'] - bias_correction2 = 1 - beta2 ** state['step'] - - if group['warmup'] > state['step']: - scheduled_lr = 1e-8 + state['step'] * group['lr'] / group['warmup'] - else: - scheduled_lr = group['lr'] - - step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 - - if group['weight_decay'] != 0: - p_data_fp32.add_(-group['weight_decay'] * scheduled_lr, p_data_fp32) - - p_data_fp32.addcdiv_(-step_size, exp_avg, denom) - - p.data.copy_(p_data_fp32) - - return loss diff --git a/optimizers/over9000/ralamb.py b/optimizers/over9000/ralamb.py deleted file mode 100644 index f7036c8..0000000 --- a/optimizers/over9000/ralamb.py +++ /dev/null @@ -1,99 +0,0 @@ -import torch, math -from torch.optim.optimizer import Optimizer - -# RAdam + LARS -class Ralamb(Optimizer): - - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): - defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) - self.buffer = [[None, None, None] for ind in range(10)] - super(Ralamb, 
self).__init__(params, defaults) - - def __setstate__(self, state): - super(Ralamb, self).__setstate__(state) - - def step(self, closure=None): - - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data.float() - if grad.is_sparse: - raise RuntimeError('Ralamb does not support sparse gradients') - - p_data_fp32 = p.data.float() - - state = self.state[p] - - if len(state) == 0: - state['step'] = 0 - state['exp_avg'] = torch.zeros_like(p_data_fp32) - state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) - else: - state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) - state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) - - exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] - beta1, beta2 = group['betas'] - - # Decay the first and second moment running average coefficient - # m_t - exp_avg.mul_(beta1).add_(1 - beta1, grad) - # v_t - exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) - - state['step'] += 1 - buffered = self.buffer[int(state['step'] % 10)] - - if state['step'] == buffered[0]: - N_sma, radam_step_size = buffered[1], buffered[2] - else: - buffered[0] = state['step'] - beta2_t = beta2 ** state['step'] - N_sma_max = 2 / (1 - beta2) - 1 - N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) - buffered[1] = N_sma - - # more conservative since it's an approximated value - if N_sma >= 5: - radam_step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) - else: - radam_step_size = 1.0 / (1 - beta1 ** state['step']) - buffered[2] = radam_step_size - - if group['weight_decay'] != 0: - p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) - - # more conservative since it's an approximated value - radam_step = p_data_fp32.clone() - if N_sma >= 5: - denom = exp_avg_sq.sqrt().add_(group['eps']) - radam_step.addcdiv_(-radam_step_size * group['lr'], exp_avg, denom) - else: - radam_step.add_(-radam_step_size * group['lr'], exp_avg) - - radam_norm = radam_step.pow(2).sum().sqrt() - weight_norm = p.data.pow(2).sum().sqrt().clamp(0, 10) - if weight_norm == 0 or radam_norm == 0: - trust_ratio = 1 - else: - trust_ratio = weight_norm / radam_norm - - state['weight_norm'] = weight_norm - state['adam_norm'] = radam_norm - state['trust_ratio'] = trust_ratio - - if N_sma >= 5: - p_data_fp32.addcdiv_(-radam_step_size * group['lr'] * trust_ratio, exp_avg, denom) - else: - p_data_fp32.add_(-radam_step_size * group['lr'] * trust_ratio, exp_avg) - - p.data.copy_(p_data_fp32) - - return loss diff --git a/optimizers/over9000/ranger.py b/optimizers/over9000/ranger.py deleted file mode 100644 index b6c8218..0000000 --- a/optimizers/over9000/ranger.py +++ /dev/null @@ -1,12 +0,0 @@ - -import math -import torch -from torch.optim.optimizer import Optimizer, required -import itertools as it -from .lookahead import * -from .radam import * - -def Ranger(params, alpha=0.5, k=6, *args, **kwargs): - radam = RAdam(params, *args, **kwargs) - return Lookahead(radam, alpha, k) - diff --git a/optimizers/over9000/rangerlars.py b/optimizers/over9000/rangerlars.py deleted file mode 100644 index 3fcb39e..0000000 --- a/optimizers/over9000/rangerlars.py +++ /dev/null @@ -1,14 +0,0 @@ -import torch, math -from torch.optim.optimizer import Optimizer -import itertools as it -from .lookahead import * -from .ralamb import * - -# RAdam + LARS + LookAHead - -# Lookahead 
implementation from https://github.com/lonePatient/lookahead_pytorch/blob/master/optimizer.py -# RAdam + LARS implementation from https://gist.github.com/redknightlois/c4023d393eb8f92bb44b2ab582d7ec20 - -def RangerLars(params, alpha=0.5, k=6, *args, **kwargs): - ralamb = Ralamb(params, *args, **kwargs) - return Lookahead(ralamb, alpha, k) diff --git a/optimizers/over9000/schedulers.py b/optimizers/over9000/schedulers.py deleted file mode 100644 index d230ef2..0000000 --- a/optimizers/over9000/schedulers.py +++ /dev/null @@ -1,32 +0,0 @@ -import torch -from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, MultiStepLR,LambdaLR -# polylr - -class PolyLR_WWP(LambdaLR): - def __init__(self, optimizer, epochs, warmup,ratio=0.9): - warmup = min(epochs,max(0,warmup-1)) - decay_epochs = epochs - warmup - xlambda = lambda x : 1.0 if(x Date: Mon, 25 Dec 2023 13:45:38 +0200 Subject: [PATCH 04/20] changed the optimize over9000 implementation --- optimizers/over9000/__init__.py | 14 ++ optimizers/over9000/adabelief.py | 222 +++++++++++++++++++++++++++++ optimizers/over9000/adamod.py | 98 +++++++++++++ optimizers/over9000/adan.py | 156 +++++++++++++++++++++ optimizers/over9000/apollo.py | 115 +++++++++++++++ optimizers/over9000/diffgrad.py | 127 +++++++++++++++++ optimizers/over9000/lamb.py | 129 +++++++++++++++++ optimizers/over9000/lookahead.py | 103 ++++++++++++++ optimizers/over9000/madam.py | 47 +++++++ optimizers/over9000/madgrad.py | 174 +++++++++++++++++++++++ optimizers/over9000/novograd.py | 223 ++++++++++++++++++++++++++++++ optimizers/over9000/radam.py | 209 ++++++++++++++++++++++++++++ optimizers/over9000/ralamb.py | 99 +++++++++++++ optimizers/over9000/ranger.py | 12 ++ optimizers/over9000/rangerlars.py | 14 ++ 15 files changed, 1742 insertions(+) create mode 100644 optimizers/over9000/__init__.py create mode 100644 optimizers/over9000/adabelief.py create mode 100644 optimizers/over9000/adamod.py create mode 100644 optimizers/over9000/adan.py create mode 100644 optimizers/over9000/apollo.py create mode 100644 optimizers/over9000/diffgrad.py create mode 100644 optimizers/over9000/lamb.py create mode 100644 optimizers/over9000/lookahead.py create mode 100644 optimizers/over9000/madam.py create mode 100644 optimizers/over9000/madgrad.py create mode 100644 optimizers/over9000/novograd.py create mode 100644 optimizers/over9000/radam.py create mode 100644 optimizers/over9000/ralamb.py create mode 100644 optimizers/over9000/ranger.py create mode 100644 optimizers/over9000/rangerlars.py diff --git a/optimizers/over9000/__init__.py b/optimizers/over9000/__init__.py new file mode 100644 index 0000000..7af485f --- /dev/null +++ b/optimizers/over9000/__init__.py @@ -0,0 +1,14 @@ +from .radam import PlainRAdam,RAdam +from .novograd import Novograd +from .ranger import Ranger +from .ralamb import Ralamb +from .rangerlars import RangerLars +from .lookahead import Lookahead,LookaheadAdam +#from .lamb import Lamb +from .diffgrad import DiffGrad +from .adamod import AdaMod +from .madam import Madam +from .apollo import Apollo +from .adabelief import AdaBelief +from .madgrad import MADGRAD +from .adan import Adan \ No newline at end of file diff --git a/optimizers/over9000/adabelief.py b/optimizers/over9000/adabelief.py new file mode 100644 index 0000000..2f817d6 --- /dev/null +++ b/optimizers/over9000/adabelief.py @@ -0,0 +1,222 @@ +# from https://github.com/raw/juntang-zhuang/Adabelief-Optimizer/master/PyTorch_Experiments/AdaBelief.py +import math +import torch +from 
torch.optim.optimizer import Optimizer +from tabulate import tabulate +from colorama import Fore, Back, Style + +version_higher = ( torch.__version__ >= "1.5.0" ) + +class AdaBelief(Optimizer): + r"""Implements AdaBelief algorithm. Modified from Adam in PyTorch + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-16) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + amsgrad (boolean, optional): whether to use the AMSGrad variant of this + algorithm from the paper `On the Convergence of Adam and Beyond`_ + (default: False) + weight_decouple (boolean, optional): ( default: True) If set as True, then + the optimizer uses decoupled weight decay as in AdamW + fixed_decay (boolean, optional): (default: False) This is used when weight_decouple + is set as True. + When fixed_decay == True, the weight decay is performed as + $W_{new} = W_{old} - W_{old} \times decay$. + When fixed_decay == False, the weight decay is performed as + $W_{new} = W_{old} - W_{old} \times decay \times lr$. Note that in this case, the + weight decay ratio decreases with learning rate (lr). + rectify (boolean, optional): (default: True) If set as True, then perform the rectified + update similar to RAdam + degenerated_to_sgd (boolean, optional) (default:True) If set as True, then perform SGD update + when variance of gradient is high + reference: AdaBelief Optimizer, adapting stepsizes by the belief in observed gradients, NeurIPS 2020 + """ + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16, + weight_decay=0, amsgrad=False, weight_decouple=True, fixed_decay=False, rectify=True, + degenerated_to_sgd=True): + + # ------------------------------------------------------------------------------ + # Print modifications to default arguments + print(Fore.RED + 'Please check your arguments if you have upgraded adabelief-pytorch from version 0.0.5.') + print(Fore.RED + 'Modifications to default arguments:') + default_table = tabulate([ + ['adabelief-pytorch=0.0.5','1e-8','False','False'], + ['Current version (0.1.0)','1e-16','True','True']], + headers=['eps','weight_decouple','rectify']) + print(Fore.RED + default_table) + + print(Fore.RED +'For a complete table of recommended hyperparameters, see') + print(Fore.RED + 'https://github.com/juntang-zhuang/Adabelief-Optimizer') + + print(Style.RESET_ALL) + # ------------------------------------------------------------------------------ + + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + + self.degenerated_to_sgd = degenerated_to_sgd + if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict): + for param in params: + if 'betas' in param and (param['betas'][0] != betas[0] or param['betas'][1] != betas[1]): + param['buffer'] = [[None, None, None] for _ in range(10)] + + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, 
amsgrad=amsgrad, buffer=[[None, None, None] for _ in range(10)]) + super(AdaBelief, self).__init__(params, defaults) + + self.degenerated_to_sgd = degenerated_to_sgd + self.weight_decouple = weight_decouple + self.rectify = rectify + self.fixed_decay = fixed_decay + if self.weight_decouple: + print('Weight decoupling enabled in AdaBelief') + if self.fixed_decay: + print('Weight decay fixed') + if self.rectify: + print('Rectification enabled in AdaBelief') + if amsgrad: + print('AMSGrad enabled in AdaBelief') + + def __setstate__(self, state): + super(AdaBelief, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('amsgrad', False) + + def reset(self): + for group in self.param_groups: + for p in group['params']: + state = self.state[p] + amsgrad = group['amsgrad'] + + # State initialization + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data,memory_format=torch.preserve_format) \ + if version_higher else torch.zeros_like(p.data) + + # Exponential moving average of squared gradient values + state['exp_avg_var'] = torch.zeros_like(p.data,memory_format=torch.preserve_format) \ + if version_higher else torch.zeros_like(p.data) + + if amsgrad: + # Maintains max of all exp. moving avg. of sq. grad. values + state['max_exp_avg_var'] = torch.zeros_like(p.data,memory_format=torch.preserve_format) \ + if version_higher else torch.zeros_like(p.data) + + def step(self, closure=None): + """Performs a single optimization step. + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError( + 'AdaBelief does not support sparse gradients, please consider SparseAdam instead') + amsgrad = group['amsgrad'] + + state = self.state[p] + + beta1, beta2 = group['betas'] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data,memory_format=torch.preserve_format) \ + if version_higher else torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_var'] = torch.zeros_like(p.data,memory_format=torch.preserve_format) \ + if version_higher else torch.zeros_like(p.data) + if amsgrad: + # Maintains max of all exp. moving avg. of sq. grad. values + state['max_exp_avg_var'] = torch.zeros_like(p.data,memory_format=torch.preserve_format) \ + if version_higher else torch.zeros_like(p.data) + + # get current state variable + exp_avg, exp_avg_var = state['exp_avg'], state['exp_avg_var'] + + state['step'] += 1 + bias_correction1 = 1 - beta1 ** state['step'] + bias_correction2 = 1 - beta2 ** state['step'] + + # Update first and second moment running average + exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) + grad_residual = grad - exp_avg + exp_avg_var.mul_(beta2).addcmul_( grad_residual, grad_residual, value=1 - beta2) + + if amsgrad: + max_exp_avg_var = state['max_exp_avg_var'] + # Maintains the maximum of all 2nd moment running avg. till now + torch.max(max_exp_avg_var, exp_avg_var, out=max_exp_avg_var) + + # Use the max. for normalizing running avg. 
of gradient + denom = (max_exp_avg_var.add_(group['eps']).sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) + else: + denom = (exp_avg_var.add_(group['eps']).sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) + + # perform weight decay, check if decoupled weight decay + if self.weight_decouple: + if not self.fixed_decay: + p.data.mul_(1.0 - group['lr'] * group['weight_decay']) + else: + p.data.mul_(1.0 - group['weight_decay']) + else: + if group['weight_decay'] != 0: + grad.add_(p.data, alpha=group['weight_decay']) + + # update + if not self.rectify: + # Default update + step_size = group['lr'] / bias_correction1 + p.data.addcdiv_( exp_avg, denom, value=-step_size) + + else: # Rectified update, forked from RAdam + buffered = group['buffer'][int(state['step'] % 10)] + if state['step'] == buffered[0]: + N_sma, step_size = buffered[1], buffered[2] + else: + buffered[0] = state['step'] + beta2_t = beta2 ** state['step'] + N_sma_max = 2 / (1 - beta2) - 1 + N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) + buffered[1] = N_sma + + # more conservative since it's an approximated value + if N_sma >= 5: + step_size = math.sqrt( + (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / ( + N_sma_max - 2)) / (1 - beta1 ** state['step']) + elif self.degenerated_to_sgd: + step_size = 1.0 / (1 - beta1 ** state['step']) + else: + step_size = -1 + buffered[2] = step_size + + if N_sma >= 5: + denom = exp_avg_var.sqrt().add_(group['eps']) + p.data.addcdiv_(exp_avg, denom, value=-step_size * group['lr']) + elif step_size > 0: + p.data.add_( exp_avg, alpha=-step_size * group['lr']) + + return loss \ No newline at end of file diff --git a/optimizers/over9000/adamod.py b/optimizers/over9000/adamod.py new file mode 100644 index 0000000..b345560 --- /dev/null +++ b/optimizers/over9000/adamod.py @@ -0,0 +1,98 @@ +# original repo https://github.com/lancopku/AdaMod/blob/master/adamod/adamod.py +import math +import torch +from torch.optim import Optimizer + +class AdaMod(Optimizer): + """Implements AdaMod algorithm with Decoupled Weight Decay (arxiv.org/abs/1711.05101) + It has been proposed in `Adaptive and Momental Bounds for Adaptive Learning Rate Methods`_. 
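The AdaBelief step above tracks the variance of the gradient around its running mean (the squared grad_residual) rather than the raw second moment, optionally with RAdam-style rectification. A hedged usage sketch (the file also imports tabulate and colorama, so both need to be installed):

import torch
from optimizers.over9000.adabelief import AdaBelief  # path created by this patch

model = torch.nn.Linear(10, 2)
opt = AdaBelief(model.parameters(), lr=1e-3, eps=1e-16,
                weight_decouple=True, rectify=True)

opt.zero_grad()
model(torch.randn(4, 10)).sum().backward()
opt.step()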
+ Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.9, 0.999)) + beta3 (float, optional): smoothing coefficient for adaptive learning rates (default: 0.9999) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + """ + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), beta3=0.999, + eps=1e-8, weight_decay=0): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + if not 0.0 <= beta3 < 1.0: + raise ValueError("Invalid beta3 parameter: {}".format(beta3)) + defaults = dict(lr=lr, betas=betas, beta3=beta3, eps=eps, + weight_decay=weight_decay) + super(AdaMod, self).__init__(params, defaults) + + def __setstate__(self, state): + super(AdaMod, self).__setstate__(state) + + def step(self, closure=None): + """Performs a single optimization step. + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError( + 'AdaMod does not support sparse gradients') + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p.data) + # Exponential moving average of actual learning rates + state['exp_avg_lr'] = torch.zeros_like(p.data) + + exp_avg, exp_avg_sq, exp_avg_lr = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_lr'] + beta1, beta2 = group['betas'] + + state['step'] += 1 + + # Decay the first and second moment running average coefficient + exp_avg.mul_(beta1).add_(1 - beta1, grad) + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + + denom = exp_avg_sq.sqrt().add_(group['eps']) + + bias_correction1 = 1 - beta1 ** state['step'] + bias_correction2 = 1 - beta2 ** state['step'] + step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 + + if group['weight_decay'] != 0: + p.data.add_(-group['weight_decay'] * group['lr'], p.data) + + # Applies momental bounds on actual learning rates + step_size = torch.full_like(denom, step_size) + step_size.div_(denom) + exp_avg_lr.mul_(group['beta3']).add_(1 - group['beta3'], step_size) + step_size = torch.min(step_size, exp_avg_lr) + step_size.mul_(exp_avg) + + p.data.add_(-step_size) + + return loss diff --git a/optimizers/over9000/adan.py b/optimizers/over9000/adan.py new file mode 100644 index 0000000..8206a92 --- /dev/null +++ b/optimizers/over9000/adan.py @@ -0,0 +1,156 @@ +# https://github.com/raw/sail-sg/Adan/main/adan.py + +# Copyright 2022 Garena Online Private Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use 
this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import math +import torch +from torch.optim.optimizer import Optimizer +from timm.utils import * + + +class Adan(Optimizer): + """ + Implements a pytorch variant of Adan + + Adan was proposed in + Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models[J]. arXiv preprint arXiv:2208.06677, 2022. + https://arxiv.org/abs/2208.06677 + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining parameter groups. + lr (float, optional): learning rate. (default: 1e-3) + betas (Tuple[float, float, flot], optional): coefficients used for computing + running averages of gradient and its norm. (default: (0.98, 0.92, 0.99)) + eps (float, optional): term added to the denominator to improve + numerical stability. (default: 1e-8) + weight_decay (float, optional): decoupled weight decay (L2 penalty) (default: 0) + max_grad_norm (float, optional): value used to clip + global grad norm (default: 0.0 no clip) + no_prox (bool): how to perform the decoupled weight decay (default: False) + """ + + def __init__(self, params, lr=1e-3, betas=(0.98, 0.92, 0.99), eps=1e-8, + weight_decay=0.0, max_grad_norm=0.0, no_prox=False): + if not 0.0 <= max_grad_norm: + raise ValueError("Invalid Max grad norm: {}".format(max_grad_norm)) + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + if not 0.0 <= betas[2] < 1.0: + raise ValueError("Invalid beta parameter at index 2: {}".format(betas[2])) + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, + max_grad_norm=max_grad_norm, no_prox=no_prox) + super(Adan, self).__init__(params, defaults) + + def __setstate__(self, state): + super(Adan, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('no_prox', False) + + @torch.no_grad() + def restart_opt(self): + for group in self.param_groups: + group['step'] = 0 + for p in group['params']: + if p.requires_grad: + state = self.state[p] + # State initialization + + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p) + # Exponential moving average of gradient difference + state['exp_avg_diff'] = torch.zeros_like(p) + + @torch.no_grad() + def step(self): + """ + Performs a single optimization step. 
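Adan, defined above, keeps three running averages (the gradient, the gradient difference, and the squared combined update) and supports decoupled weight decay plus optional global gradient clipping via max_grad_norm. A hedged usage sketch (the file imports timm.utils, so timm must be installed):

import torch
from optimizers.over9000.adan import Adan  # path created by this patch

model = torch.nn.Linear(10, 2)
opt = Adan(model.parameters(), lr=1e-3, betas=(0.98, 0.92, 0.99),
           weight_decay=0.02, max_grad_norm=1.0)

opt.zero_grad()
model(torch.randn(4, 10)).sum().backward()
opt.step()  # note: this step() takes no closure argument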
+ """ + if self.defaults['max_grad_norm'] > 0: + device = self.param_groups[0]['params'][0].device + global_grad_norm = torch.zeros(1, device=device) + + max_grad_norm = torch.tensor(self.defaults['max_grad_norm'], device=device) + for group in self.param_groups: + + for p in group['params']: + if p.grad is not None: + grad = p.grad + global_grad_norm.add_(grad.pow(2).sum()) + + global_grad_norm = torch.sqrt(global_grad_norm) + + clip_global_grad_norm = torch.clamp(max_grad_norm / (global_grad_norm + group['eps']), max=1.0) + else: + clip_global_grad_norm = 1.0 + + for group in self.param_groups: + beta1, beta2, beta3 = group['betas'] + # assume same step across group now to simplify things + # per parameter step can be easily support by making it tensor, or pass list into kernel + if 'step' in group: + group['step'] += 1 + else: + group['step'] = 1 + + bias_correction1 = 1.0 - beta1 ** group['step'] + + bias_correction2 = 1.0 - beta2 ** group['step'] + + bias_correction3 = 1.0 - beta3 ** group['step'] + + for p in group['params']: + if p.grad is None: + continue + + state = self.state[p] + if len(state) == 0: + state['exp_avg'] = torch.zeros_like(p) + state['exp_avg_sq'] = torch.zeros_like(p) + state['exp_avg_diff'] = torch.zeros_like(p) + + grad = p.grad.mul_(clip_global_grad_norm) + if 'pre_grad' not in state or group['step'] == 1: + state['pre_grad'] = grad + + copy_grad = grad.clone() + + exp_avg, exp_avg_sq, exp_avg_diff = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_diff'] + diff = grad - state['pre_grad'] + + update = grad + beta2 * diff + exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) # m_t + exp_avg_diff.mul_(beta2).add_(diff, alpha=1 - beta2) # diff_t + exp_avg_sq.mul_(beta3).addcmul_(update, update, value=1 - beta3) # n_t + + denom = ((exp_avg_sq).sqrt() / math.sqrt(bias_correction3)).add_(group['eps']) + update = ((exp_avg / bias_correction1 + beta2 * exp_avg_diff / bias_correction2)).div_(denom) + + if group['no_prox']: + p.data.mul_(1 - group['lr'] * group['weight_decay']) + p.add_(update, alpha=-group['lr']) + else: + p.add_(update, alpha=-group['lr']) + p.data.div_(1 + group['lr'] * group['weight_decay']) + + state['pre_grad'] = copy_grad diff --git a/optimizers/over9000/apollo.py b/optimizers/over9000/apollo.py new file mode 100644 index 0000000..2dd8656 --- /dev/null +++ b/optimizers/over9000/apollo.py @@ -0,0 +1,115 @@ +# from https://github.com/raw/XuezheMax/apollo/master/optim/apollo.py +import torch +from torch.optim.optimizer import Optimizer + + +class Apollo(Optimizer): + r"""Implements Atom algorithm. 
+ + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float): learning rate + beta (float, optional): coefficient used for computing + running averages of gradient (default: 0.9) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-4) + warmup (int, optional): number of warmup steps (default: 0) + init_lr (float, optional): initial learning rate for warmup (default: 0.01) + weight_decay (float, optional): weight decay coefficient (default: 0) + + """ + + def __init__(self, params, lr, beta=0.9, eps=1e-4, warmup=100, init_lr=0.01, weight_decay=0): + if not 0.0 < lr: + raise ValueError("Invalid learning rate value: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= beta < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(beta)) + if not 0.0 <= weight_decay: + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) + if not 0.0 <= warmup: + raise ValueError("Invalid warmup updates: {}".format(warmup)) + if not 0.0 <= init_lr <= 1.0: + raise ValueError("Invalid initial learning rate: {}".format(init_lr)) + + defaults = dict(lr=lr, beta=beta, eps=eps, warmup=warmup, + init_lr=init_lr, base_lr=lr, weight_decay=weight_decay) + super(Apollo, self).__init__(params, defaults) + + def __setstate__(self, state): + super(Apollo, self).__setstate__(state) + + @torch.no_grad() + def step(self, closure=None): + """Performs a single optimization step. + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg_grad'] = torch.zeros_like(p, memory_format=torch.preserve_format) + # Exponential moving average of squared gradient values + state['approx_hessian'] = torch.zeros_like(p, memory_format=torch.preserve_format) + # Previous update direction + state['update'] = torch.zeros_like(p, memory_format=torch.preserve_format) + + # Calculate current lr + if state['step'] < group['warmup']: + curr_lr = (group['base_lr'] - group['init_lr']) * state['step'] / group['warmup'] + group['init_lr'] + else: + curr_lr = group['lr'] + + # Perform optimization step + grad = p.grad + if grad.is_sparse: + raise RuntimeError('Atom does not support sparse gradients.') + + # Perform step weight decay + if group['weight_decay'] != 0: + grad = grad.add(p, alpha=group['weight_decay']) + + beta = group['beta'] + exp_avg_grad = state['exp_avg_grad'] + B = state['approx_hessian'] + d_p = state['update'] + + state['step'] += 1 + bias_correction = 1 - beta ** state['step'] + alpha = (1 - beta) / bias_correction + + # Update the running average grad + delta_grad = grad - exp_avg_grad + exp_avg_grad.add_(delta_grad, alpha=alpha) + + denom = d_p.norm(p=4).add(group['eps']) + d_p.div_(denom) + v_sq = d_p.mul(d_p) + delta = delta_grad.div_(denom).mul_(d_p).sum().mul(-alpha) - B.mul(v_sq).sum() + + # Update B + B.addcmul_(v_sq, delta) + + # calc direction of parameter updates + denom = B.abs().clamp_(min=1) + d_p.copy_(exp_avg_grad.div(denom)) + + p.add_(d_p, alpha=-curr_lr) + + return loss diff --git a/optimizers/over9000/diffgrad.py 
b/optimizers/over9000/diffgrad.py new file mode 100644 index 0000000..8295de9 --- /dev/null +++ b/optimizers/over9000/diffgrad.py @@ -0,0 +1,127 @@ + +import math +import torch +from torch.optim.optimizer import Optimizer +import numpy as np +import torch.nn as nn + +# Original source: https://github.com/shivram1987/diffGrad/blob/master/diffGrad.py + +# modifications: @lessw2020 +# https://github.com/lessw2020/Best-Deep-Learning-Optimizers/blob/master/diffgrad/diffgrad.py + +class DiffGrad(Optimizer): + r"""Implements diffGrad algorithm. It is modified from the pytorch implementation of Adam. + It has been proposed in `diffGrad: An Optimization Method for Convolutional Neural Networks`_. + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + amsgrad (boolean, optional): whether to use the AMSGrad variant of this + algorithm from the paper `On the Convergence of Adam and Beyond`_ + (default: False) + .. _diffGrad: An Optimization Method for Convolutional Neural Networks: + https://arxiv.org/abs/1909.11015 + .. _Adam\: A Method for Stochastic Optimization: + https://arxiv.org/abs/1412.6980 + .. _On the Convergence of Adam and Beyond: + https://openreview.net/forum?id=ryQu7f-RZ + """ + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, version=0, weight_decay=0): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + + + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) + + super().__init__(params, defaults) + + #save version + self.version = version + + def __setstate__(self, state): + super().__setstate__(state) + + def step(self, closure=None): + """Performs a single optimization step. + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. 
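DiffGrad scales the Adam momentum by a friction coefficient; with version=0 above, dfc = sigmoid(|g_prev - g|), so the effective step shrinks toward half size when successive gradients barely change. A small numeric sketch of that coefficient:

import torch

prev_grad = torch.tensor([0.50, 0.50, 0.10])
grad      = torch.tensor([0.50, -0.40, 0.11])

diff = (prev_grad - grad).abs()        # version 0 of the DiffGrad class above
dfc = 1.0 / (1.0 + torch.exp(-diff))   # sigmoid of the absolute gradient change
print(dfc)  # roughly [0.50, 0.71, 0.50]; a larger change pushes the coefficient toward 1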
+ """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError('diffGrad does not support sparse gradients, please consider SparseAdam instead') + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p.data) + # Previous gradient + state['previous_grad'] = torch.zeros_like(p.data) + + exp_avg, exp_avg_sq, previous_grad = state['exp_avg'], state['exp_avg_sq'], state['previous_grad'] + beta1, beta2 = group['betas'] + + state['step'] += 1 + + if group['weight_decay'] != 0: + grad.add_(group['weight_decay'], p.data) + + # Decay the first and second moment running average coefficient + exp_avg.mul_(beta1).add_(1 - beta1, grad) + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + denom = exp_avg_sq.sqrt().add_(group['eps']) + + bias_correction1 = 1 - beta1 ** state['step'] + bias_correction2 = 1 - beta2 ** state['step'] + + # compute diffgrad coefficient (dfc) + + + if self.version==0: + diff = abs(previous_grad - grad) + elif self.version ==1: + diff = previous_grad-grad + elif self.version ==2: + diff = .5*abs(previous_grad - grad) + + if self.version==0 or self.version==1: + dfc = 1. / (1. + torch.exp(-diff)) + elif self.version==2: + dfc = 9. / (1. + torch.exp(-diff))-4 #DFC2 = 9/(1+e-(.5/g/)-4 #range .5,5 + + state['previous_grad'] = grad + + # update momentum with dfc + exp_avg1 = exp_avg * dfc + + step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 + + p.data.addcdiv_(-step_size, exp_avg1, denom) + + return loss \ No newline at end of file diff --git a/optimizers/over9000/lamb.py b/optimizers/over9000/lamb.py new file mode 100644 index 0000000..67adb67 --- /dev/null +++ b/optimizers/over9000/lamb.py @@ -0,0 +1,129 @@ +"""Lamb optimizer.""" +# from https://github.com/cybertronai/pytorch-lamb/blob/master/pytorch_lamb/lamb.py + +import collections +import math + +import torch +from torch.optim import Optimizer + +try: + from tensorboardX import SummaryWriter + + def log_lamb_rs(optimizer: Optimizer, event_writer: SummaryWriter, token_count: int): + """Log a histogram of trust ratio scalars in across layers.""" + results = collections.defaultdict(list) + for group in optimizer.param_groups: + for p in group['params']: + state = optimizer.state[p] + for i in ('weight_norm', 'adam_norm', 'trust_ratio'): + if i in state: + results[i].append(state[i]) + + for k, v in results.items(): + event_writer.add_histogram(f'lamb/{k}', torch.tensor(v), token_count) +except ModuleNotFoundError as e: + print("To use this log_lamb_rs, please run 'pip install tensorboardx'. Also you must have Tensorboard running to see results") + +class Lamb(Optimizer): + r"""Implements Lamb algorithm. + + It has been proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes`_. 
+ + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + adam (bool, optional): always use trust ratio = 1, which turns this into + Adam. Useful for comparison purposes. + + .. _Large Batch Optimization for Deep Learning: Training BERT in 76 minutes: + https://arxiv.org/abs/1904.00962 + """ + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, + weight_decay=0, adam=False): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay) + self.adam = adam + super(Lamb, self).__init__(params, defaults) + + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError('Lamb does not support sparse gradients, consider SparseAdam instad.') + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p.data) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + + state['step'] += 1 + + # Decay the first and second moment running average coefficient + # m_t + exp_avg.mul_(beta1).add_(1 - beta1, grad) + # v_t + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + + # Paper v3 does not use debiasing. + # bias_correction1 = 1 - beta1 ** state['step'] + # bias_correction2 = 1 - beta2 ** state['step'] + # Apply bias to lr to avoid broadcast. 
+ step_size = group['lr'] # * math.sqrt(bias_correction2) / bias_correction1 + + weight_norm = p.data.pow(2).sum().sqrt().clamp(0, 10) + + adam_step = exp_avg / exp_avg_sq.sqrt().add(group['eps']) + if group['weight_decay'] != 0: + adam_step.add_(group['weight_decay'], p.data) + + adam_norm = adam_step.pow(2).sum().sqrt() + if weight_norm == 0 or adam_norm == 0: + trust_ratio = 1 + else: + trust_ratio = weight_norm / adam_norm + state['weight_norm'] = weight_norm + state['adam_norm'] = adam_norm + state['trust_ratio'] = trust_ratio + if self.adam: + trust_ratio = 1 + + p.data.add_(-step_size * trust_ratio, adam_step) + + return loss diff --git a/optimizers/over9000/lookahead.py b/optimizers/over9000/lookahead.py new file mode 100644 index 0000000..b54c518 --- /dev/null +++ b/optimizers/over9000/lookahead.py @@ -0,0 +1,103 @@ +from torch.optim import Adam +import torch +from torch.optim import Optimizer +from collections import defaultdict + +class Lookahead(Optimizer): + ''' + PyTorch implementation of the lookahead wrapper. + Lookahead Optimizer: https://arxiv.org/abs/1907.08610 + ''' + def __init__(self, optimizer,alpha=0.5, k=6,pullback_momentum="none"): + ''' + :param optimizer:inner optimizer + :param k (int): number of lookahead steps + :param alpha(float): linear interpolation factor. 1.0 recovers the inner optimizer. + :param pullback_momentum (str): change to inner optimizer momentum on interpolation update + ''' + if not 0.0 <= alpha <= 1.0: + raise ValueError(f'Invalid slow update rate: {alpha}') + if not 1 <= k: + raise ValueError(f'Invalid lookahead steps: {k}') + self.optimizer = optimizer + self.param_groups = self.optimizer.param_groups + self.alpha = alpha + self.k = k + self.step_counter = 0 + assert pullback_momentum in ["reset", "pullback", "none"] + self.pullback_momentum = pullback_momentum + self.state = defaultdict(dict) + + # Cache the current optimizer parameters + for group in self.optimizer.param_groups: + for p in group['params']: + param_state = self.state[p] + param_state['cached_params'] = torch.zeros_like(p.data) + param_state['cached_params'].copy_(p.data) + + def __getstate__(self): + return { + 'state': self.state, + 'optimizer': self.optimizer, + 'alpha': self.alpha, + 'step_counter': self.step_counter, + 'k':self.k, + 'pullback_momentum': self.pullback_momentum + } + + def zero_grad(self): + self.optimizer.zero_grad() + + def state_dict(self): + return self.optimizer.state_dict() + + def load_state_dict(self, state_dict): + self.optimizer.load_state_dict(state_dict) + + def _backup_and_load_cache(self): + """Useful for performing evaluation on the slow weights (which typically generalize better) + """ + for group in self.optimizer.param_groups: + for p in group['params']: + param_state = self.state[p] + param_state['backup_params'] = torch.zeros_like(p.data) + param_state['backup_params'].copy_(p.data) + p.data.copy_(param_state['cached_params']) + + def _clear_and_load_backup(self): + for group in self.optimizer.param_groups: + for p in group['params']: + param_state = self.state[p] + p.data.copy_(param_state['backup_params']) + del param_state['backup_params'] + + def step(self, closure=None): + """Performs a single Lookahead optimization step. + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. 
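As the comment on _backup_and_load_cache above suggests, evaluation is usually run on the slow (cached) weights; a hedged sketch of that pattern, with the import path, inner optimizer, and toy data assumed.

import torch
from torch import nn
from over9000.lookahead import Lookahead   # assumption: module path

model = nn.Linear(4, 1)
opt = Lookahead(torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9), alpha=0.5, k=6)

x, y = torch.randn(16, 4), torch.randn(16, 1)
for _ in range(20):
    opt.zero_grad()
    nn.functional.mse_loss(model(x), y).backward()
    opt.step()

opt._backup_and_load_cache()               # swap the slow weights in for evaluation
with torch.no_grad():
    val_loss = nn.functional.mse_loss(model(x), y).item()
opt._clear_and_load_backup()               # restore the fast weights and keep training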
+ """ + loss = self.optimizer.step(closure) + self.step_counter += 1 + + if self.step_counter >= self.k: + self.step_counter = 0 + # Lookahead and cache the current optimizer parameters + for group in self.optimizer.param_groups: + for p in group['params']: + param_state = self.state[p] + p.data.mul_(self.alpha).add_(1.0 - self.alpha, param_state['cached_params']) # crucial line + param_state['cached_params'].copy_(p.data) + if self.pullback_momentum == "pullback": + internal_momentum = self.optimizer.state[p]["momentum_buffer"] + self.optimizer.state[p]["momentum_buffer"] = internal_momentum.mul_(self.alpha).add_( + 1.0 - self.alpha, param_state["cached_mom"]) + param_state["cached_mom"] = self.optimizer.state[p]["momentum_buffer"] + elif self.pullback_momentum == "reset": + self.optimizer.state[p]["momentum_buffer"] = torch.zeros_like(p.data) + + return loss + +def LookaheadAdam(params, alpha=0.5, k=6, *args, **kwargs): + adam = Adam(params, *args, **kwargs) + return Lookahead(adam, alpha, k) \ No newline at end of file diff --git a/optimizers/over9000/madam.py b/optimizers/over9000/madam.py new file mode 100644 index 0000000..4212701 --- /dev/null +++ b/optimizers/over9000/madam.py @@ -0,0 +1,47 @@ +# from here https://github.com/jxbz/madam/blob/master/pytorch/optim/madam.py +import torch +from torch.optim.optimizer import Optimizer, required + + +class Madam(Optimizer): + + def __init__(self, params, lr=0.01, p_scale=3.0, g_bound=10.0): + + self.p_scale = p_scale + self.g_bound = g_bound + defaults = dict(lr=lr) + super(Madam, self).__init__(params, defaults) + + def step(self, closure=None): + """Performs a single optimization step. + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + + state = self.state[p] + if len(state) == 0: + state['max'] = self.p_scale*(p*p).mean().sqrt().item() + state['step'] = 0 + state['exp_avg_sq'] = torch.zeros_like(p) + + state['step'] += 1 + bias_correction = 1 - 0.999 ** state['step'] + state['exp_avg_sq'] = 0.999 * state['exp_avg_sq'] + 0.001 * p.grad.data**2 + + g_normed = p.grad.data / (state['exp_avg_sq']/bias_correction).sqrt() + g_normed[torch.isnan(g_normed)] = 0 + g_normed.clamp_(-self.g_bound, self.g_bound) + + p.data *= torch.exp( -group['lr']*g_normed*torch.sign(p.data) ) + p.data.clamp_(-state['max'], state['max']) + + return loss diff --git a/optimizers/over9000/madgrad.py b/optimizers/over9000/madgrad.py new file mode 100644 index 0000000..bcbc90e --- /dev/null +++ b/optimizers/over9000/madgrad.py @@ -0,0 +1,174 @@ +# from https://github.com/raw/facebookresearch/madgrad/master/madgrad/madgrad.py +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math +from typing import TYPE_CHECKING, Any, Callable, Optional + +import torch +import torch.optim + +if TYPE_CHECKING: + from torch.optim.optimizer import _params_t +else: + _params_t = Any + + +class MADGRAD(torch.optim.Optimizer): + """ + MADGRAD_: A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic + Optimization. + + .. _MADGRAD: https://arxiv.org/abs/2101.11075 + + MADGRAD is a general purpose optimizer that can be used in place of SGD or + Adam may converge faster and generalize better. Currently GPU-only. 
+ Typically, the same learning rate schedule that is used for SGD or Adam may + be used. The overall learning rate is not comparable to either method and + should be determined by a hyper-parameter sweep. + + MADGRAD requires less weight decay than other methods, often as little as + zero. Momentum values used for SGD or Adam's beta1 should work here also. + + On sparse problems both weight_decay and momentum should be set to 0. + + Arguments: + params (iterable): + Iterable of parameters to optimize or dicts defining parameter groups. + lr (float): + Learning rate (default: 1e-2). + momentum (float): + Momentum value in the range [0,1) (default: 0.9). + weight_decay (float): + Weight decay, i.e. a L2 penalty (default: 0). + eps (float): + Term added to the denominator outside of the root operation to improve numerical stability. (default: 1e-6). + """ + + def __init__( + self, params: _params_t, lr: float = 1e-2, momentum: float = 0.9, weight_decay: float = 0, eps: float = 1e-6, + ): + if momentum < 0 or momentum >= 1: + raise ValueError(f"Momentum {momentum} must be in the range [0,1]") + if lr <= 0: + raise ValueError(f"Learning rate {lr} must be positive") + if weight_decay < 0: + raise ValueError(f"Weight decay {weight_decay} must be non-negative") + if eps < 0: + raise ValueError(f"Eps must be non-negative") + + defaults = dict(lr=lr, eps=eps, momentum=momentum, weight_decay=weight_decay, k=0) + super().__init__(params, defaults) + + for group in self.param_groups: + for p in group["params"]: + state = self.state[p] + + state["grad_sum_sq"] = torch.zeros_like(p.data).detach() + state["s"] = torch.zeros_like(p.data).detach() + if momentum != 0: + state["x0"] = torch.clone(p.data).detach() + + @property + def supports_memory_efficient_fp16(self) -> bool: + return False + + @property + def supports_flat_params(self) -> bool: + return True + + def step(self, closure: Optional[Callable[[], float]] = None) -> Optional[float]: + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. 
+ """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + eps = group["eps"] + k = group["k"] + lr = group["lr"] + eps + decay = group["weight_decay"] + momentum = group["momentum"] + + ck = 1 - momentum + lamb = lr * math.pow(k + 1, 0.5) + + for p in group["params"]: + if p.grad is None: + continue + grad = p.grad.data + state = self.state[p] + + if momentum != 0.0 and grad.is_sparse: + raise RuntimeError("momentum != 0 is not compatible with sparse gradients") + + grad_sum_sq = state["grad_sum_sq"] + s = state["s"] + + # Apply weight decay + if decay != 0: + if grad.is_sparse: + raise RuntimeError("weight_decay option is not compatible with sparse gradients") + + grad.add_(p.data, alpha=decay) + + if grad.is_sparse: + grad = grad.coalesce() + grad_val = grad._values() + + p_masked = p.sparse_mask(grad) + grad_sum_sq_masked = grad_sum_sq.sparse_mask(grad) + s_masked = s.sparse_mask(grad) + + # Compute x_0 from other known quantities + rms_masked_vals = grad_sum_sq_masked._values().pow(1 / 3).add_(eps) + x0_masked_vals = p_masked._values().addcdiv(s_masked._values(), rms_masked_vals, value=1) + + # Dense + sparse op + grad_sq = grad * grad + grad_sum_sq.add_(grad_sq, alpha=lamb) + grad_sum_sq_masked.add_(grad_sq, alpha=lamb) + + rms_masked_vals = grad_sum_sq_masked._values().pow_(1 / 3).add_(eps) + + s.add_(grad, alpha=lamb) + s_masked._values().add_(grad_val, alpha=lamb) + + # update masked copy of p + p_kp1_masked_vals = x0_masked_vals.addcdiv(s_masked._values(), rms_masked_vals, value=-1) + # Copy updated masked p to dense p using an add operation + p_masked._values().add_(p_kp1_masked_vals, alpha=-1) + p.data.add_(p_masked, alpha=-1) + else: + if momentum == 0: + # Compute x_0 from other known quantities + rms = grad_sum_sq.pow(1 / 3).add_(eps) + x0 = p.data.addcdiv(s, rms, value=1) + else: + x0 = state["x0"] + + # Accumulate second moments + grad_sum_sq.addcmul_(grad, grad, value=lamb) + rms = grad_sum_sq.pow(1 / 3).add_(eps) + + # Update s + s.data.add_(grad, alpha=lamb) + + # Step + if momentum == 0: + p.data.copy_(x0.addcdiv(s, rms, value=-1)) + else: + z = x0.addcdiv(s, rms, value=-1) + + # p is a moving average of z + p.data.mul_(1 - ck).add_(z, alpha=ck) + + group["k"] = group["k"] + 1 + return loss diff --git a/optimizers/over9000/novograd.py b/optimizers/over9000/novograd.py new file mode 100644 index 0000000..5eea065 --- /dev/null +++ b/optimizers/over9000/novograd.py @@ -0,0 +1,223 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from torch.optim import Optimizer +import math + +class AdamW(Optimizer): + """Implements AdamW algorithm. + + It has been proposed in `Adam: A Method for Stochastic Optimization`_. 
+ + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + amsgrad (boolean, optional): whether to use the AMSGrad variant of this + algorithm from the paper `On the Convergence of Adam and Beyond`_ + + Adam: A Method for Stochastic Optimization: + https://arxiv.org/abs/1412.6980 + On the Convergence of Adam and Beyond: + https://openreview.net/forum?id=ryQu7f-RZ + """ + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, + weight_decay=0, amsgrad=False): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, amsgrad=amsgrad) + super(AdamW, self).__init__(params, defaults) + + def __setstate__(self, state): + super(AdamW, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('amsgrad', False) + + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') + amsgrad = group['amsgrad'] + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p.data) + if amsgrad: + # Maintains max of all exp. moving avg. of sq. grad. values + state['max_exp_avg_sq'] = torch.zeros_like(p.data) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + if amsgrad: + max_exp_avg_sq = state['max_exp_avg_sq'] + beta1, beta2 = group['betas'] + + state['step'] += 1 + # Decay the first and second moment running average coefficient + exp_avg.mul_(beta1).add_(1 - beta1, grad) + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + if amsgrad: + # Maintains the maximum of all 2nd moment running avg. till now + torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) + # Use the max. for normalizing running avg. of gradient + denom = max_exp_avg_sq.sqrt().add_(group['eps']) + else: + denom = exp_avg_sq.sqrt().add_(group['eps']) + + bias_correction1 = 1 - beta1 ** state['step'] + bias_correction2 = 1 - beta2 ** state['step'] + step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 + p.data.add_(-step_size, torch.mul(p.data, group['weight_decay']).addcdiv_(1, exp_avg, denom) ) + + return loss + +class Novograd(Optimizer): + """ + Implements Novograd algorithm. 
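In the AdamW variant above, weight decay is folded into the parameter update itself, p <- p - step_size * (weight_decay * p + m / (sqrt(v) + eps)), rather than being added to the gradient. A hedged usage sketch; the import path and data are assumptions.

import torch
from torch import nn
from over9000.novograd import AdamW        # assumption: this module also exposes the AdamW above

model = nn.Linear(4, 1)
opt = AdamW(model.parameters(), lr=1e-3, betas=(0.9, 0.999), weight_decay=1e-2)

x, y = torch.randn(32, 4), torch.randn(32, 1)
for _ in range(10):
    opt.zero_grad()
    nn.functional.mse_loss(model(x), y).backward()
    opt.step()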
+ + Args: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.95, 0)) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + grad_averaging: gradient averaging + amsgrad (boolean, optional): whether to use the AMSGrad variant of this + algorithm from the paper `On the Convergence of Adam and Beyond`_ + (default: False) + """ + + def __init__(self, params, lr=1e-3, betas=(0.95, 0), eps=1e-8, + weight_decay=0, grad_averaging=False, amsgrad=False): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, + grad_averaging=grad_averaging, + amsgrad=amsgrad) + + super(Novograd, self).__init__(params, defaults) + + def __setstate__(self, state): + super(Novograd, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('amsgrad', False) + + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError('Sparse gradients are not supported.') + amsgrad = group['amsgrad'] + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros([]).to(state['exp_avg'].device) + if amsgrad: + # Maintains max of all exp. moving avg. of sq. grad. values + state['max_exp_avg_sq'] = torch.zeros([]).to(state['exp_avg'].device) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + if amsgrad: + max_exp_avg_sq = state['max_exp_avg_sq'] + beta1, beta2 = group['betas'] + + state['step'] += 1 + + norm = torch.sum(torch.pow(grad, 2)) + + if exp_avg_sq == 0: + exp_avg_sq.copy_(norm) + else: + exp_avg_sq.mul_(beta2).add_(1 - beta2, norm) + + if amsgrad: + # Maintains the maximum of all 2nd moment running avg. till now + torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) + # Use the max. for normalizing running avg. 
of gradient + denom = max_exp_avg_sq.sqrt().add_(group['eps']) + else: + denom = exp_avg_sq.sqrt().add_(group['eps']) + + grad.div_(denom) + if group['weight_decay'] != 0: + grad.add_(group['weight_decay'], p.data) + if group['grad_averaging']: + grad.mul_(1 - beta1) + exp_avg.mul_(beta1).add_(grad) + + p.data.add_(-group['lr'], exp_avg) + + return loss \ No newline at end of file diff --git a/optimizers/over9000/radam.py b/optimizers/over9000/radam.py new file mode 100644 index 0000000..74d7e0c --- /dev/null +++ b/optimizers/over9000/radam.py @@ -0,0 +1,209 @@ +# from https://github.com/LiyuanLucasLiu/RAdam/blob/master/radam.py + +import math +import torch +from torch.optim.optimizer import Optimizer, required + +class RAdam(Optimizer): + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) + self.buffer = [[None, None, None] for ind in range(10)] + super(RAdam, self).__init__(params, defaults) + + def __setstate__(self, state): + super(RAdam, self).__setstate__(state) + + def step(self, closure=None): + + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data.float() + if grad.is_sparse: + raise RuntimeError('RAdam does not support sparse gradients') + + p_data_fp32 = p.data.float() + + state = self.state[p] + + if len(state) == 0: + state['step'] = 0 + state['exp_avg'] = torch.zeros_like(p_data_fp32) + state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) + else: + state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) + state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + exp_avg.mul_(beta1).add_(1 - beta1, grad) + + state['step'] += 1 + buffered = self.buffer[int(state['step'] % 10)] + if state['step'] == buffered[0]: + N_sma, step_size = buffered[1], buffered[2] + else: + buffered[0] = state['step'] + beta2_t = beta2 ** state['step'] + N_sma_max = 2 / (1 - beta2) - 1 + N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) + buffered[1] = N_sma + + # more conservative since it's an approximated value + if N_sma >= 5: + step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) + else: + step_size = 1.0 / (1 - beta1 ** state['step']) + buffered[2] = step_size + + if group['weight_decay'] != 0: + p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) + + # more conservative since it's an approximated value + if N_sma >= 5: + denom = exp_avg_sq.sqrt().add_(group['eps']) + p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg, denom) + else: + p_data_fp32.add_(-step_size * group['lr'], exp_avg) + + p.data.copy_(p_data_fp32) + + return loss + +class PlainRAdam(Optimizer): + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) + + super(PlainRAdam, self).__init__(params, defaults) + + def __setstate__(self, state): + super(PlainRAdam, self).__setstate__(state) + + def step(self, closure=None): + + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data.float() + 
if grad.is_sparse: + raise RuntimeError('RAdam does not support sparse gradients') + + p_data_fp32 = p.data.float() + + state = self.state[p] + + if len(state) == 0: + state['step'] = 0 + state['exp_avg'] = torch.zeros_like(p_data_fp32) + state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) + else: + state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) + state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + exp_avg.mul_(beta1).add_(1 - beta1, grad) + + state['step'] += 1 + beta2_t = beta2 ** state['step'] + N_sma_max = 2 / (1 - beta2) - 1 + N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) + + if group['weight_decay'] != 0: + p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) + + # more conservative since it's an approximated value + if N_sma >= 5: + step_size = group['lr'] * math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) + denom = exp_avg_sq.sqrt().add_(group['eps']) + p_data_fp32.addcdiv_(-step_size, exp_avg, denom) + else: + step_size = group['lr'] / (1 - beta1 ** state['step']) + p_data_fp32.add_(-step_size, exp_avg) + + p.data.copy_(p_data_fp32) + + return loss + + +class AdamW(Optimizer): + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, warmup = 0): + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, warmup = warmup) + super(AdamW, self).__init__(params, defaults) + + def __setstate__(self, state): + super(AdamW, self).__setstate__(state) + + def step(self, closure=None): + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data.float() + if grad.is_sparse: + raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') + + p_data_fp32 = p.data.float() + + state = self.state[p] + + if len(state) == 0: + state['step'] = 0 + state['exp_avg'] = torch.zeros_like(p_data_fp32) + state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) + else: + state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) + state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + + state['step'] += 1 + + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + exp_avg.mul_(beta1).add_(1 - beta1, grad) + + denom = exp_avg_sq.sqrt().add_(group['eps']) + bias_correction1 = 1 - beta1 ** state['step'] + bias_correction2 = 1 - beta2 ** state['step'] + + if group['warmup'] > state['step']: + scheduled_lr = 1e-8 + state['step'] * group['lr'] / group['warmup'] + else: + scheduled_lr = group['lr'] + + step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 + + if group['weight_decay'] != 0: + p_data_fp32.add_(-group['weight_decay'] * scheduled_lr, p_data_fp32) + + p_data_fp32.addcdiv_(-step_size, exp_avg, denom) + + p.data.copy_(p_data_fp32) + + return loss diff --git a/optimizers/over9000/ralamb.py b/optimizers/over9000/ralamb.py new file mode 100644 index 0000000..f7036c8 --- /dev/null +++ b/optimizers/over9000/ralamb.py @@ -0,0 +1,99 @@ +import torch, math +from torch.optim.optimizer import Optimizer + +# RAdam + LARS +class Ralamb(Optimizer): + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), 
eps=1e-8, weight_decay=0): + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) + self.buffer = [[None, None, None] for ind in range(10)] + super(Ralamb, self).__init__(params, defaults) + + def __setstate__(self, state): + super(Ralamb, self).__setstate__(state) + + def step(self, closure=None): + + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data.float() + if grad.is_sparse: + raise RuntimeError('Ralamb does not support sparse gradients') + + p_data_fp32 = p.data.float() + + state = self.state[p] + + if len(state) == 0: + state['step'] = 0 + state['exp_avg'] = torch.zeros_like(p_data_fp32) + state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) + else: + state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) + state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + + # Decay the first and second moment running average coefficient + # m_t + exp_avg.mul_(beta1).add_(1 - beta1, grad) + # v_t + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + + state['step'] += 1 + buffered = self.buffer[int(state['step'] % 10)] + + if state['step'] == buffered[0]: + N_sma, radam_step_size = buffered[1], buffered[2] + else: + buffered[0] = state['step'] + beta2_t = beta2 ** state['step'] + N_sma_max = 2 / (1 - beta2) - 1 + N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) + buffered[1] = N_sma + + # more conservative since it's an approximated value + if N_sma >= 5: + radam_step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) + else: + radam_step_size = 1.0 / (1 - beta1 ** state['step']) + buffered[2] = radam_step_size + + if group['weight_decay'] != 0: + p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) + + # more conservative since it's an approximated value + radam_step = p_data_fp32.clone() + if N_sma >= 5: + denom = exp_avg_sq.sqrt().add_(group['eps']) + radam_step.addcdiv_(-radam_step_size * group['lr'], exp_avg, denom) + else: + radam_step.add_(-radam_step_size * group['lr'], exp_avg) + + radam_norm = radam_step.pow(2).sum().sqrt() + weight_norm = p.data.pow(2).sum().sqrt().clamp(0, 10) + if weight_norm == 0 or radam_norm == 0: + trust_ratio = 1 + else: + trust_ratio = weight_norm / radam_norm + + state['weight_norm'] = weight_norm + state['adam_norm'] = radam_norm + state['trust_ratio'] = trust_ratio + + if N_sma >= 5: + p_data_fp32.addcdiv_(-radam_step_size * group['lr'] * trust_ratio, exp_avg, denom) + else: + p_data_fp32.add_(-radam_step_size * group['lr'] * trust_ratio, exp_avg) + + p.data.copy_(p_data_fp32) + + return loss diff --git a/optimizers/over9000/ranger.py b/optimizers/over9000/ranger.py new file mode 100644 index 0000000..b6c8218 --- /dev/null +++ b/optimizers/over9000/ranger.py @@ -0,0 +1,12 @@ + +import math +import torch +from torch.optim.optimizer import Optimizer, required +import itertools as it +from .lookahead import * +from .radam import * + +def Ranger(params, alpha=0.5, k=6, *args, **kwargs): + radam = RAdam(params, *args, **kwargs) + return Lookahead(radam, alpha, k) + diff --git a/optimizers/over9000/rangerlars.py b/optimizers/over9000/rangerlars.py new file mode 100644 index 0000000..3fcb39e --- /dev/null +++ b/optimizers/over9000/rangerlars.py @@ -0,0 +1,14 @@ +import torch, math +from 
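Ranger, defined just above, is a factory that wraps RAdam in Lookahead rather than an Optimizer subclass, so extra keyword arguments flow through to RAdam. A hedged sketch; the import path and data are assumptions.

import torch
from torch import nn
from over9000.ranger import Ranger         # assumption: module path

model = nn.Linear(4, 1)
# alpha and k go to Lookahead; lr, betas, eps, weight_decay pass through to RAdam
opt = Ranger(model.parameters(), alpha=0.5, k=6, lr=1e-3, weight_decay=0)

x, y = torch.randn(32, 4), torch.randn(32, 1)
for _ in range(12):                        # every 6th step pulls the fast weights toward the slow weights
    opt.zero_grad()
    nn.functional.mse_loss(model(x), y).backward()
    opt.step()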
torch.optim.optimizer import Optimizer +import itertools as it +from .lookahead import * +from .ralamb import * + +# RAdam + LARS + LookAHead + +# Lookahead implementation from https://github.com/lonePatient/lookahead_pytorch/blob/master/optimizer.py +# RAdam + LARS implementation from https://gist.github.com/redknightlois/c4023d393eb8f92bb44b2ab582d7ec20 + +def RangerLars(params, alpha=0.5, k=6, *args, **kwargs): + ralamb = Ralamb(params, *args, **kwargs) + return Lookahead(ralamb, alpha, k) From 2f547eda497d04bc044df31376d860f1557561aa Mon Sep 17 00:00:00 2001 From: Nour shoaib <82571628+nourshoaib@users.noreply.github.com> Date: Mon, 25 Dec 2023 13:51:57 +0200 Subject: [PATCH 05/20] Add files via upload --- visualization/visualize.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 visualization/visualize.py diff --git a/visualization/visualize.py b/visualization/visualize.py new file mode 100644 index 0000000..e69de29 From 5f0179de535a5f59f0c1d3755bc7e775a810ea31 Mon Sep 17 00:00:00 2001 From: Nour shoaib <82571628+nourshoaib@users.noreply.github.com> Date: Tue, 26 Dec 2023 08:23:22 +0200 Subject: [PATCH 06/20] Update visualize.py --- visualization/visualize.py | 54 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/visualization/visualize.py b/visualization/visualize.py index e69de29..3df1c68 100644 --- a/visualization/visualize.py +++ b/visualization/visualize.py @@ -0,0 +1,54 @@ +class Visualization(): + def mask2rgb(mask,max_value=1.0): + shape = mask.shape + if len(shape) == 2: + mask = mask[:,:,np.newaxis] + h,w,c = mask.shape + if c == 3: + return mask + if c == 4: + return mask[:,:,:3] + + if c > 4: + raise ValueError + + padded = np.zeros((h,w,3),dtype=mask.dtype) + padded[:,:,:c] = mask + padded = (padded * max_value).astype(np.uint8) + + return padded + + + def make_rgb_mask(mask,color=(255,0,0)): + h,w = mask.shape[:2] + rgb = np.zeros((h,w,3),dtype=np.uint8) + rgb[mask == 1.0,:] = color + return rgb + + def overlay_rgb_mask(img,mask,sel,alpha): + + sel = sel == 1.0 + img[sel,:] = img[sel,:] * (1.0 - alpha) + mask[sel,:] * alpha + return img + + def overlay_instances_mask(img,instances,cmap,alpha=0.9): + h,w = img.shape[:2] + overlay = np.zeros((h,w,3),dtype=np.float32) + + _max = instances.max() + _cmax = cmap.shape[0] + + + if _max == 0: + return img + elif _max > _cmax: + indexes = [(i % _cmax) for i in range(_max)] + else: + indexes = random.sample(range(0,_cmax),_max) + + for i,idx in enumerate(indexes): + overlay[instances == i+1,:] = cmap[idx,:] + + overlay = (overlay * 255.0).astype(np.uint8) + viz = overlay_rgb_mask(img,overlay,instances>0,alpha=alpha) + return viz From af141700b55b0096bc21a900211667d76b3dd9f0 Mon Sep 17 00:00:00 2001 From: MohammadHasanZahweh <108551680+MohammadHasanZahweh@users.noreply.github.com> Date: Tue, 26 Dec 2023 16:37:45 +0200 Subject: [PATCH 07/20] added scheduler tests --- schedulers/test.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 schedulers/test.py diff --git a/schedulers/test.py b/schedulers/test.py new file mode 100644 index 0000000..b9ce96c --- /dev/null +++ b/schedulers/test.py @@ -0,0 +1,17 @@ +import unittest +import torch +from torch import nn +from optimizers import get_optimizer +from schedulers import AutoScheduler,scheduler_mapping,get_scheduler +import numpy as np + + +class test_optimizers(unittest.TestCase): + def test_keys_lower_case(self): + for schechduler in scheduler_mapping: + 
self.assertEqual(schechduler,schechduler.lower()) + + + +if __name__=="__main__": + unittest.main() From 7ef439da0ed281f384005fdfa25f1cb24f69c69e Mon Sep 17 00:00:00 2001 From: MohammadHasanZahweh <108551680+MohammadHasanZahweh@users.noreply.github.com> Date: Tue, 26 Dec 2023 16:39:52 +0200 Subject: [PATCH 08/20] Update test.py to follow naming convention --- schedulers/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schedulers/test.py b/schedulers/test.py index b9ce96c..03360ad 100644 --- a/schedulers/test.py +++ b/schedulers/test.py @@ -6,7 +6,7 @@ import numpy as np -class test_optimizers(unittest.TestCase): +class testScheduler(unittest.TestCase): def test_keys_lower_case(self): for schechduler in scheduler_mapping: self.assertEqual(schechduler,schechduler.lower()) From 3eb66ddb893cb67663b0a72938d11e94b04992c0 Mon Sep 17 00:00:00 2001 From: MohammadHasanZahweh <108551680+MohammadHasanZahweh@users.noreply.github.com> Date: Tue, 26 Dec 2023 16:42:29 +0200 Subject: [PATCH 09/20] Update test_optimers.py fixed the optimizer test case --- optimizers/test_optimers.py | 51 ++++++++++++++++++++++++++++++++----- 1 file changed, 44 insertions(+), 7 deletions(-) diff --git a/optimizers/test_optimers.py b/optimizers/test_optimers.py index a2176f3..5f6c783 100644 --- a/optimizers/test_optimers.py +++ b/optimizers/test_optimers.py @@ -1,15 +1,52 @@ import unittest import torch +from torch import nn +from . import get_optimizer,optimizer_mapping +import numpy as np -from __init__ import get_optimizer,optimizer_mapping - -class test_optimizers(unittest.TestCase): +class testOptimizers(unittest.TestCase): def test_getoptimzer_return(self): for optimizer in optimizer_mapping: - net=torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True) - self.assertIs(get_optimizer(optimizer,net.parameters()),torch.optim.Optimizer,f"{optimizer} did not return an optimizer") - + net=nn.Sequential( + nn.Linear(2,1), + ) + optim=get_optimizer(optimizer,net.parameters(),lr=1e-3) + optim.zero_grad() + optim.step() + self.assertIsInstance(optim,torch.optim.Optimizer,f"{optimizer} did not return an optimizer") + + def test_keys_lower_case(self): + for optim in optimizer_mapping: + self.assertEqual(optim,optim.lower()) + + def test_getoptimzer_functionality(self): + for optimizer in optimizer_mapping: + for __ in range(5): + net=nn.Sequential( + nn.Linear(2,1), + ) + optim=get_optimizer(optimizer,net.parameters(),lr=1e-3) + loss1=[] + for _ in range(500): + optim.zero_grad() + X=torch.randint(0,10,(50,2),dtype=torch.float) + y=X.sum(axis=1) + loss=nn.MSELoss()(y,net(X)) + loss.backward() + loss1.append(loss.item()) + optim.step() + loss2=[] + for _ in range(500): + optim.zero_grad() + X=torch.randint(0,10,(50,2),dtype=torch.float) + y=X.sum(axis=1) + loss=nn.MSELoss()(y,net(X)) + loss.backward() + loss2.append(loss.item()) + optim.step() + self.assertGreaterEqual(np.array(loss1).mean(),np.array(loss2).mean(),f"{optimizer} is not functioning as an optimizer") if __name__=="__main__": - test_optimizers().main() + unittest.main() + From 8247a0344d91be267b137a23df911faaea5f04d0 Mon Sep 17 00:00:00 2001 From: aghand0ur Date: Wed, 27 Dec 2023 19:13:13 +0200 Subject: [PATCH 10/20] 2021 code by Hasan --- ToBeChecked/DemoTransformation_py.py | 83 ++++++++++++++++ ToBeChecked/more_transormations.py | 86 ++++++++++++++++ ToBeChecked/utils.py | 141 +++++++++++++++++++++++++++ 3 files changed, 310 insertions(+) create mode 100644 ToBeChecked/DemoTransformation_py.py create mode 100644 
ToBeChecked/more_transormations.py create mode 100644 ToBeChecked/utils.py diff --git a/ToBeChecked/DemoTransformation_py.py b/ToBeChecked/DemoTransformation_py.py new file mode 100644 index 0000000..a38a753 --- /dev/null +++ b/ToBeChecked/DemoTransformation_py.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Fri Jun 4 12:31:21 2021 + +@author: hasan +""" +import geopandas as gpd +import os +from shapely.geometry import Polygon,MultiPolygon +from utils import pix_to_utm, utm_to_pix, _tf, parse_tfw,tf_upper,tf_utm +import matplotlib.pyplot as plt +from skimage.io import imread + +tfw_map = { + 'LEBANON_2013_50CM_RGB2_7' : 'LEBANON_2013_50CM_NRGB2_7.tfw', + 'LEBANON_2013_50CM_RGB3_5' : 'LEBANON_2013_50CM_NRGB3_5.tfw' + } + +tfws_path = '../tfw_files' +shape_path = '../pred_shapefile/pred_csv.shp' +imgs_path = '../visualized_results_inference' + +Image_ID = 'LEBANON_2013_50CM_RGB2_7_31744_27648' +img_path = os.path.join(imgs_path,Image_ID,f'{Image_ID}_img.png') +img = imread(img_path) + +print('----------------------------------------------------------------------') +#load the ShapeFile as a geo-DataFrame +gdf = gpd.read_file(shape_path) + +#get rows corresponding to building polygon of the selected Image_ID +gdf_sample = gdf.loc[gdf['ImageId'] == Image_ID] +gdf_sample.reset_index(inplace = True,drop = True) + +pixel_polys = gdf_sample['geometry'] +n_polys = len(pixel_polys) +print(gdf_sample) +print(f'There are {n_polys} polygons(building footprints) in Image {Image_ID}') +print('----------------------------------------------------------------------') + +# Now Get the x_offset and y_offset from the Image_ID +split_iid = Image_ID.split('_') +x_offset , y_offset = int(split_iid[-1]), int(split_iid[-2]) + +print(f'For Image {Image_ID} : \nX_offset : {x_offset}\nY_offset : {y_offset}') +print('----------------------------------------------------------------------') + +#Now get the corresponf .TFW file path +Image_Tiff_ID = '_'.join(split_iid[:-2]) +TFW_ID = tfw_map[Image_Tiff_ID] +tfw_path = os.path.join(tfws_path,TFW_ID) + +print(f'The Corresponding .tfw File for {Image_ID} is : {TFW_ID}') +# parse the Georeferencing Parameters +A,B,C,D,E,F = parse_tfw(tfw_path) + +#Add Offset to pixel_polys +pixel_poly_coordinates = [] +for p in pixel_polys: + if(p.geom_type == 'Polygon'): + coords = list(p.exterior.coords) + pixel_poly_coordinates.append(coords) + else: + #ignore multipolygons for now + pass +n_polys = len(pixel_poly_coordinates) +xs = [x_offset] * n_polys +ys = [y_offset] * n_polys + +offset_polys = tf_upper(pixel_poly_coordinates,xs,ys) + +#Now Transform To GeoReferenced Coordinates +georeferenced_polys = tf_utm(offset_polys,pix_to_utm,A,B,C,D,E,F) +#And Put them In a DataFrame +gdf_referenced = gpd.GeoDataFrame({'geometry' : georeferenced_polys}) + + +fig,axs = plt.subplots(1,2,figsize = (10,10)) +axs[0].imshow(img) +gdf_referenced.plot(ax = axs[1]) + + diff --git a/ToBeChecked/more_transormations.py b/ToBeChecked/more_transormations.py new file mode 100644 index 0000000..c9da895 --- /dev/null +++ b/ToBeChecked/more_transormations.py @@ -0,0 +1,86 @@ +from shapely.geometry import Polygon,MultiPolygon +import types +​ +def pix_to_utm(crd,A,B,C,D,E,F): + #convert a single pixel's (x,y) coordinates into the georeferenced coordinates + #A,B,C,D,E,F are parameters that can be read from the TFW file ... 
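The demo above relies on the world-file affine transform; a self-contained sketch of the forward and inverse mappings follows (restated here so it runs on its own), with made-up A..F values standing in for the numbers a real .tfw file would hold.

def pix_to_utm(crd, A, B, C, D, E, F):
    # pixel (x, y) -> georeferenced (easting, northing)
    x, y = crd
    return x * A + y * B + C, x * D + y * E + F

def utm_to_pix(crd, A, B, C, D, E, F):
    # georeferenced (easting, northing) -> pixel (x, y)
    x1, y1 = crd
    det = A * E - D * B
    return (E * x1 + B * y1 + B * F - E * C) / det, (-D * x1 + A * y1 + D * C - A * F) / det

# made-up parameters: 50 cm pixels, north-up image, arbitrary top-left corner
A, B, C, D, E, F = 0.5, 0.0, 700000.0, 0.0, -0.5, 3800000.0
utm = pix_to_utm((128, 256), A, B, C, D, E, F)
assert utm_to_pix(utm, A, B, C, D, E, F) == (128.0, 256.0)   # exact inverse for these values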
+ # associated with the TIFF File that The Tile was taken from + x,y=crd + return x*A+y*B+C,x*D+y*E+F +​ +def utm_to_pix(crd,A,B,C,D,E,F): + #convert a single pixel's georeferenced coords (lon,lat) coordinates into the pixel's coords + x1,y1 = crd + return (E * x1 + B * y1 + B * F - E * C)/(A * E - D * B),(-D * x1 + A * y1 + D * C - A * F )/(A *E - D * B) +​ +def tf(func,poly,A,B,C,D,E,F): + #apply a transformation to a list of coordinates + #transformation options ( pix_to_utm & utm_to_pix) + k=[] + for tup in poly: + k.append(func(tup,A,B,C,D,E,F)) + return k +​ +def tf_polygon(func,poly,A,B,C,D,E,F): + #apply transformation to shapely.geometry polygon + k = tf(func,list(poly.exterior.coords),A,B,C,D,E,F) + return Polygon(k) +​ +def tf_multipoly(func,mpoly,A,B,C,D,E,F): + #apply transformation to shapely.geometry multi-polygon + k=[] + for poly in list(mpoly): + k.append(tf_polygon(func,poly,A,B,C,D,E,F)) + return MultiPolygon(k) +​ +def _tf(func,coords,A,B,C,D,E,F): + try: + typ = coords.geom_type + except AttributeError: + if(type(coords) is list): + typ = 'List' + else: + typ='ínvalid' + except: + typ='ínvalid' + if(typ == 'ínvalid'): + raise AttributeError('INVALID TYPE FOR COORDS') + elif(typ == 'MultiPolygon'): + return tf_multipoly(func,coords,A,B,C,D,E,F) + elif(typ == 'Polygon'): + return tf_polygon(func,coords,A,B,C,D,E,F) + else: + return tf(func,coords,A,B,C,D,E,F) + +def parse_tfw(path): + #given the path of the .TFW File, this function returs A,B,C,D,E,F params ... + #needed for the conversion between pixel and georeferenced coordinates + with open(path,'r') as lol: + a=lol.read() + a=a.split('\n') + permutation = [0,2,4,1,3,5] + #return [float(a[i].strip(' ')) for i in permutation] + #or + return list(map(lambda i : float(a[i].strip(' ')),permutation)) +​ +def tf_upper(polys,x,y): + #given a list of polygons + #and a list of x & y offsets from top left corner + #this function adds the offsets to the polygon's pixel coords + ipolys=[] + for i,poly in enumerate(polys): + k=[] + for tup in poly: + k.append((x[i]+tup[0],y[i]+tup[1])) + ipolys.append(k) + npolys =[Polygon(poly) for poly in ipolys] + return npolys + +def tf_utm(polys,func,A,B,C,D,E,F): + #transform all the polys in a list from pixel to georeferenced coordinates + #the input polygon should have pixel coordinates w.r.t top-left corner + #( i.e : the offsets should be added using previous function (tf_upper)) + ipolys=[] + for poly in polys: + ipolys.append(_tf(func,poly,A,B,C,D,E,F)) + return ipolys \ No newline at end of file diff --git a/ToBeChecked/utils.py b/ToBeChecked/utils.py new file mode 100644 index 0000000..ca59ce7 --- /dev/null +++ b/ToBeChecked/utils.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Mon Feb 8 17:56:35 2021 + +@author: hasan +""" +#useful functions :D +import geopandas as gpd +import rasterio as rio +from shapely.geometry import Polygon,box +from mercantile import bounds +from supermercado.burntiles import burn +from tqdm import tqdm,trange +from math import ceil + +global our_crs +our_crs = 'WGS84' + +def get_fitting_box_from_box(bbox,width,height): + minx,miny,maxx,maxy = bbox.bounds + cx = (minx+maxx) // 2 + cy = (miny+maxy) // 2 + + box_w = maxx - minx + box_h = maxy - miny + + fbox_w = ceil(box_w / 512) * 512 + fbox_h = ceil(box_h / 512) * 512 + + gap_left = fbox_w // 2 - (cx - max(0,cx - fbox_w // 2)) + gap_right = fbox_w // 2 - (min(width,cx + fbox_w // 2) - cx) + gap_up = fbox_h // 2 - (cy - max(0,cy - fbox_h // 2)) + gap_down = fbox_h // 2 
- (min(height,cy + fbox_h // 2) - cy) + + fb_minx = cx - (fbox_w // 2 + gap_right - gap_left) + fb_maxx = cx + (fbox_w // 2 + gap_left - gap_right) + fb_miny = cy - (fbox_h // 2 + gap_down - gap_up) + fb_maxy = cy + (fbox_h // 2 + gap_up - gap_down) + + fbox = box(fb_minx,fb_miny,fb_maxx,fb_maxy) + return fbox + +def poly_lonlat2pix(poly,bbox_bounds,img,width = None,height = None): + if(img is not None): + h,w = img.shape[:2] + elif(None not in set([width,height])): + h,w = height,width + else: + raise ValueError('Either Image or height and width should not be None') + transform = rio.transform.from_bounds(*bbox_bounds,w,h) + xs,ys = poly.exterior.xy + rows,cols = rio.transform.rowcol(transform,xs,ys) + coords = list(zip(cols,rows)) + return Polygon(coords) + +def get_tiles_xyz(gdf,zoom_level): + gdf_geo = gdf.__geo_interface__ + features = gdf_geo['features'] + tiles_xyz = burn(features,zoom_level) + return tiles_xyz + +def get_tiles_xyz_fast(gdf,zoom_level): + part =100000 + l = len(gdf) + all_tiles = set() + if(l Date: Wed, 27 Dec 2023 23:30:46 +0200 Subject: [PATCH 11/20] Add files via upload --- data_processing/splitting.py | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/data_processing/splitting.py b/data_processing/splitting.py index 3a441fa..3c4c11f 100644 --- a/data_processing/splitting.py +++ b/data_processing/splitting.py @@ -2,13 +2,39 @@ def split_K_stratified_folds( df, - nfolds, - seed, - id_key, - split_key, - label_keys, + id_key="id", + split_key="class", + label_keys="label", + nfolds=5, + seed=313, verbose=False ): + """ + split a given dataframe into a K startified folds (equal ditribution for classes in each split) + + + @param df: dataframe to split + @param id_key: the id column key in df + @param split_key : the key based for the split + @param label_keys : the label class + @param nfolds : nunber of folds + @param seed : random seed + @param verbose : enable to print the procedure + + + @type df: Dataframe + @type id_key: str + @type split_key : str + @type label_keys : str + @type nfolds : int + @type seed : int + @type verbose : bool + + This function split a dataframe using the StratifiedKFold of sci-kit learn library. + it is used to train the model compare multiple learning techniques. 
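A hedged sketch of calling this helper on a toy dataframe; the column names, fold count, and import path are assumptions, and since the rest of the function body is not shown in this hunk, the returned value is treated as illustrative only.

import pandas as pd
from data_processing.splitting import split_K_stratified_folds   # assumption: repo layout

df = pd.DataFrame({
    "id":    [f"img_{i}" for i in range(100)],
    "class": ["building" if i % 2 == 0 else "road" for i in range(100)],
    "label": [i % 2 for i in range(100)],
})

# each fold should keep roughly the same building/road proportion as the full dataframe
folds = split_K_stratified_folds(df, id_key="id", split_key="class", label_keys="label",
                                  nfolds=5, seed=313, verbose=True)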
+ + Note: stratified K split is not always the optimal split, sometimes choosing the normal ksplit is better (such as with ensambles) + """ X = df.groupby(id_key)[split_key].first().index.values y = df.groupby(id_key)[split_key].first().values skf = StratifiedKFold(n_splits = nfolds, random_state = seed, shuffle=True) From 5d3d19c3f7f98f4189f6cf76322675cf35ca18bd Mon Sep 17 00:00:00 2001 From: MohammadHasanZahweh <108551680+MohammadHasanZahweh@users.noreply.github.com> Date: Wed, 27 Dec 2023 23:32:20 +0200 Subject: [PATCH 12/20] added the lookahead feature todo: fix the "ADAN" lookahead --- optimizers/__init__.py | 6 +++- optimizers/test.py | 68 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 1 deletion(-) create mode 100644 optimizers/test.py diff --git a/optimizers/__init__.py b/optimizers/__init__.py index 974f8cc..063118d 100644 --- a/optimizers/__init__.py +++ b/optimizers/__init__.py @@ -29,7 +29,7 @@ } -def get_optimizer(name:str ,params,*args,**kwargs) -> Optimizer: +def get_optimizer(name:str ,params,lookAhead=False,lookAhead_alpha=0.5, lookAhead_k=6,lookAhead_pullback_momentum="none",*args,**kwargs) -> Optimizer: """ This function returns the optimizer given its name @@ -46,4 +46,8 @@ def get_optimizer(name:str ,params,*args,**kwargs) -> Optimizer: name = name.lower() if name not in optimizer_mapping.keys(): raise ValueError('Optimizer {} not an option'.format(name)) + + if lookAhead: + assert name != 'adan', "lookahead adan is not supported" + return Lookahead(optimizer_mapping[name](params,*args,**kwargs),alpha=lookAhead_alpha, k=lookAhead_k,pullback_momentum=lookAhead_pullback_momentum) return optimizer_mapping[name](params,*args,**kwargs) \ No newline at end of file diff --git a/optimizers/test.py b/optimizers/test.py new file mode 100644 index 0000000..bcc566c --- /dev/null +++ b/optimizers/test.py @@ -0,0 +1,68 @@ +import unittest +import torch +from torch import nn +from . 
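A hedged sketch of the lookAhead flag added to get_optimizer above; 'adan' is the one combination the code rejects. Running from the repository root so that the optimizers package is importable is an assumption.

import torch
from torch import nn
from optimizers import get_optimizer       # assumption: executed from the repository root

model = nn.Linear(4, 1)
# any registered name except 'adan' can be wrapped; extra kwargs go to the inner optimizer
opt = get_optimizer("diffgrad", model.parameters(), lookAhead=True,
                    lookAhead_alpha=0.5, lookAhead_k=6, lr=1e-3)

x, y = torch.randn(16, 4), torch.randn(16, 1)
for _ in range(10):
    opt.zero_grad()
    nn.functional.mse_loss(model(x), y).backward()
    opt.step()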
import get_optimizer,optimizer_mapping +import numpy as np + + +class testOptimizers(unittest.TestCase): + def test_getoptimzer_return(self): + for optimizer in optimizer_mapping: + net=nn.Sequential( + nn.Linear(2,1), + ) + optim=get_optimizer(optimizer,net.parameters(),lr=1e-3) + optim.zero_grad() + optim.step() + self.assertIsInstance(optim,torch.optim.Optimizer,f"{optimizer} did not return an torch.optim.Optimizer") + + def test_getoptimzer_lookAhead(self): + for optimizer in optimizer_mapping: + net=nn.Sequential( + nn.Linear(2,1), + ) + if optimizer =="adan": #TODO fix adan + with self.assertRaises(AssertionError) as context: + optim=get_optimizer(optimizer,net.parameters(),lr=1e-3,lookAhead=True) + else: + optim=get_optimizer(optimizer,net.parameters(),lr=1e-3,lookAhead=True) + optim.zero_grad() + optim.step() + + self.assertIsInstance(optim,torch.optim.Optimizer,f"{optimizer} did not return an torch.optim.Optimizer") + + def test_keys_lower_case(self): + for optim in optimizer_mapping: + self.assertEqual(optim,optim.lower()) + + def test_getoptimzer_functionality(self): + for optimizer in optimizer_mapping: + for __ in range(5): + net=nn.Sequential( + nn.Linear(2,1), + ) + optim=get_optimizer(optimizer,net.parameters(),lr=1e-3) + loss1=[] + for _ in range(500): + optim.zero_grad() + X=torch.randint(0,10,(50,2),dtype=torch.float) + y=X.sum(axis=1) + loss=nn.MSELoss()(y,net(X)) + loss.backward() + loss1.append(loss.item()) + optim.step() + loss2=[] + for _ in range(500): + optim.zero_grad() + X=torch.randint(0,10,(50,2),dtype=torch.float) + y=X.sum(axis=1) + loss=nn.MSELoss()(y,net(X)) + loss.backward() + loss2.append(loss.item()) + optim.step() + self.assertGreaterEqual(np.array(loss1).mean(),np.array(loss2).mean(),f"{optimizer} is not functioning as an optimizer") + + + +if __name__=="__main__": + unittest.main() From 2c9a3e7277190723ffacc3bf3d0cf6949b2389dd Mon Sep 17 00:00:00 2001 From: aghand0ur Date: Thu, 28 Dec 2023 17:23:24 +0200 Subject: [PATCH 13/20] Delete more_transormations.py --- ToBeChecked/more_transormations.py | 86 ------------------------------ 1 file changed, 86 deletions(-) delete mode 100644 ToBeChecked/more_transormations.py diff --git a/ToBeChecked/more_transormations.py b/ToBeChecked/more_transormations.py deleted file mode 100644 index c9da895..0000000 --- a/ToBeChecked/more_transormations.py +++ /dev/null @@ -1,86 +0,0 @@ -from shapely.geometry import Polygon,MultiPolygon -import types -​ -def pix_to_utm(crd,A,B,C,D,E,F): - #convert a single pixel's (x,y) coordinates into the georeferenced coordinates - #A,B,C,D,E,F are parameters that can be read from the TFW file ... 
- # associated with the TIFF File that The Tile was taken from - x,y=crd - return x*A+y*B+C,x*D+y*E+F -​ -def utm_to_pix(crd,A,B,C,D,E,F): - #convert a single pixel's georeferenced coords (lon,lat) coordinates into the pixel's coords - x1,y1 = crd - return (E * x1 + B * y1 + B * F - E * C)/(A * E - D * B),(-D * x1 + A * y1 + D * C - A * F )/(A *E - D * B) -​ -def tf(func,poly,A,B,C,D,E,F): - #apply a transformation to a list of coordinates - #transformation options ( pix_to_utm & utm_to_pix) - k=[] - for tup in poly: - k.append(func(tup,A,B,C,D,E,F)) - return k -​ -def tf_polygon(func,poly,A,B,C,D,E,F): - #apply transformation to shapely.geometry polygon - k = tf(func,list(poly.exterior.coords),A,B,C,D,E,F) - return Polygon(k) -​ -def tf_multipoly(func,mpoly,A,B,C,D,E,F): - #apply transformation to shapely.geometry multi-polygon - k=[] - for poly in list(mpoly): - k.append(tf_polygon(func,poly,A,B,C,D,E,F)) - return MultiPolygon(k) -​ -def _tf(func,coords,A,B,C,D,E,F): - try: - typ = coords.geom_type - except AttributeError: - if(type(coords) is list): - typ = 'List' - else: - typ='ínvalid' - except: - typ='ínvalid' - if(typ == 'ínvalid'): - raise AttributeError('INVALID TYPE FOR COORDS') - elif(typ == 'MultiPolygon'): - return tf_multipoly(func,coords,A,B,C,D,E,F) - elif(typ == 'Polygon'): - return tf_polygon(func,coords,A,B,C,D,E,F) - else: - return tf(func,coords,A,B,C,D,E,F) - -def parse_tfw(path): - #given the path of the .TFW File, this function returs A,B,C,D,E,F params ... - #needed for the conversion between pixel and georeferenced coordinates - with open(path,'r') as lol: - a=lol.read() - a=a.split('\n') - permutation = [0,2,4,1,3,5] - #return [float(a[i].strip(' ')) for i in permutation] - #or - return list(map(lambda i : float(a[i].strip(' ')),permutation)) -​ -def tf_upper(polys,x,y): - #given a list of polygons - #and a list of x & y offsets from top left corner - #this function adds the offsets to the polygon's pixel coords - ipolys=[] - for i,poly in enumerate(polys): - k=[] - for tup in poly: - k.append((x[i]+tup[0],y[i]+tup[1])) - ipolys.append(k) - npolys =[Polygon(poly) for poly in ipolys] - return npolys - -def tf_utm(polys,func,A,B,C,D,E,F): - #transform all the polys in a list from pixel to georeferenced coordinates - #the input polygon should have pixel coordinates w.r.t top-left corner - #( i.e : the offsets should be added using previous function (tf_upper)) - ipolys=[] - for poly in polys: - ipolys.append(_tf(func,poly,A,B,C,D,E,F)) - return ipolys \ No newline at end of file From 4cf89b8d04a06cac46aaeb7a25dd5df7db6b899c Mon Sep 17 00:00:00 2001 From: aghand0ur Date: Thu, 28 Dec 2023 17:26:36 +0200 Subject: [PATCH 14/20] renaming --- ToBeChecked/{utils.py => SolarPotentialutils.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename ToBeChecked/{utils.py => SolarPotentialutils.py} (100%) diff --git a/ToBeChecked/utils.py b/ToBeChecked/SolarPotentialutils.py similarity index 100% rename from ToBeChecked/utils.py rename to ToBeChecked/SolarPotentialutils.py From 859941883bcc655dc3d3b1c9872f009a605cef82 Mon Sep 17 00:00:00 2001 From: MhmdDimassi <42188224+MhmdDimassi@users.noreply.github.com> Date: Mon, 1 Jan 2024 15:44:48 +0200 Subject: [PATCH 15/20] Revert "Add visualization file" --- visualization/visualize.py | 54 -------------------------------------- 1 file changed, 54 deletions(-) delete mode 100644 visualization/visualize.py diff --git a/visualization/visualize.py b/visualization/visualize.py deleted file mode 100644 index 3df1c68..0000000 --- 
a/visualization/visualize.py +++ /dev/null @@ -1,54 +0,0 @@ -class Visualization(): - def mask2rgb(mask,max_value=1.0): - shape = mask.shape - if len(shape) == 2: - mask = mask[:,:,np.newaxis] - h,w,c = mask.shape - if c == 3: - return mask - if c == 4: - return mask[:,:,:3] - - if c > 4: - raise ValueError - - padded = np.zeros((h,w,3),dtype=mask.dtype) - padded[:,:,:c] = mask - padded = (padded * max_value).astype(np.uint8) - - return padded - - - def make_rgb_mask(mask,color=(255,0,0)): - h,w = mask.shape[:2] - rgb = np.zeros((h,w,3),dtype=np.uint8) - rgb[mask == 1.0,:] = color - return rgb - - def overlay_rgb_mask(img,mask,sel,alpha): - - sel = sel == 1.0 - img[sel,:] = img[sel,:] * (1.0 - alpha) + mask[sel,:] * alpha - return img - - def overlay_instances_mask(img,instances,cmap,alpha=0.9): - h,w = img.shape[:2] - overlay = np.zeros((h,w,3),dtype=np.float32) - - _max = instances.max() - _cmax = cmap.shape[0] - - - if _max == 0: - return img - elif _max > _cmax: - indexes = [(i % _cmax) for i in range(_max)] - else: - indexes = random.sample(range(0,_cmax),_max) - - for i,idx in enumerate(indexes): - overlay[instances == i+1,:] = cmap[idx,:] - - overlay = (overlay * 255.0).astype(np.uint8) - viz = overlay_rgb_mask(img,overlay,instances>0,alpha=alpha) - return viz From 0452386bf3d58e7a1cd8995702fed9374bd1494f Mon Sep 17 00:00:00 2001 From: MhmdDimassi <42188224+MhmdDimassi@users.noreply.github.com> Date: Mon, 1 Jan 2024 16:23:31 +0200 Subject: [PATCH 16/20] Update test.py --- optimizers/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimizers/test.py b/optimizers/test.py index bcc566c..4c3392e 100644 --- a/optimizers/test.py +++ b/optimizers/test.py @@ -5,7 +5,7 @@ import numpy as np -class testOptimizers(unittest.TestCase): +class TestOptimizers(unittest.TestCase): def test_getoptimzer_return(self): for optimizer in optimizer_mapping: net=nn.Sequential( From 89099483dd3344ba6f8a6d1cb6cc0a8c07b1970c Mon Sep 17 00:00:00 2001 From: MohammadHasanZahweh <108551680+MohammadHasanZahweh@users.noreply.github.com> Date: Mon, 1 Jan 2024 16:23:46 +0200 Subject: [PATCH 17/20] Delete optimizers/test_optimers.py --- optimizers/test_optimers.py | 52 ------------------------------------- 1 file changed, 52 deletions(-) delete mode 100644 optimizers/test_optimers.py diff --git a/optimizers/test_optimers.py b/optimizers/test_optimers.py deleted file mode 100644 index 5f6c783..0000000 --- a/optimizers/test_optimers.py +++ /dev/null @@ -1,52 +0,0 @@ -import unittest -import torch -from torch import nn -from . 
import get_optimizer,optimizer_mapping -import numpy as np - - -class testOptimizers(unittest.TestCase): - def test_getoptimzer_return(self): - for optimizer in optimizer_mapping: - net=nn.Sequential( - nn.Linear(2,1), - ) - optim=get_optimizer(optimizer,net.parameters(),lr=1e-3) - optim.zero_grad() - optim.step() - self.assertIsInstance(optim,torch.optim.Optimizer,f"{optimizer} did not return an optimizer") - - def test_keys_lower_case(self): - for optim in optimizer_mapping: - self.assertEqual(optim,optim.lower()) - - def test_getoptimzer_functionality(self): - for optimizer in optimizer_mapping: - for __ in range(5): - net=nn.Sequential( - nn.Linear(2,1), - ) - optim=get_optimizer(optimizer,net.parameters(),lr=1e-3) - loss1=[] - for _ in range(500): - optim.zero_grad() - X=torch.randint(0,10,(50,2),dtype=torch.float) - y=X.sum(axis=1) - loss=nn.MSELoss()(y,net(X)) - loss.backward() - loss1.append(loss.item()) - optim.step() - loss2=[] - for _ in range(500): - optim.zero_grad() - X=torch.randint(0,10,(50,2),dtype=torch.float) - y=X.sum(axis=1) - loss=nn.MSELoss()(y,net(X)) - loss.backward() - loss2.append(loss.item()) - optim.step() - self.assertGreaterEqual(np.array(loss1).mean(),np.array(loss2).mean(),f"{optimizer} is not functioning as an optimizer") - -if __name__=="__main__": - unittest.main() - From 973873bb8ed29f9abc745d4bb2000ce0556abcbc Mon Sep 17 00:00:00 2001 From: aghand0ur Date: Mon, 1 Jan 2024 16:25:56 +0200 Subject: [PATCH 18/20] renaming tests by Zahweh --- optimizers/{test.py => test_optimizer.py} | 136 +++++++++++----------- schedulers/{test.py => test_scheduler.py} | 34 +++--- 2 files changed, 85 insertions(+), 85 deletions(-) rename optimizers/{test.py => test_optimizer.py} (97%) rename schedulers/{test.py => test_scheduler.py} (96%) diff --git a/optimizers/test.py b/optimizers/test_optimizer.py similarity index 97% rename from optimizers/test.py rename to optimizers/test_optimizer.py index 4c3392e..8dbe710 100644 --- a/optimizers/test.py +++ b/optimizers/test_optimizer.py @@ -1,68 +1,68 @@ -import unittest -import torch -from torch import nn -from . 
import get_optimizer,optimizer_mapping -import numpy as np - - -class TestOptimizers(unittest.TestCase): - def test_getoptimzer_return(self): - for optimizer in optimizer_mapping: - net=nn.Sequential( - nn.Linear(2,1), - ) - optim=get_optimizer(optimizer,net.parameters(),lr=1e-3) - optim.zero_grad() - optim.step() - self.assertIsInstance(optim,torch.optim.Optimizer,f"{optimizer} did not return an torch.optim.Optimizer") - - def test_getoptimzer_lookAhead(self): - for optimizer in optimizer_mapping: - net=nn.Sequential( - nn.Linear(2,1), - ) - if optimizer =="adan": #TODO fix adan - with self.assertRaises(AssertionError) as context: - optim=get_optimizer(optimizer,net.parameters(),lr=1e-3,lookAhead=True) - else: - optim=get_optimizer(optimizer,net.parameters(),lr=1e-3,lookAhead=True) - optim.zero_grad() - optim.step() - - self.assertIsInstance(optim,torch.optim.Optimizer,f"{optimizer} did not return an torch.optim.Optimizer") - - def test_keys_lower_case(self): - for optim in optimizer_mapping: - self.assertEqual(optim,optim.lower()) - - def test_getoptimzer_functionality(self): - for optimizer in optimizer_mapping: - for __ in range(5): - net=nn.Sequential( - nn.Linear(2,1), - ) - optim=get_optimizer(optimizer,net.parameters(),lr=1e-3) - loss1=[] - for _ in range(500): - optim.zero_grad() - X=torch.randint(0,10,(50,2),dtype=torch.float) - y=X.sum(axis=1) - loss=nn.MSELoss()(y,net(X)) - loss.backward() - loss1.append(loss.item()) - optim.step() - loss2=[] - for _ in range(500): - optim.zero_grad() - X=torch.randint(0,10,(50,2),dtype=torch.float) - y=X.sum(axis=1) - loss=nn.MSELoss()(y,net(X)) - loss.backward() - loss2.append(loss.item()) - optim.step() - self.assertGreaterEqual(np.array(loss1).mean(),np.array(loss2).mean(),f"{optimizer} is not functioning as an optimizer") - - - -if __name__=="__main__": - unittest.main() +import unittest +import torch +from torch import nn +from . 
import get_optimizer,optimizer_mapping +import numpy as np + + +class TestOptimizers(unittest.TestCase): + def test_getoptimzer_return(self): + for optimizer in optimizer_mapping: + net=nn.Sequential( + nn.Linear(2,1), + ) + optim=get_optimizer(optimizer,net.parameters(),lr=1e-3) + optim.zero_grad() + optim.step() + self.assertIsInstance(optim,torch.optim.Optimizer,f"{optimizer} did not return an torch.optim.Optimizer") + + def test_getoptimzer_lookAhead(self): + for optimizer in optimizer_mapping: + net=nn.Sequential( + nn.Linear(2,1), + ) + if optimizer =="adan": #TODO fix adan + with self.assertRaises(AssertionError) as context: + optim=get_optimizer(optimizer,net.parameters(),lr=1e-3,lookAhead=True) + else: + optim=get_optimizer(optimizer,net.parameters(),lr=1e-3,lookAhead=True) + optim.zero_grad() + optim.step() + + self.assertIsInstance(optim,torch.optim.Optimizer,f"{optimizer} did not return an torch.optim.Optimizer") + + def test_keys_lower_case(self): + for optim in optimizer_mapping: + self.assertEqual(optim,optim.lower()) + + def test_getoptimzer_functionality(self): + for optimizer in optimizer_mapping: + for __ in range(5): + net=nn.Sequential( + nn.Linear(2,1), + ) + optim=get_optimizer(optimizer,net.parameters(),lr=1e-3) + loss1=[] + for _ in range(500): + optim.zero_grad() + X=torch.randint(0,10,(50,2),dtype=torch.float) + y=X.sum(axis=1) + loss=nn.MSELoss()(y,net(X)) + loss.backward() + loss1.append(loss.item()) + optim.step() + loss2=[] + for _ in range(500): + optim.zero_grad() + X=torch.randint(0,10,(50,2),dtype=torch.float) + y=X.sum(axis=1) + loss=nn.MSELoss()(y,net(X)) + loss.backward() + loss2.append(loss.item()) + optim.step() + self.assertGreaterEqual(np.array(loss1).mean(),np.array(loss2).mean(),f"{optimizer} is not functioning as an optimizer") + + + +if __name__=="__main__": + unittest.main() diff --git a/schedulers/test.py b/schedulers/test_scheduler.py similarity index 96% rename from schedulers/test.py rename to schedulers/test_scheduler.py index 03360ad..ebbf0d8 100644 --- a/schedulers/test.py +++ b/schedulers/test_scheduler.py @@ -1,17 +1,17 @@ -import unittest -import torch -from torch import nn -from optimizers import get_optimizer -from schedulers import AutoScheduler,scheduler_mapping,get_scheduler -import numpy as np - - -class testScheduler(unittest.TestCase): - def test_keys_lower_case(self): - for schechduler in scheduler_mapping: - self.assertEqual(schechduler,schechduler.lower()) - - - -if __name__=="__main__": - unittest.main() +import unittest +import torch +from torch import nn +from optimizers import get_optimizer +from schedulers import AutoScheduler,scheduler_mapping,get_scheduler +import numpy as np + + +class testScheduler(unittest.TestCase): + def test_keys_lower_case(self): + for schechduler in scheduler_mapping: + self.assertEqual(schechduler,schechduler.lower()) + + + +if __name__=="__main__": + unittest.main() From 7873f9a1aea2ab2003a1cbfa12d444b0f94f5f57 Mon Sep 17 00:00:00 2001 From: aghand0ur Date: Mon, 1 Jan 2024 16:57:53 +0200 Subject: [PATCH 19/20] re-moving utility to inside TobeChecked --- {utility => ToBeChecked/utility}/__init__.py | 0 {utility => ToBeChecked/utility}/modify_contact_spacing.py | 0 {utility => ToBeChecked/utility}/utils.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename {utility => ToBeChecked/utility}/__init__.py (100%) rename {utility => ToBeChecked/utility}/modify_contact_spacing.py (100%) rename {utility => ToBeChecked/utility}/utils.py (100%) diff --git a/utility/__init__.py 
b/ToBeChecked/utility/__init__.py similarity index 100% rename from utility/__init__.py rename to ToBeChecked/utility/__init__.py diff --git a/utility/modify_contact_spacing.py b/ToBeChecked/utility/modify_contact_spacing.py similarity index 100% rename from utility/modify_contact_spacing.py rename to ToBeChecked/utility/modify_contact_spacing.py diff --git a/utility/utils.py b/ToBeChecked/utility/utils.py similarity index 100% rename from utility/utils.py rename to ToBeChecked/utility/utils.py From dbaa47378a72b35288fdfae5ae8be655400a20d8 Mon Sep 17 00:00:00 2001 From: aghand0ur Date: Mon, 1 Jan 2024 17:09:38 +0200 Subject: [PATCH 20/20] Update metrics.py --- ToBeChecked/metrics/metrics.py | 51 ---------------------------------- 1 file changed, 51 deletions(-) diff --git a/ToBeChecked/metrics/metrics.py b/ToBeChecked/metrics/metrics.py index 844eddb..b37d2b0 100644 --- a/ToBeChecked/metrics/metrics.py +++ b/ToBeChecked/metrics/metrics.py @@ -199,57 +199,6 @@ def __call__(self,prediction,target): -def hard_dice_coef_mask(y_true, y_pred, smooth=1e-3): - y_true_f = K.flatten(K.round(y_true[..., 0])) - y_pred_f = K.flatten(K.round(y_pred[..., 0])) - intersection = K.sum(y_true_f * y_pred_f) - return 100. * (2. * intersection + smooth) / (K.sum(y_true_f) + K.sum(y_pred_f) + smooth) - - -def hard_jacard_coef_mask(y_true, y_pred, smooth=1e-3): - # K.flatten(K.round(y_true[..., 0])) - y_true_f = K.flatten(K.round(y_true[..., 0])) - y_pred_f =K.flatten(K.round(y_pred[..., 0])) - intersection = K.sum(y_true_f * y_pred_f) - return (intersection + smooth) / (K.sum(y_true_f) + K.sum(y_pred_f) - intersection + smooth) - -def Jaccard_micro_building(y_true, y_pred): - y_true_f = K.flatten(K.round(y_true[..., 0])) - y_pred_f =K.flatten(K.round(y_pred[..., 0])) - tp=K.sum(y_true_f*y_pred_f) - fn=K.sum(y_true_f*(1.- y_pred_f)) - fp=K.sum((1. - y_true_f)*y_pred_f) - return (tp+1e-3)/(tp+fn+fp+1e-3) - - -def hard_dice_coef_border(y_true, y_pred, smooth=1e-3): - y_true_f = K.flatten(K.round(y_true[..., 1])) - y_pred_f = K.flatten(K.round(y_pred[..., 1])) - intersection = K.sum(y_true_f * y_pred_f) - return 100. * (2. * intersection + smooth) / (K.sum(y_true_f) + K.sum(y_pred_f) + smooth) - - -def hard_jacard_coef_border(y_true, y_pred, smooth=1e-3): - # K.flatten(K.round(y_true[..., 0])) - y_true_f = K.flatten(K.round(y_true[..., 1])) - y_pred_f =K.flatten(K.round(y_pred[..., 1])) - intersection = K.sum(y_true_f * y_pred_f) - return 100.0 * (intersection + smooth) / (K.sum(y_true_f) + K.sum(y_pred_f) - intersection + smooth) - -def hard_dice_coef_spacing(y_true, y_pred, smooth=1e-3): - y_true_f = K.flatten(K.round(y_true[..., 2])) - y_pred_f = K.flatten(K.round(y_pred[..., 2])) - intersection = K.sum(y_true_f * y_pred_f) - return 100. * (2. * intersection + smooth) / (K.sum(y_true_f) + K.sum(y_pred_f) + smooth) - - -def hard_jacard_coef_spacing(y_true, y_pred, smooth=1e-3): - # K.flatten(K.round(y_true[..., 0])) - y_true_f = K.flatten(K.round(y_true[..., 2])) - y_pred_f =K.flatten(K.round(y_pred[..., 2])) - intersection = K.sum(y_true_f * y_pred_f) - return 100.0 * (intersection + smooth) / (K.sum(y_true_f) + K.sum(y_pred_f) - intersection + smooth) - # def calc_iou(gt_masks, predicted_masks, height=768, width=768): # true_objects = gt_masks.shape[2]
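The hunk above strips the Keras-backend helpers (hard_dice_coef_mask, hard_jacard_coef_mask, Jaccard_micro_building and their border/spacing variants) out of ToBeChecked/metrics/metrics.py. For reference, a minimal PyTorch sketch of the same hard metrics is given below; it assumes channel-last binary masks (as the deleted Keras code did), keeps the 1e-3 smoothing, and the function names are illustrative only, not part of the repository API.

import torch

def hard_dice(y_true: torch.Tensor, y_pred: torch.Tensor, channel: int = 0, smooth: float = 1e-3) -> torch.Tensor:
    # Round to {0, 1}, flatten the selected channel, then apply the usual Dice formula (scaled by 100).
    t = torch.round(y_true[..., channel]).flatten()
    p = torch.round(y_pred[..., channel]).flatten()
    inter = (t * p).sum()
    return 100.0 * (2.0 * inter + smooth) / (t.sum() + p.sum() + smooth)

def hard_jaccard(y_true: torch.Tensor, y_pred: torch.Tensor, channel: int = 0, smooth: float = 1e-3) -> torch.Tensor:
    # Intersection-over-union on the rounded masks; the deleted mask variant did not scale by 100.
    t = torch.round(y_true[..., channel]).flatten()
    p = torch.round(y_pred[..., channel]).flatten()
    inter = (t * p).sum()
    return (inter + smooth) / (t.sum() + p.sum() - inter + smooth)

def jaccard_micro_building(y_true: torch.Tensor, y_pred: torch.Tensor, smooth: float = 1e-3) -> torch.Tensor:
    # Micro IoU on channel 0 written through TP/FN/FP, mirroring the removed Jaccard_micro_building.
    t = torch.round(y_true[..., 0]).flatten()
    p = torch.round(y_pred[..., 0]).flatten()
    tp = (t * p).sum()
    fn = (t * (1.0 - p)).sum()
    fp = ((1.0 - t) * p).sum()
    return (tp + smooth) / (tp + fn + fp + smooth)

The removed border and spacing variants repeat the same computations on channels 1 and 2 (with those Jaccard variants additionally scaled by 100).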
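The test files added and renamed above (optimizers/test_optimizer.py) exercise the get_optimizer factory, including a lookAhead keyword that is expected to work for every mapped optimizer except adan (which the tests mark as a TODO). A hedged usage sketch along the lines of those tests is shown below; it assumes the get_optimizer(name, params, *args, **kwargs) signature and the lookAhead flag exactly as the tests call them, and 'diffgrad' stands in for any key of optimizer_mapping.

import torch
from torch import nn
from optimizers import get_optimizer

# Tiny regression problem, mirroring the shape of the unit tests above.
net = nn.Sequential(nn.Linear(2, 1))
optim = get_optimizer("diffgrad", net.parameters(), lr=1e-3, lookAhead=True)

loss_fn = nn.MSELoss()
for _ in range(100):
    optim.zero_grad()
    X = torch.randint(0, 10, (50, 2), dtype=torch.float)
    y = X.sum(dim=1, keepdim=True)  # keep the (50, 1) shape so the target matches net(X)
    loss = loss_fn(net(X), y)
    loss.backward()
    optim.step()

Keeping the target as (N, 1) avoids the (N,) versus (N, 1) broadcast that the current tests rely on and that newer PyTorch versions warn about.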
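Earlier in this series, PATCH 13 deletes ToBeChecked/more_transormations.py, whose pix_to_utm/utm_to_pix pair implemented the world-file (TFW) affine mapping x_geo = A*x + B*y + C, y_geo = D*x + E*y + F between pixel and georeferenced coordinates. A small self-contained sketch of that mapping follows; the inverse is re-derived here from the forward formula (for the common north-up case B = D = 0 it reduces to plain scaling plus offsets), and all names and sample parameters are illustrative.

def pixel_to_geo(x, y, A, B, C, D, E, F):
    # Forward world-file transform: pixel (x, y) -> georeferenced (x_geo, y_geo).
    return A * x + B * y + C, D * x + E * y + F

def geo_to_pixel(x_geo, y_geo, A, B, C, D, E, F):
    # Closed-form inverse of the affine part; requires det = A*E - D*B != 0.
    det = A * E - D * B
    x = (E * (x_geo - C) - B * (y_geo - F)) / det
    y = (-D * (x_geo - C) + A * (y_geo - F)) / det
    return x, y

# Round-trip check with illustrative TFW parameters (0.5 m pixels, no rotation).
A, B, C, D, E, F = 0.5, 0.0, 500000.0, 0.0, -0.5, 4000000.0
gx, gy = pixel_to_geo(10, 20, A, B, C, D, E, F)
px, py = geo_to_pixel(gx, gy, A, B, C, D, E, F)
assert abs(px - 10) < 1e-9 and abs(py - 20) < 1e-9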
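PATCH 15 above reverts visualization/visualize.py, whose overlay_rgb_mask helper alpha-blended a colored mask into an image wherever a selection mask is set (img[sel] = img[sel] * (1 - alpha) + mask[sel] * alpha). A minimal NumPy sketch of that blend, assuming uint8 RGB inputs and an illustrative function name:

import numpy as np

def overlay_rgb_mask(img: np.ndarray, rgb_mask: np.ndarray, sel: np.ndarray, alpha: float = 0.9) -> np.ndarray:
    # Blend only where sel is True; work in float to avoid uint8 truncation, then convert back.
    out = img.astype(np.float32)
    sel = sel.astype(bool)
    out[sel, :] = out[sel, :] * (1.0 - alpha) + rgb_mask[sel, :].astype(np.float32) * alpha
    return out.clip(0, 255).astype(np.uint8)

# Example: paint a red overlay over the top-left quadrant of a gray image.
img = np.full((64, 64, 3), 128, dtype=np.uint8)
mask = np.zeros((64, 64, 3), dtype=np.uint8)
mask[:32, :32] = (255, 0, 0)
sel = np.zeros((64, 64), dtype=bool)
sel[:32, :32] = True
blended = overlay_rgb_mask(img, mask, sel, alpha=0.5)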