feat(optim): Adadelta RAdam Adamax optimizer support #171

Merged: 39 commits, Jul 22, 2023
Changes from all commits

Commits (39)
2bbe86e
feat(optim): support adadelta
JieRen98 Jun 18, 2023
d7d8e9c
feat(optim): support adadelta
JieRen98 Jun 19, 2023
6e48ede
fix: [pre-commit.ci] auto fixes [...]
pre-commit-ci[bot] Jun 19, 2023
1af1159
feat(optim): support adadelta
JieRen98 Jun 19, 2023
bc2de8f
feat(optim): support adadelta
JieRen98 Jun 19, 2023
5094ff2
fix: [pre-commit.ci] auto fixes [...]
pre-commit-ci[bot] Jun 19, 2023
ecc0019
feat(optim): support adadelta
JieRen98 Jun 24, 2023
f427051
feat(optim): support RAdam
JieRen98 Jun 29, 2023
b189e19
fix: [pre-commit.ci] auto fixes [...]
pre-commit-ci[bot] Jun 29, 2023
4630e17
Merge branch 'main' into pr/JieRen98/171
XuehaiPan Jul 1, 2023
19ea7e4
fix(optim): fix RAdam
JieRen98 Jul 2, 2023
61479db
fix(optim): fix RAdam
JieRen98 Jul 2, 2023
1ad5985
fix(optim): fix RAdam
JieRen98 Jul 2, 2023
d3c7fda
fix: [pre-commit.ci] auto fixes [...]
pre-commit-ci[bot] Jul 2, 2023
93b4a47
feat(optim): support adamax
JieRen98 Jul 2, 2023
1a3f651
fix: [pre-commit.ci] auto fixes [...]
pre-commit-ci[bot] Jul 2, 2023
30fec6a
fix(optim): fix lint
JieRen98 Jul 3, 2023
eb66f8f
fix(optim): add step counter
JieRen98 Jul 3, 2023
77ab347
fix(optim): fix return type
JieRen98 Jul 3, 2023
545cfe5
fix(optim): fix for None case
JieRen98 Jul 5, 2023
ea449dc
Merge remote-tracking branch 'upstream/main' into main
Benjamin-eecs Jul 22, 2023
67326f3
chore: update CHANGELOG
Benjamin-eecs Jul 22, 2023
8bd00ee
docs: update optim.rst
Benjamin-eecs Jul 22, 2023
537fe63
fix: pass lint
Benjamin-eecs Jul 22, 2023
978f640
docs: update api.rst
Benjamin-eecs Jul 22, 2023
921b76e
docs: update explicit_diff.rst
Benjamin-eecs Jul 22, 2023
47c9ba3
chore: update CHANGELOG
Benjamin-eecs Jul 22, 2023
a31e7da
fix: pass lint
Benjamin-eecs Jul 22, 2023
ce9fbb1
chore: update CHANGELOG
Benjamin-eecs Jul 22, 2023
90b149a
test: update import
Benjamin-eecs Jul 22, 2023
db65c57
fix: update naming
Benjamin-eecs Jul 22, 2023
fcd408a
test: update import
Benjamin-eecs Jul 22, 2023
76d91c9
fix: update docstring
Benjamin-eecs Jul 22, 2023
b5e5b9c
docs: fix naming
Benjamin-eecs Jul 22, 2023
d407d2c
fix: update docstring
Benjamin-eecs Jul 22, 2023
f157e2b
fix: pass lint
Benjamin-eecs Jul 22, 2023
9b3b8c2
fix: pass lint
Benjamin-eecs Jul 22, 2023
e222f6d
fix: pass lint
Benjamin-eecs Jul 22, 2023
3e90e72
chore: remove clang-format in pre-commit
Benjamin-eecs Jul 22, 2023
4 changes: 0 additions & 4 deletions .pre-commit-config.yaml
@@ -24,10 +24,6 @@ repos:
- id: detect-private-key
- id: debug-statements
- id: double-quote-string-fixer
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v16.0.6
hooks:
- id: clang-format
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.0.278
hooks:
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -13,7 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

-
- Implement `Adadelta`, `RAdam`, `Adamax` optimizer by [@JieRen98](https://github.com/JieRen98) and [@Benjamin-eecs](https://github.com/Benjamin-eecs) in [#171](https://github.com/metaopt/torchopt/pull/171).

### Changed

60 changes: 60 additions & 0 deletions docs/source/api/api.rst
@@ -30,9 +30,12 @@ Functional Optimizers
.. autosummary::

FuncOptimizer
adadelta
adagrad
adam
adamw
adamax
radam
rmsprop
sgd

@@ -42,6 +45,11 @@ Wrapper for Function Optimizer
.. autoclass:: FuncOptimizer
:members:

Functional AdaDelta Optimizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: adadelta

Functional AdaGrad Optimizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -57,6 +65,16 @@ Functional AdamW Optimizer

.. autofunction:: adamw

Functional AdaMax Optimizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: adamax

Functional RAdam Optimizer
~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: radam

Functional RMSProp Optimizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -76,12 +94,23 @@ Classic Optimizers

.. autosummary::

AdaDelta
Adadelta
AdaGrad
Adagrad
Adam
AdamW
AdaMax
Adamax
RAdam
RMSProp
SGD

Classic AdaDelta Optimizer
~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: AdaDelta

Classic AdaGrad Optimizer
~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -97,6 +126,16 @@ Classic AdamW Optimizer

.. autoclass:: AdamW

Classic AdaMax Optimizer
~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: AdaMax

Classic RAdam Optimizer
~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: RAdam

Classic RMSProp Optimizer
~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -116,12 +155,23 @@ Differentiable Meta-Optimizers

.. autosummary::

MetaAdaDelta
MetaAdadelta
MetaAdaGrad
MetaAdagrad
MetaAdam
MetaAdamW
MetaAdaMax
MetaAdamax
MetaRAdam
MetaRMSProp
MetaSGD

Differentiable Meta-AdaDelta Optimizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: MetaAdaDelta

Differentiable Meta-AdaGrad Optimizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -137,6 +187,16 @@ Differentiable Meta-AdamW Optimizer

.. autoclass:: MetaAdamW

Differentiable Meta-AdaMax Optimizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: MetaAdaMax

Differentiable Meta-RAdam Optimizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: MetaRAdam

Differentiable Meta-RMSProp Optimizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

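The API reference above now lists the classic-style wrappers AdaDelta, AdaMax, and RAdam alongside the existing optimizers. Below is a minimal usage sketch for one of them; the constructor arguments (lr, rho, eps, weight_decay) are assumed to mirror torch.optim.Adadelta and the functional torchopt.adadelta call exercised in the tests later in this diff, and are not taken verbatim from this PR.

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchopt

# Assumed signature: parameters first, then Adadelta hyperparameters,
# mirroring torch.optim.Adadelta and the functional torchopt.adadelta.
model = nn.Linear(4, 2)
optimizer = torchopt.AdaDelta(model.parameters(), lr=1e-2, rho=0.9, eps=1e-8, weight_decay=0.0)

xs, ys = torch.randn(8, 4), torch.randint(0, 2, (8,))
loss = F.cross_entropy(model(xs), ys)

optimizer.zero_grad()  # classic PyTorch-like API: zero_grad() / backward() / step()
loss.backward()
optimizer.step()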
6 changes: 6 additions & 0 deletions docs/source/explicit_diff/explicit_diff.rst
@@ -53,9 +53,15 @@ For PyTorch-like API (e.g., ``step()``), we designed a base class :class:`torcho
.. autosummary::

torchopt.MetaOptimizer
torchopt.MetaAdaDelta
torchopt.MetaAdadelta
torchopt.MetaAdaGrad
torchopt.MetaAdagrad
torchopt.MetaAdam
torchopt.MetaAdamW
torchopt.MetaAdaMax
torchopt.MetaAdamax
torchopt.MetaRAdam
torchopt.MetaRMSProp
torchopt.MetaSGD

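With MetaAdaDelta and the other new Meta* classes added to the explicit-differentiation docs, a hedged sketch of the differentiable update pattern follows. The MetaOptimizer.step(loss) interface comes from the surrounding documentation; the exact MetaAdaDelta constructor arguments are an assumption.

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchopt

net = nn.Linear(4, 2)
# Assumed constructor: module first, then hyperparameters, like the other Meta* optimizers.
meta_optim = torchopt.MetaAdaDelta(net, lr=1e-2)

xs, ys = torch.randn(8, 4), torch.randint(0, 2, (8,))

inner_loss = F.cross_entropy(net(xs), ys)
meta_optim.step(inner_loss)  # differentiable in-place update of net's parameters

# The updated parameters remain on the autograd graph, so an outer (meta)
# loss can backpropagate through the AdaDelta update itself.
outer_loss = F.cross_entropy(net(xs), ys)
outer_loss.backward()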
9 changes: 9 additions & 0 deletions docs/source/optimizer/optim.rst
@@ -18,9 +18,12 @@ Currently, TorchOpt supports 4 functional optimizers: :func:`sgd`, :func:`adam`,
.. autosummary::

torchopt.FuncOptimizer
torchopt.adadelta
torchopt.adagrad
torchopt.adam
torchopt.adamw
torchopt.adamax
torchopt.radam
torchopt.rmsprop
torchopt.sgd

@@ -85,9 +88,15 @@ We offer original PyTorch APIs (e.g., ``zero_grad()`` or ``step()``) for traditi
.. autosummary::

torchopt.Optimizer
torchopt.AdaDelta
torchopt.Adadelta
torchopt.AdaGrad
torchopt.Adagrad
torchopt.Adam
torchopt.AdamW
torchopt.AdaMax
torchopt.Adamax
torchopt.RAdam
torchopt.RMSProp
torchopt.SGD

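The functional optimizers table above now covers adadelta, adamax, and radam as well. The sketch below mirrors the init/update/apply_updates loop exercised in tests/test_alias.py later in this diff, shown here for torchopt.adamax; the keyword names are carried over from those tests and should be treated as assumptions.

import functorch
import torch
import torch.nn.functional as F
import torchopt

model = torch.nn.Linear(4, 2)
fmodel, params = functorch.make_functional(model)

# Same hyperparameters as the adamax test case below.
optim = torchopt.adamax(lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.0)
optim_state = optim.init(params)

xs, ys = torch.randn(8, 4), torch.randint(0, 2, (8,))
loss = F.cross_entropy(fmodel(params, xs), ys)

grads = torch.autograd.grad(loss, params)
updates, optim_state = optim.update(grads, optim_state, params=params)
params = torchopt.apply_updates(params, updates)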
7 changes: 7 additions & 0 deletions docs/source/spelling_wordlist.txt
@@ -175,3 +175,10 @@ ctx
Duchi
invertible
AdaGrad
Adadelta
Zeiler
radam
adamax
RAdam
AdaDelta
AdaMax
171 changes: 171 additions & 0 deletions tests/test_alias.py
@@ -144,6 +144,63 @@ def test_sgd(
_set_use_chain_flat(True)


@helpers.parametrize(
dtype=[torch.float64],
lr=[1e-2, 1e-3, 1e-4],
rho=[0.9, 0.95],
eps=[1e-8],
inplace=[True, False],
weight_decay=[0.0, 1e-2],
use_chain_flat=[True, False],
)
def test_adadelta(
dtype: torch.dtype,
lr: float,
rho: float,
eps: float,
inplace: bool,
weight_decay: float,
use_chain_flat: bool,
) -> None:
_set_use_chain_flat(use_chain_flat)

model, model_ref, model_base, loader = helpers.get_models(device='cpu', dtype=dtype)

fmodel, params, buffers = functorch.make_functional_with_buffers(model)
optim = torchopt.adadelta(
lr,
rho=rho,
eps=eps,
weight_decay=weight_decay,
)
optim_state = optim.init(params)
optim_ref = torch.optim.Adadelta(
model_ref.parameters(),
lr,
rho=rho,
eps=eps,
weight_decay=weight_decay,
)

for xs, ys in loader:
xs = xs.to(dtype=dtype)
pred = fmodel(params, buffers, xs)
pred_ref = model_ref(xs)
loss = F.cross_entropy(pred, ys)
loss_ref = F.cross_entropy(pred_ref, ys)

grads = torch.autograd.grad(loss, params, allow_unused=True)
updates, optim_state = optim.update(grads, optim_state, params=params, inplace=inplace)
params = torchopt.apply_updates(params, updates, inplace=inplace)

optim_ref.zero_grad()
loss_ref.backward()
optim_ref.step()

helpers.assert_model_all_close((params, buffers), model_ref, model_base, dtype=dtype)
_set_use_chain_flat(True)


@helpers.parametrize(
dtype=[torch.float64],
lr=[1e-2, 1e-3, 1e-4],
@@ -210,6 +267,120 @@ def test_adam(
_set_use_chain_flat(True)


@helpers.parametrize(
dtype=[torch.float64],
lr=[1e-2, 1e-3, 1e-4],
betas=[(0.9, 0.999), (0.95, 0.9995)],
eps=[1e-8],
inplace=[True, False],
weight_decay=[0.0, 1e-2],
use_chain_flat=[True, False],
)
def test_radam(
dtype: torch.dtype,
lr: float,
betas: tuple[float, float],
eps: float,
inplace: bool,
weight_decay: float,
use_chain_flat: bool,
) -> None:
_set_use_chain_flat(use_chain_flat)

model, model_ref, model_base, loader = helpers.get_models(device='cpu', dtype=dtype)

fmodel, params, buffers = functorch.make_functional_with_buffers(model)
optim = torchopt.radam(
lr,
betas=betas,
eps=eps,
weight_decay=weight_decay,
)
optim_state = optim.init(params)
optim_ref = torch.optim.RAdam(
model_ref.parameters(),
lr,
betas=betas,
eps=eps,
weight_decay=weight_decay,
)

for xs, ys in loader:
xs = xs.to(dtype=dtype)
pred = fmodel(params, buffers, xs)
pred_ref = model_ref(xs)
loss = F.cross_entropy(pred, ys)
loss_ref = F.cross_entropy(pred_ref, ys)

grads = torch.autograd.grad(loss, params, allow_unused=True)
updates, optim_state = optim.update(grads, optim_state, params=params, inplace=inplace)
params = torchopt.apply_updates(params, updates, inplace=inplace)

optim_ref.zero_grad()
loss_ref.backward()
optim_ref.step()

helpers.assert_model_all_close((params, buffers), model_ref, model_base, dtype=dtype)
_set_use_chain_flat(True)


@helpers.parametrize(
dtype=[torch.float64],
lr=[1e-2, 1e-3, 1e-4],
betas=[(0.9, 0.999), (0.95, 0.9995)],
eps=[1e-8],
inplace=[True, False],
weight_decay=[0.0, 1e-2],
use_chain_flat=[True, False],
)
def test_adamax(
dtype: torch.dtype,
lr: float,
betas: tuple[float, float],
eps: float,
inplace: bool,
weight_decay: float,
use_chain_flat: bool,
) -> None:
_set_use_chain_flat(use_chain_flat)

model, model_ref, model_base, loader = helpers.get_models(device='cpu', dtype=dtype)

fmodel, params, buffers = functorch.make_functional_with_buffers(model)
optim = torchopt.adamax(
lr,
betas=betas,
eps=eps,
weight_decay=weight_decay,
)
optim_state = optim.init(params)
optim_ref = torch.optim.Adamax(
model_ref.parameters(),
lr,
betas=betas,
eps=eps,
weight_decay=weight_decay,
)

for xs, ys in loader:
xs = xs.to(dtype=dtype)
pred = fmodel(params, buffers, xs)
pred_ref = model_ref(xs)
loss = F.cross_entropy(pred, ys)
loss_ref = F.cross_entropy(pred_ref, ys)

grads = torch.autograd.grad(loss, params, allow_unused=True)
updates, optim_state = optim.update(grads, optim_state, params=params, inplace=inplace)
params = torchopt.apply_updates(params, updates, inplace=inplace)

optim_ref.zero_grad()
loss_ref.backward()
optim_ref.step()

helpers.assert_model_all_close((params, buffers), model_ref, model_base, dtype=dtype)
_set_use_chain_flat(True)


@helpers.parametrize(
dtype=[torch.float64],
outer_lr=[1e-2, 1e-3, 1e-4],