clovaai · bhheo · Aug 27, 2020 · Aug 27, 2020
diff --git a/adamp/adamp.py b/adamp/adamp.py
@@ -6,6 +6,7 @@
 
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 from torch.optim.optimizer import Optimizer, required
 import math
 
@@ -26,11 +27,7 @@ def _cosine_similarity(self, x, y, eps, view_func):
         x = view_func(x)
         y = view_func(y)
 
-        x_norm = x.norm(dim=1).add_(eps)
-        y_norm = y.norm(dim=1).add_(eps)
-        dot = (x * y).sum(dim=1)
-
-        return dot.abs() / x_norm / y_norm
+        return F.cosine_similarity(x, y, dim=1, eps=eps).abs_()
 
     def _projection(self, p, grad, perturb, delta, wd_ratio, eps):
         wd = 1
@@ -77,8 +74,8 @@ def step(self, closure=None):
                 bias_correction1 = 1 - beta1 ** state['step']
                 bias_correction2 = 1 - beta2 ** state['step']
 
-                exp_avg.mul_(beta1).add_(1 - beta1, grad)
-                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
+                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
+                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
 
                 denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                 step_size = group['lr'] / bias_correction1
@@ -98,6 +95,6 @@ def step(self, closure=None):
                     p.data.mul_(1 - group['lr'] * group['weight_decay'] * wd_ratio)
 
                 # Step
-                p.data.add_(-step_size, perturb)
+                p.data.add_(perturb, alpha=-step_size)
 
         return loss
diff --git a/adamp/sgdp.py b/adamp/sgdp.py
@@ -6,6 +6,7 @@
 
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 from torch.optim.optimizer import Optimizer, required
 import math
 
@@ -26,11 +27,7 @@ def _cosine_similarity(self, x, y, eps, view_func):
         x = view_func(x)
         y = view_func(y)
 
-        x_norm = x.norm(dim=1).add_(eps)
-        y_norm = y.norm(dim=1).add_(eps)
-        dot = (x * y).sum(dim=1)
-
-        return dot.abs() / x_norm / y_norm
+        return F.cosine_similarity(x, y, dim=1, eps=eps).abs_()
 
     def _projection(self, p, grad, perturb, delta, wd_ratio, eps):
         wd = 1
@@ -54,7 +51,6 @@ def step(self, closure=None):
             loss = closure()
 
         for group in self.param_groups:
-            weight_decay = group['weight_decay']
             momentum = group['momentum']
             dampening = group['dampening']
             nesterov = group['nesterov']
@@ -71,7 +67,7 @@ def step(self, closure=None):
 
                 # SGD
                 buf = state['momentum']
-                buf.mul_(momentum).add_(1 - dampening, grad)
+                buf.mul_(momentum).add_(grad, alpha=1 - dampening)
                 if nesterov:
                     d_p = grad + momentum * buf
                 else:
@@ -83,10 +79,10 @@ def step(self, closure=None):
                     d_p, wd_ratio = self._projection(p, grad, d_p, group['delta'], group['wd_ratio'], group['eps'])
 
                 # Weight decay
-                if weight_decay != 0:
+                if group['weight_decay'] > 0:
                     p.data.mul_(1 - group['lr'] * group['weight_decay'] * wd_ratio / (1-momentum))
 
                 # Step
-                p.data.add_(-group['lr'], d_p)
+                p.data.add_(d_p, alpha=-group['lr'])
 
         return loss