Traceable RMSNorm (#1861)

yanboliang · web-flow · commit ac8214ee6ba7 · 2024-11-28T20:02:49.000+09:00
diff --git a/apex/normalization/fused_layer_norm.py b/apex/normalization/fused_layer_norm.py
@@ -5,6 +5,7 @@
 from torch.nn.parameter import Parameter
 from torch.nn import init
 from torch.nn import functional as F
+from typing import List, Tuple
 
 from apex._autocast_utils import _cast_if_autocast_enabled
 
@@ -91,6 +92,125 @@ def backward(ctx, grad_output):
         return grad_input, grad_weight, None, None, None
 
 
+@torch.library.custom_op("apex::fused_rms_norm_affine_fwd", mutates_args=())
+def fused_rms_norm_affine_fwd(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    normalized_shape: List[int],
+    eps: float,
+    memory_efficient: bool = False,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Run the fused affine RMSNorm forward kernel as a ``torch.library`` custom op.
+
+    Args:
+        input: Tensor to normalize; a contiguous version is handed to the kernel.
+        weight: Affine weight; a contiguous version is handed to the kernel.
+        normalized_shape: Forwarded unchanged to ``rms_forward_affine``.
+        eps: Forwarded unchanged to ``rms_forward_affine``.
+        memory_efficient: Not read by the kernel call below. It is part of the
+            op signature so that the autograd ``setup_context`` hook, which
+            receives the op inputs, can decide which tensor to save.
+
+    Returns:
+        The ``(output, invvar)`` pair produced by ``rms_forward_affine``.
+    """
+    global fused_layer_norm_cuda
+    # Load the extension module on first use instead of at import time.
+    if fused_layer_norm_cuda is None:
+        fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda")
+
+    contiguous_input = input.contiguous()
+    contiguous_weight = weight.contiguous()
+    kernel_result = fused_layer_norm_cuda.rms_forward_affine(
+        contiguous_input, normalized_shape, contiguous_weight, eps
+    )
+    # Unpack explicitly so the op always hands back a 2-tuple of tensors.
+    output, invvar = kernel_result
+    return output, invvar
+
+
+@fused_rms_norm_affine_fwd.register_fake
+def fused_rms_norm_affine_fwd_fake(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    normalized_shape: List[int],
+    eps: float,
+    memory_efficient: bool = False,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Fake (metadata-only) implementation of ``apex::fused_rms_norm_affine_fwd``.
+
+    Allocates uninitialized tensors that describe the op's two outputs without
+    calling the CUDA extension.
+
+    Returns:
+        ``(output, invvar)`` where ``output`` is ``empty_like`` of the contiguous
+        input and ``invvar`` is a 1-D tensor whose length is the product of the
+        dimensions in front of the trailing ``len(normalized_shape)`` dims. Its
+        dtype is float32 for float16/bfloat16 inputs and the input dtype
+        otherwise.
+    """
+    input = input.contiguous()
+    # Same call as the real op makes; the result is not used further here.
+    weight = weight.contiguous()
+
+    # Number of dimensions that are not covered by ``normalized_shape``.
+    leading_dims = input.ndim - len(normalized_shape)
+    rows = 1
+    for axis in range(leading_dims):
+        rows *= input.shape[axis]
+
+    is_low_precision = input.dtype in [torch.float16, torch.bfloat16]
+    stat_dtype = torch.float32 if is_low_precision else input.dtype
+
+    output = torch.empty_like(input)
+    invvar = torch.empty(
+        [rows],
+        dtype=stat_dtype,
+        device=input.device,
+        requires_grad=input.requires_grad,
+        memory_format=torch.contiguous_format,
+    )
+    return output, invvar
+
+
+@torch.library.custom_op("apex::fused_rms_norm_affine_bwd", mutates_args=())
+def fused_rms_norm_affine_bwd(
+    grad_output: torch.Tensor,
+    invvar: torch.Tensor,
+    input_or_output: torch.Tensor,
+    normalized_shape: List[int],
+    weight: torch.Tensor,
+    eps: float,
+    memory_efficient: bool = False,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Run the fused affine RMSNorm backward kernel as a ``torch.library`` custom op.
+
+    Args:
+        grad_output: Gradient w.r.t. the forward output; made contiguous before
+            it is passed to the kernel.
+        invvar: Second output of the forward op, as saved for backward.
+        input_or_output: The tensor saved by ``setup_context``: the forward
+            output when ``memory_efficient`` is true, the forward input otherwise.
+        normalized_shape: Forwarded unchanged to ``rms_backward_affine``.
+        weight: Affine weight saved by ``setup_context``.
+        eps: Forwarded unchanged to ``rms_backward_affine``.
+        memory_efficient: Forwarded unchanged to ``rms_backward_affine``.
+
+    Returns:
+        ``(grad_input, grad_weight)`` as produced by ``rms_backward_affine``.
+    """
+    global fused_layer_norm_cuda
+    # Same lazy import as the forward op, so this op does not depend on the
+    # forward op having been called first in the current process.
+    if fused_layer_norm_cuda is None:
+        fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda")
+
+    grad_input, grad_weight = fused_layer_norm_cuda.rms_backward_affine(
+        grad_output.contiguous(),
+        invvar,
+        input_or_output,
+        normalized_shape,
+        weight,
+        eps,
+        memory_efficient,
+    )
+    return grad_input, grad_weight
+
+
+@fused_rms_norm_affine_bwd.register_fake
+def fused_rms_norm_affine_bwd_fake(
+    grad_output: torch.Tensor,
+    invvar: torch.Tensor,
+    input_or_output: torch.Tensor,
+    normalized_shape: List[int],
+    weight: torch.Tensor,
+    eps: float,
+    memory_efficient: bool = False,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Fake (metadata-only) implementation of ``apex::fused_rms_norm_affine_bwd``.
+
+    Returns:
+        ``(grad_input, grad_weight)``: uninitialized tensors created with
+        ``empty_like`` from ``input_or_output`` and ``weight`` respectively.
+        The CUDA extension is not called.
+    """
+    return torch.empty_like(input_or_output), torch.empty_like(weight)
+
+
+def backward(ctx, grad_output, grad_invvar):
+    """Autograd backward formula registered for ``fused_rms_norm_affine_fwd``.
+
+    Args:
+        ctx: Context populated by ``setup_context`` (saved tensors plus
+            ``normalized_shape``, ``eps`` and ``memory_efficient``).
+        grad_output: Gradient w.r.t. the op's first output.
+        grad_invvar: Gradient w.r.t. the op's second output; not used.
+
+    Returns:
+        One entry per forward argument: gradients for ``input`` and ``weight``,
+        then ``None`` for ``normalized_shape``, ``eps`` and ``memory_efficient``.
+    """
+    saved_activation, saved_weight, saved_invvar = ctx.saved_tensors
+    grads = fused_rms_norm_affine_bwd(
+        grad_output,
+        saved_invvar,
+        saved_activation,
+        ctx.normalized_shape,
+        saved_weight,
+        ctx.eps,
+        ctx.memory_efficient,
+    )
+    grad_input, grad_weight = grads
+    return grad_input, grad_weight, None, None, None
+
+
+def setup_context(ctx, inputs, output):
+    """Record on ``ctx`` what ``backward`` needs after the forward op has run.
+
+    Args:
+        ctx: Autograd context to populate.
+        inputs: The forward op's arguments
+            ``(input, weight, normalized_shape, eps, memory_efficient)``.
+        output: The forward op's result ``(output, invvar)``.
+    """
+    input_, weight_, normalized_shape, eps, memory_efficient = inputs
+    output_, invvar = output
+    input_ = input_.contiguous()
+    weight_ = weight_.contiguous()
+
+    # Memory-efficient mode saves the forward output in place of the input.
+    saved_activation = output_ if memory_efficient else input_
+    ctx.save_for_backward(saved_activation, weight_, invvar)
+
+    ctx.normalized_shape = normalized_shape
+    ctx.eps = eps
+    ctx.memory_efficient = memory_efficient
+
+
+# Attach the autograd formula to the forward op: `setup_context` stores state on
+# ctx after the forward call, and `backward` reads it to compute the gradients.
+fused_rms_norm_affine_fwd.register_autograd(backward, setup_context=setup_context)
+
+
 class FusedLayerNormAffineMixedDtypesFunction(FusedLayerNormAffineFunction):
 
     @staticmethod
@@ -212,7 +332,7 @@ def mixed_dtype_fused_layer_norm_affine(input, weight, bias, normalized_shape, e
 def fused_rms_norm_affine(input, weight, normalized_shape, eps=1e-6, memory_efficient=False):
+    """Call the ``fused_rms_norm_affine_fwd`` custom op and return its first output.
+
+    The arguments are first passed through ``_cast_if_autocast_enabled``; the
+    op itself is then invoked with CUDA autocast disabled.
+    """
     args = _cast_if_autocast_enabled(input, weight, normalized_shape, eps, memory_efficient)
     with torch.amp.autocast('cuda', enabled=False):
-        return FusedRMSNormAffineFunction.apply(*args)
+        # The op returns (output, invvar); only the output is handed to callers.
+        return fused_rms_norm_affine_fwd(*args)[0]
 
 
 def fused_rms_norm(input, normalized_shape, eps=1e-6, memory_efficient=False):
diff --git a/tests/L0/run_fused_layer_norm/test_fused_layer_norm.py b/tests/L0/run_fused_layer_norm/test_fused_layer_norm.py
@@ -309,6 +309,29 @@ def test_layer_norm_export(self):
         native_x, fused_x = _prep_inputs(batch_size, normalized_shape, torch.float32)
         self._verify_export(fused, fused_x)
         self._verify_export(fused_m, fused_x)
+
+    def test_compile_fused_rms_norm(self):
+        """Compiled FusedRMSNorm matches eager in forward output and input gradient."""
+        batch_size = 16
+        normalized_shape = [32, 16]
+        input_shape = [batch_size, *normalized_shape]
+
+        eager_mod = FusedRMSNorm(
+            normalized_shape=normalized_shape, elementwise_affine=True
+        ).cuda()
+        # fullgraph=True: the module must be captured as a single graph.
+        compiled_mod = torch.compile(fullgraph=True)(eager_mod)
+
+        # Two independent leaf tensors holding the same values, one per path.
+        eager_x = torch.randn(input_shape, device="cuda").requires_grad_(True)
+        compiled_x = eager_x.detach().clone().requires_grad_(True)
+
+        # Forward: outputs must agree.
+        expected = eager_mod(eager_x)
+        actual = compiled_mod(compiled_x)
+        torch.testing.assert_close(actual, expected.detach())
+
+        # Backward: feed identical upstream gradients through both paths.
+        g_eager = torch.rand_like(expected)
+        g_compiled = g_eager.detach().clone()
+        expected.backward(g_eager)
+        actual.backward(g_compiled)
+
+        torch.testing.assert_close(eager_x.grad, compiled_x.grad)
         
 instantiate_device_type_tests(TestFusedLayerNorm, globals(), only_for=("cuda",))
 if __name__ == "__main__":