NVIDIA
diff --git a/apex/transformer/functional/fused_rope.py b/apex/transformer/functional/fused_rope.py
Lines changed: 66 additions & 21 deletions
diff --git a/csrc/megatron/fused_rotary_positional_embedding.cpp b/csrc/megatron/fused_rotary_positional_embedding.cpp
Lines changed: 72 additions & 23 deletions
@@ -17,22 +17,27 @@
 
 
 class FusedRoPEFunc(torch.autograd.Function):
-    """Fused RoPE function"""
+    """
+    Fused RoPE function
+
+    This implementation assumes the input tensor to be in `sbhd` format and the RoPE tensor to be
+    of shape (s, 1, 1, d). It accepts arbitrary memory layouts to avoid the expensive
+    `.contiguous()` calls, thus it may not achieve the best memory access pattern.
+    """
 
     @staticmethod
     def forward(
         ctx,
         t: torch.Tensor,
-        cos_: torch.Tensor,
-        sin_: torch.Tensor,
+        freqs: torch.Tensor,
         transpose_output_memory: bool = False,
     ) -> torch.Tensor:
         import fused_rotary_positional_embedding
 
         output = fused_rotary_positional_embedding.forward(
-            t, cos_, sin_, transpose_output_memory
+            t, freqs, transpose_output_memory
         )
-        ctx.save_for_backward(cos_, sin_)
+        ctx.save_for_backward(freqs)
         ctx.transpose_output_memory = transpose_output_memory
 
         return output
@@ -43,12 +48,12 @@ def backward(
     ) -> Tuple[Union[torch.Tensor, None], ...]:
         import fused_rotary_positional_embedding
 
-        cos_, sin_ = ctx.saved_tensors
+        (freqs,) = ctx.saved_tensors
         grad_input = fused_rotary_positional_embedding.backward(
-            grad_output, cos_, sin_, ctx.transpose_output_memory
+            grad_output, freqs, ctx.transpose_output_memory
         )
 
-        return grad_input, None, None, None
+        return grad_input, None, None
 
 
 def fused_apply_rotary_pos_emb(
@@ -59,39 +64,79 @@ def fused_apply_rotary_pos_emb(
     """Apply rotary positional embedding to input tensor T.
 
     Args:
-        t (Tensor): Input tensor T is of shape [seq_length, ... , dim]
-        freqs (Tensor): Rotary Positional embedding tensor freq is of shape [seq_length, ..., dim]
+        t (Tensor): Input tensor T is of shape [s, b, h, d]
+        freqs (Tensor): Rotary Positional embedding tensor freq is of shape [s, 1, 1, d] and
+        `float` dtype
         transpose_output_memory (bool): Default to False. Whether to transpose the 's' and 'b'
         dimension of the output's underlying memory format. This is very helpful when you want to
         get a contiguous tensor after calling `output.transpose(0, 1)`.
 
     Returns:
         Tensor: The input tensor after applying RoPE
     """
-    cos_ = torch.cos(freqs).to(t.dtype)
-    sin_ = torch.sin(freqs).to(t.dtype)
-    return FusedRoPEFunc.apply(t, cos_, sin_, transpose_output_memory)
+    return FusedRoPEFunc.apply(t, freqs, transpose_output_memory)
+
+
+class FusedRoPECachedFunc(torch.autograd.Function):
+    """
+    Fused RoPE function that takes precomputed (cached) `cos_` and `sin_` tensors.
+
+    This implementation assumes the input tensor to be in `sbhd` format and the `cos_`/`sin_`
+    tensors to be of shape (s, 1, 1, d). It accepts arbitrary memory layouts to avoid the expensive
+    `.contiguous()` calls, thus it may not achieve the best memory access pattern.
+    """
+
+    @staticmethod
+    def forward(
+        ctx,
+        t: torch.Tensor,
+        cos_: torch.Tensor,
+        sin_: torch.Tensor,
+        transpose_output_memory: bool = False,
+    ) -> torch.Tensor:
+        import fused_rotary_positional_embedding
+
+        output = fused_rotary_positional_embedding.forward_cached(
+            t, cos_, sin_, transpose_output_memory
+        )
+        ctx.save_for_backward(cos_, sin_)  # backward_cached is called with the same cos_/sin_
+        ctx.transpose_output_memory = transpose_output_memory
+
+        return output
+
+    @staticmethod
+    def backward(
+        ctx, grad_output: torch.Tensor
+    ) -> Tuple[Union[torch.Tensor, None], ...]:
+        import fused_rotary_positional_embedding
+
+        cos_, sin_ = ctx.saved_tensors
+        grad_input = fused_rotary_positional_embedding.backward_cached(
+            grad_output, cos_, sin_, ctx.transpose_output_memory
+        )
+        # One entry per forward() input (t, cos_, sin_, transpose_output_memory); only `t` gets a gradient.
+        return grad_input, None, None, None
 
 
 def fused_apply_rotary_pos_emb_cached(
     t: torch.Tensor,
-    cos: torch.Tensor,
-    sin: torch.Tensor,
+    cos_: torch.Tensor,
+    sin_: torch.Tensor,
     transpose_output_memory: bool = False,
 ) -> torch.Tensor:
     """Apply rotary positional embedding to input tensor T.
 
     Args:
-        t (Tensor): Input tensor T is of shape [seq_length, ... , dim]
-        cos (Tensor): Cached cosine of the rotary positional embedding tensor is of shape [seq_length, ..., dim]
-        sin (Tensor): Cached sine of the rotary positional embedding tensor is of shape [seq_length, ..., dim]
+        t (Tensor): Input tensor T is of shape [s, b, h, d]
+        cos_ (Tensor): Cached cosine of the rotary positional embedding tensor is of
+        shape [s, 1, 1, d] and dtype either `float` or the same as `t`.
+        sin_ (Tensor): Cached sine of the rotary positional embedding tensor is of
+        shape [s, 1, 1, d] and dtype either `float` or the same as `t`.
         transpose_output_memory (bool): Default to False. Whether to transpose the 's' and 'b'
         dimension of the output's underlying memory format. This is very helpful when you want to
         get a contiguous tensor after calling `output.transpose(0, 1)`.
 
     Returns:
         Tensor: The input tensor after applying RoPE
     """
-    cos_ = cos.to(t.dtype)
-    sin_ = sin.to(t.dtype)
-    return FusedRoPEFunc.apply(t, cos_, sin_, transpose_output_memory)
+    return FusedRoPECachedFunc.apply(t, cos_, sin_, transpose_output_memory)
@@ -18,15 +18,59 @@
 
 namespace fused_rope {
 
-torch::Tensor fwd_cuda(const torch::Tensor &input, const torch::Tensor &cos,
-                       const torch::Tensor &sin, const bool transpose_output);
+torch::Tensor fwd_cuda(const torch::Tensor &input, const torch::Tensor &freqs,
+                       const bool transpose_output);
 
 torch::Tensor bwd_cuda(const torch::Tensor &output_grads,
-                       const torch::Tensor &cos, const torch::Tensor &sin,
-                       const bool transpose_output);
+                       const torch::Tensor &freqs, const bool transpose_output);
+
+torch::Tensor fwd_cached_cuda(const torch::Tensor &input,
+                              const torch::Tensor &cos,
+                              const torch::Tensor &sin,
+                              const bool transpose_output);
+
+torch::Tensor bwd_cached_cuda(const torch::Tensor &output_grads,
+                              const torch::Tensor &cos,
+                              const torch::Tensor &sin,
+                              const bool transpose_output);
 
-torch::Tensor fwd(const at::Tensor &input, const at::Tensor &cos,
-                  const at::Tensor &sin, const bool transpose_output) {
+torch::Tensor fwd(const at::Tensor &input, const at::Tensor &freqs,
+                  const bool transpose_output) {  // validate arguments, then call fwd_cuda
+  TORCH_CHECK(input.dim() == 4, "expected 4D tensor");
+  TORCH_CHECK(freqs.dim() == 4, "expected 4D tensor");
+  TORCH_CHECK(input.size(0) == freqs.size(0),
+              "expected input and freqs tensor have the same sequence length");
+  TORCH_CHECK(freqs.size(1) == 1 && freqs.size(2) == 1,
+              "expected the second and third dims of the freqs tensor equal 1");
+  TORCH_CHECK(input.size(3) >= freqs.size(3),
+              "expected the last dim of the input tensor equals or is "
+              "greater than the freqs tensor");
+  TORCH_CHECK(freqs.scalar_type() == at::ScalarType::Float,
+              "Dtype of the freqs tensor must be float");
+  // All checks passed: freqs is a 4D float tensor with dims 1 and 2 equal to 1.
+  return fwd_cuda(input, freqs, transpose_output);
+}
+
+torch::Tensor bwd(const torch::Tensor &output_grads, const at::Tensor &freqs,
+                  const bool transpose_output) {  // validate arguments, then call bwd_cuda
+  TORCH_CHECK(output_grads.dim() == 4, "expected 4D tensor");
+  TORCH_CHECK(freqs.dim() == 4, "expected 4D tensor");
+  TORCH_CHECK(
+      output_grads.size(0) == freqs.size(0),
+      "expected output_grads and freqs tensor have the same sequence length");
+  TORCH_CHECK(freqs.size(1) == 1 && freqs.size(2) == 1,
+              "expected the second and third dims of the freqs tensor equal 1");
+  TORCH_CHECK(output_grads.size(3) >= freqs.size(3),
+              "expected the last dim of the output_grads tensor equals or is "
+              "greater than the freqs tensor");
+  TORCH_CHECK(freqs.scalar_type() == at::ScalarType::Float,
+              "Dtype of the freqs tensor must be float");
+  // Same shape/dtype checks as fwd, applied to output_grads in place of input.
+  return bwd_cuda(output_grads, freqs, transpose_output);
+}
+
+torch::Tensor fwd_cached(const at::Tensor &input, const at::Tensor &cos,
+                         const at::Tensor &sin, const bool transpose_output) {
   TORCH_CHECK(input.dim() == 4, "expected 4D tensor");
   TORCH_CHECK(cos.dim() == 4, "expected 4D tensor");
   TORCH_CHECK(sin.dim() == 4, "expected 4D tensor");
@@ -38,18 +82,20 @@ torch::Tensor fwd(const at::Tensor &input, const at::Tensor &cos,
               "expected the second and third dims of the cos tensor equal 1");
   TORCH_CHECK(sin.size(1) == 1 && sin.size(2) == 1,
               "expected the second and third dims of the sin tensor equal 1");
+  TORCH_CHECK(cos.size(3) == sin.size(3),
+              "expected cos and sin tensor have the same last dim");
   TORCH_CHECK(input.size(3) >= cos.size(3),
-              "expected the last dim of the input tensor is greater than the "
-              "cos tensor");
-  TORCH_CHECK(input.size(3) >= sin.size(3),
-              "expected the last dim of the input tensor is greater than the "
-              "sin tensor");
+              "expected the last dim of the input tensor equals or is "
+              "greater than the cos tensor");
+  TORCH_CHECK(cos.scalar_type() == sin.scalar_type(),
+              "expected cos and sin tensor have the same dtype");
 
-  return fwd_cuda(input, cos, sin, transpose_output);
+  return fwd_cached_cuda(input, cos, sin, transpose_output);
 }
 
-torch::Tensor bwd(const torch::Tensor &output_grads, const at::Tensor &cos,
-                  const at::Tensor &sin, const bool transpose_output) {
+torch::Tensor bwd_cached(const torch::Tensor &output_grads,
+                         const at::Tensor &cos, const at::Tensor &sin,
+                         const bool transpose_output) {
   TORCH_CHECK(output_grads.dim() == 4, "expected 4D tensor");
   TORCH_CHECK(cos.dim() == 4, "expected 4D tensor");
   TORCH_CHECK(sin.dim() == 4, "expected 4D tensor");
@@ -63,16 +109,15 @@ torch::Tensor bwd(const torch::Tensor &output_grads, const at::Tensor &cos,
               "expected the second and third dims of the cos tensor equal 1");
   TORCH_CHECK(sin.size(1) == 1 && sin.size(2) == 1,
               "expected the second and third dims of the sin tensor equal 1");
-  TORCH_CHECK(
-      output_grads.size(3) >= cos.size(3),
-      "expected the last dim of the output_grads tensor is greater than the "
-      "cos tensor");
-  TORCH_CHECK(
-      output_grads.size(3) >= sin.size(3),
-      "expected the last dim of the output_grads tensor is greater than the "
-      "sin tensor");
+  TORCH_CHECK(cos.size(3) == sin.size(3),
+              "expected cos and sin tensor have the same last dim");
+  TORCH_CHECK(output_grads.size(3) >= cos.size(3),
+              "expected the last dim of the output_grads tensor equals or is "
+              "greater than the cos tensor");
+  TORCH_CHECK(cos.scalar_type() == sin.scalar_type(),
+              "expected cos and sin tensor have the same dtype");
 
-  return bwd_cuda(output_grads, cos, sin, transpose_output);
+  return bwd_cached_cuda(output_grads, cos, sin, transpose_output);
 }
 
 }  // end namespace fused_rope
@@ -82,4 +127,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
         "Fused Rotary Positional Embedding -- Forward.");
   m.def("backward", &fused_rope::bwd,
         "Fused Rotary Positional Embedding -- Backward.");
+  m.def("forward_cached", &fused_rope::fwd_cached,
+        "Fused Rotary Positional Embedding Cached -- Forward.");
+  m.def("backward_cached", &fused_rope::bwd_cached,
+        "Fused Rotary Positional Embedding Cached -- Backward.");
 }