NVIDIA
diff --git a/apex/transformer/functional/fused_rope.py b/apex/transformer/functional/fused_rope.py
Lines changed: 32 additions & 8 deletions
diff --git a/csrc/megatron/fused_rotary_positional_embedding.cpp b/csrc/megatron/fused_rotary_positional_embedding.cpp
Lines changed: 9 additions & 14 deletions
diff --git a/csrc/megatron/fused_rotary_positional_embedding.h b/csrc/megatron/fused_rotary_positional_embedding.h
Lines changed: 68 additions & 47 deletions
@@ -17,14 +17,23 @@
 
 
 class FusedRoPEFunc(torch.autograd.Function):
+    """Autograd function for the fused rotary positional embedding (RoPE) kernels."""
+
     @staticmethod
     def forward(
-        ctx, t: torch.Tensor, cos_: torch.Tensor, sin_: torch.Tensor
+        ctx,
+        t: torch.Tensor,
+        cos_: torch.Tensor,
+        sin_: torch.Tensor,
+        transpose_output_memory: bool = False,
     ) -> torch.Tensor:
         import fused_rotary_positional_embedding
 
-        output = fused_rotary_positional_embedding.forward(t, cos_, sin_)
+        output = fused_rotary_positional_embedding.forward(
+            t, cos_, sin_, transpose_output_memory
+        )
         ctx.save_for_backward(cos_, sin_)
+        ctx.transpose_output_memory = transpose_output_memory
 
         return output
 
@@ -35,39 +44,54 @@ def backward(
         import fused_rotary_positional_embedding
 
         cos_, sin_ = ctx.saved_tensors
-        grad_q = fused_rotary_positional_embedding.backward(grad_output, cos_, sin_)
+        grad_input = fused_rotary_positional_embedding.backward(
+            grad_output, cos_, sin_, ctx.transpose_output_memory
+        )
 
-        return grad_q, None, None
+        return grad_input, None, None, None
 
 
-def fused_apply_rotary_pos_emb(t: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
+def fused_apply_rotary_pos_emb(
+    t: torch.Tensor,
+    freqs: torch.Tensor,
+    transpose_output_memory: bool = False,
+) -> torch.Tensor:
     """Apply rotary positional embedding to input tensor T.
 
     Args:
         t (Tensor): Input tensor T is of shape [seq_length, ... , dim]
         freqs (Tensor): Rotary Positional embedding tensor freq is of shape [seq_length, ..., dim]
+        transpose_output_memory (bool): Defaults to False. Whether to transpose the 's' and 'b'
+        dimensions of the output's underlying memory format. This is useful when you want to
+        get a contiguous tensor after calling `output.transpose(0, 1)`.
 
     Returns:
         Tensor: The input tensor after applying RoPE
     """
     cos_ = torch.cos(freqs).to(t.dtype)
     sin_ = torch.sin(freqs).to(t.dtype)
-    return FusedRoPEFunc.apply(t, cos_, sin_)
+    return FusedRoPEFunc.apply(t, cos_, sin_, transpose_output_memory)
 
 
 def fused_apply_rotary_pos_emb_cached(
-    t: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
+    t: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    transpose_output_memory: bool = False,
 ) -> torch.Tensor:
     """Apply rotary positional embedding to input tensor T.
 
     Args:
         t (Tensor): Input tensor T is of shape [seq_length, ... , dim]
         cos (Tensor): Cached cosine of the rotary positional embedding tensor is of shape [seq_length, ..., dim]
         sin (Tensor): Cached sine of the rotary positional embedding tensor is of shape [seq_length, ..., dim]
+        transpose_output_memory (bool): Defaults to False. Whether to transpose the 's' and 'b'
+        dimensions of the output's underlying memory format. This is useful when you want to
+        get a contiguous tensor after calling `output.transpose(0, 1)`.
 
     Returns:
         Tensor: The input tensor after applying RoPE
     """
     cos_ = cos.to(t.dtype)
     sin_ = sin.to(t.dtype)
-    return FusedRoPEFunc.apply(t, cos_, sin_)
+    return FusedRoPEFunc.apply(t, cos_, sin_, transpose_output_memory)
@@ -19,16 +19,14 @@
 namespace fused_rope {
 
 torch::Tensor fwd_cuda(const torch::Tensor &input, const torch::Tensor &cos,
-                       const torch::Tensor &sin);
+                       const torch::Tensor &sin, const bool transpose_output);
 
 torch::Tensor bwd_cuda(const torch::Tensor &output_grads,
-                       const torch::Tensor &cos, const torch::Tensor &sin);
+                       const torch::Tensor &cos, const torch::Tensor &sin,
+                       const bool transpose_output);
 
-torch::Tensor fwd(const at::Tensor &input_, const at::Tensor &cos_,
-                  const at::Tensor &sin_) {
-  auto input = input_.contiguous();
-  auto cos = cos_.contiguous();
-  auto sin = sin_.contiguous();
+torch::Tensor fwd(const at::Tensor &input, const at::Tensor &cos,
+                  const at::Tensor &sin, const bool transpose_output) {
   TORCH_CHECK(input.dim() == 4, "expected 4D tensor");
   TORCH_CHECK(cos.dim() == 4, "expected 4D tensor");
   TORCH_CHECK(sin.dim() == 4, "expected 4D tensor");
@@ -47,14 +45,11 @@ torch::Tensor fwd(const at::Tensor &input_, const at::Tensor &cos_,
               "expected the last dim of the input tensor is greater than the "
               "sin tensor");
 
-  return fwd_cuda(input, cos, sin);
+  return fwd_cuda(input, cos, sin, transpose_output);
 }
 
-torch::Tensor bwd(const torch::Tensor &output_grads_, const at::Tensor &cos_,
-                  const at::Tensor &sin_) {
-  auto output_grads = output_grads_.contiguous();
-  auto cos = cos_.contiguous();
-  auto sin = sin_.contiguous();
+torch::Tensor bwd(const torch::Tensor &output_grads, const at::Tensor &cos,
+                  const at::Tensor &sin, const bool transpose_output) {
   TORCH_CHECK(output_grads.dim() == 4, "expected 4D tensor");
   TORCH_CHECK(cos.dim() == 4, "expected 4D tensor");
   TORCH_CHECK(sin.dim() == 4, "expected 4D tensor");
@@ -77,7 +72,7 @@ torch::Tensor bwd(const torch::Tensor &output_grads_, const at::Tensor &cos_,
       "expected the last dim of the output_grads tensor is greater than the "
       "sin tensor");
 
-  return bwd_cuda(output_grads, cos, sin);
+  return bwd_cuda(output_grads, cos, sin, transpose_output);
 }
 
 }  // end namespace fused_rope
 
@@ -25,70 +25,83 @@
 namespace {
 
 template <typename scalar_t>
-__global__ void fused_rope_forward(int sq, int b, int np, int hn, int hn2,
+__global__ void fused_rope_forward(int h, int d, int d2, int stride_s,
+                                   int stride_b, int stride_h, int stride_d,
+                                   int o_stride_s, int o_stride_b,
+                                   int o_stride_h, int o_stride_d,
                                    const scalar_t* src, const scalar_t* cos,
                                    const scalar_t* sin, scalar_t* dst) {
-  int sq_id = blockIdx.x, b_id = blockIdx.y;
-  int offset_block = sq_id * b * np * hn + b_id * np * hn;
+  int s_id = blockIdx.x, b_id = blockIdx.y;
+  int offset_block = s_id * stride_s + b_id * stride_b;
+  int offset_block_dst = s_id * o_stride_s + b_id * o_stride_b;
 #pragma unroll
-  for (int hn_id = threadIdx.x; hn_id < hn2; hn_id += blockDim.x) {
-    scalar_t v_cos = cos[sq_id * hn2 + hn_id];
-    scalar_t v_sin = sin[sq_id * hn2 + hn_id];
+  for (int d_id = threadIdx.x; d_id < d2; d_id += blockDim.x) {
+    scalar_t v_cos = cos[s_id * d2 + d_id];
+    scalar_t v_sin = sin[s_id * d2 + d_id];
 #pragma unroll
-    for (int head_id = threadIdx.y; head_id < np; head_id += blockDim.y) {
-      int offset_src_dst = offset_block + head_id * hn + hn_id;
-      scalar_t v_src = src[offset_src_dst];
-      scalar_t v_src_rotate = (hn_id + hn2 / 2 < hn2)
-                                  ? -src[offset_src_dst + hn2 / 2]
-                                  : src[offset_src_dst + hn2 / 2 - hn2];
-      dst[offset_src_dst] = v_src * v_cos + v_src_rotate * v_sin;
+    for (int h_id = threadIdx.y; h_id < h; h_id += blockDim.y) {
+      int offset_src = offset_block + h_id * stride_h + d_id * stride_d;
+      int offset_dst = offset_block_dst + h_id * o_stride_h + d_id * o_stride_d;
+      scalar_t v_src = src[offset_src];
+      scalar_t v_src_rotate = (d_id + d2 / 2 < d2)
+                                  ? -src[offset_src + (d2 / 2) * stride_d]
+                                  : src[offset_src + (d2 / 2 - d2) * stride_d];
+      dst[offset_dst] = v_src * v_cos + v_src_rotate * v_sin;
     }
   }
 
   // copy the rest
-  if (hn > hn2) {
+  if (d > d2) {
 #pragma unroll
-    for (int head_id = threadIdx.y; head_id < np; head_id += blockDim.y) {
-      int offset_head = offset_block + head_id * hn;
+    for (int h_id = threadIdx.y; h_id < h; h_id += blockDim.y) {
+      int offset_head = offset_block + h_id * stride_h;
+      int offset_head_dst = offset_block_dst + h_id * o_stride_h;
 #pragma unroll
-      for (int hn_id = hn2 + threadIdx.x; hn_id < hn; hn_id += blockDim.x) {
-        dst[offset_head + hn_id] = src[offset_head + hn_id];
+      for (int d_id = d2 + threadIdx.x; d_id < d; d_id += blockDim.x) {
+        dst[offset_head_dst + d_id * o_stride_d] =
+            src[offset_head + d_id * stride_d];
       }
     }
   }
 }
 
 template <typename scalar_t>
-__global__ void fused_rope_backward(int sq, int b, int np, int hn, int hn2,
+__global__ void fused_rope_backward(int h, int d, int d2, int stride_s,
+                                    int stride_b, int stride_h, int stride_d,
+                                    int o_stride_s, int o_stride_b,
+                                    int o_stride_h, int o_stride_d,
                                     const scalar_t* src, const scalar_t* cos,
                                     const scalar_t* sin, scalar_t* dst) {
-  int sq_id = blockIdx.x, b_id = blockIdx.y;
-  int offset_block = sq_id * b * np * hn + b_id * np * hn;
+  int s_id = blockIdx.x, b_id = blockIdx.y;
+  int offset_block = s_id * stride_s + b_id * stride_b;
+  int offset_block_dst = s_id * o_stride_s + b_id * o_stride_b;
 #pragma unroll
-  for (int hn_id = threadIdx.x; hn_id < hn2; hn_id += blockDim.x) {
-    scalar_t v_cos = cos[sq_id * hn2 + hn_id];
-    scalar_t v_sin = (hn_id + hn2 / 2 < hn2)
-                         ? sin[sq_id * hn2 + hn_id + hn2 / 2]
-                         : -sin[sq_id * hn2 + hn_id + hn2 / 2 - hn2];
+  for (int d_id = threadIdx.x; d_id < d2; d_id += blockDim.x) {
+    scalar_t v_cos = cos[s_id * d2 + d_id];
+    scalar_t v_sin = (d_id + d2 / 2 < d2)
+                         ? sin[s_id * d2 + d_id + d2 / 2]
+                         : -sin[s_id * d2 + d_id + d2 / 2 - d2];
 #pragma unroll
-    for (int head_id = threadIdx.y; head_id < np; head_id += blockDim.y) {
-      int offset_src_dst = offset_block + head_id * hn + hn_id;
-      scalar_t v_src = src[offset_src_dst];
-      scalar_t v_src_rotate = (hn_id + hn2 / 2 < hn2)
-                                  ? src[offset_src_dst + hn2 / 2]
-                                  : src[offset_src_dst + hn2 / 2 - hn2];
-      dst[offset_src_dst] = v_src * v_cos + v_src_rotate * v_sin;
+    for (int h_id = threadIdx.y; h_id < h; h_id += blockDim.y) {
+      int offset_src = offset_block + h_id * stride_h + d_id * stride_d;
+      int offset_dst = offset_block_dst + h_id * o_stride_h + d_id * o_stride_d;
+      scalar_t v_src = src[offset_src];
+      scalar_t v_src_rotate = (d_id + d2 / 2 < d2)
+                                  ? src[offset_src + (d2 / 2) * stride_d]
+                                  : src[offset_src + (d2 / 2 - d2) * stride_d];
+      dst[offset_dst] = v_src * v_cos + v_src_rotate * v_sin;
     }
   }
 
   // handle the tail
-  if (hn > hn2) {
+  if (d > d2) {
 #pragma unroll
-    for (int head_id = threadIdx.y; head_id < np; head_id += blockDim.y) {
-      int offset_head = offset_block + head_id * hn;
+    for (int h_id = threadIdx.y; h_id < h; h_id += blockDim.y) {
+      int offset_head = offset_block + h_id * stride_h;
+      int offset_head_dst = offset_block_dst + h_id * o_stride_h;
 #pragma unroll
-      for (int hn_id = hn2 + threadIdx.x; hn_id < hn; hn_id += blockDim.x) {
-        dst[offset_head + hn_id] = src[offset_head + hn_id];
+      for (int d_id = d2 + threadIdx.x; d_id < d; d_id += blockDim.x) {
+        dst[offset_head_dst + d_id * o_stride_d] = src[offset_head + d_id * stride_d];
       }
     }
   }
@@ -97,32 +110,40 @@ __global__ void fused_rope_backward(int sq, int b, int np, int hn, int hn2,
 }  // end of anonymous namespace
 
 template <typename scalar_t>
-void dispatch_fused_rope_forward(int sq, int b, int np, int hn, int hn2,
+void dispatch_fused_rope_forward(int s, int b, int h, int d, int d2,
+                                 int stride_s, int stride_b, int stride_h,
+                                 int stride_d, int o_stride_s, int o_stride_b,
+                                 int o_stride_h, int o_stride_d,
                                  const scalar_t* input, const scalar_t* cos,
                                  const scalar_t* sin, scalar_t* output) {
   auto stream = at::cuda::getCurrentCUDAStream();
 
-  int warps_per_block = np < 16 ? 4 : 8;
-  dim3 blocks(sq, b);
+  int warps_per_block = h < 16 ? 4 : 8;
+  dim3 blocks(s, b);
   dim3 threads(C10_WARP_SIZE, warps_per_block);
 
-  fused_rope_forward<<<blocks, threads, 0, stream>>>(sq, b, np, hn, hn2, input,
-                                                     cos, sin, output);
+  fused_rope_forward<<<blocks, threads, 0, stream>>>(
+      h, d, d2, stride_s, stride_b, stride_h, stride_d, o_stride_s, o_stride_b,
+      o_stride_h, o_stride_d, input, cos, sin, output);
   C10_CUDA_KERNEL_LAUNCH_CHECK();
 }
 
 template <typename scalar_t>
-void dispatch_fused_rope_backward(int sq, int b, int np, int hn, int hn2,
+void dispatch_fused_rope_backward(int s, int b, int h, int d, int d2,
+                                  int stride_s, int stride_b, int stride_h,
+                                  int stride_d, int o_stride_s, int o_stride_b,
+                                  int o_stride_h, int o_stride_d,
                                   const scalar_t* output_grads,
                                   const scalar_t* cos, const scalar_t* sin,
                                   scalar_t* input_grads) {
   auto stream = at::cuda::getCurrentCUDAStream();
 
-  int warps_per_block = np < 16 ? 4 : 8;
-  dim3 blocks(sq, b);
+  int warps_per_block = h < 16 ? 4 : 8;
+  dim3 blocks(s, b);
   dim3 threads(C10_WARP_SIZE, warps_per_block);
 
   fused_rope_backward<<<blocks, threads, 0, stream>>>(
-      sq, b, np, hn, hn2, output_grads, cos, sin, input_grads);
+      h, d, d2, stride_s, stride_b, stride_h, stride_d, o_stride_s, o_stride_b,
+      o_stride_h, o_stride_d, output_grads, cos, sin, input_grads);
   C10_CUDA_KERNEL_LAUNCH_CHECK();
 }