[FMHA] Remove zero fill for softmax output (#1565)

vasunvidia · web-flow · commit b1c76000d4b9 · 2023-03-06T18:28:41.000-08:00
* Remove zero fill for softmax output

* Initial commit for mha_fill_kernel
diff --git a/apex/contrib/csrc/fmha/fmha_api.cpp b/apex/contrib/csrc/fmha/fmha_api.cpp
@@ -30,6 +30,7 @@
 
 #include "fmha.h"
 
+extern at::Tensor & mha_fill(at::Tensor &self, const at::Tensor &start_index);  // defined in src/fmha_fill.cu; zeroes rows of self from start_index[0] onward
 void set_params(Fused_multihead_attention_fprop_params &params,
                 // sizes
                 const size_t b,
@@ -93,6 +94,7 @@ mha_fwd(const at::Tensor &qkv,         // total x num_heads x 3 x head_size, tot
         const bool zero_tensors,
         c10::optional<at::Generator> gen_) {
 
+    using namespace torch::indexing;
     auto dprops = at::cuda::getCurrentDeviceProperties();
     TORCH_CHECK((dprops->major == 8 && dprops->minor == 0) ||
                 (dprops->major == 9 && dprops->minor == 0));
@@ -143,8 +145,7 @@ mha_fwd(const at::Tensor &qkv,         // total x num_heads x 3 x head_size, tot
     auto s = torch::empty({ batch_size, num_heads, seq_len, seq_len }, opts);
 
     if( zero_tensors ) {
-        ctx.zero_();
-        s.zero_();
+        mha_fill(ctx, cu_seqlens.index({Slice(-1,None)}));
     }
 
     auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
@@ -189,6 +190,7 @@ mha_bwd(const at::Tensor &dout,  // total x num_heads, x head_size
         const int max_seq_len,          // max sequence length to choose the kernel
         const bool zero_tensors
 ) {
+    using namespace torch::indexing;
     auto dprops = at::cuda::getCurrentDeviceProperties();
     TORCH_CHECK((dprops->major == 8 && dprops->minor == 0) ||
                 (dprops->major == 9 && dprops->minor == 0));
@@ -239,7 +241,7 @@ mha_bwd(const at::Tensor &dout,  // total x num_heads, x head_size
     auto dqkv = torch::empty_like(qkv);
 
     if( zero_tensors ) {
-        dqkv.zero_();
+        mha_fill(dqkv, cu_seqlens.index({Slice(-1,None)}));
     }
 
     Fused_multihead_attention_fprop_params params;
diff --git a/apex/contrib/csrc/fmha/src/fmha_fill.cu b/apex/contrib/csrc/fmha/src/fmha_fill.cu
@@ -0,0 +1,69 @@
+/******************************************************************************
+ * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <torch/extension.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/Dispatch.h>
+
+constexpr int block_size = 512;  // threads per block for mha_fill_kernel; row width must be a multiple of this
+constexpr int ctas_per_sm = 4;   // blocks per multiprocessor targeted when mha_fill sizes the grid
+
+// Writes zero into the trailing rows of a row-major 2-D buffer.
+//
+// Launch layout: 1-D thread blocks on a 2-D grid. gridDim.y * blockDim.x is
+// taken as the row width in elements, so each thread owns exactly one column.
+// blockIdx.x picks the first row a block handles (offset by start_row[0]);
+// subsequent rows are visited in steps of gridDim.x.
+//
+//   out_tensor  buffer to clear, addressed as out_tensor[row * width + col]
+//   start_row   pointer dereferenced on the device; element 0 is the first row to clear
+//   num_rows    exclusive upper bound on the rows that are cleared
+template <typename scalar_t>
+__global__ void
+__launch_bounds__(block_size)
+mha_fill_kernel(scalar_t* out_tensor,
+                const int32_t* const start_row,
+                const size_t num_rows) {
+    // Row width is implied by the launch configuration; columns are not bounds-checked here.
+    const size_t width = gridDim.y * blockDim.x;
+    const size_t col = blockIdx.y * blockDim.x + threadIdx.x;
+    const size_t first_row = blockIdx.x + (size_t)start_row[0];
+    for (size_t row = first_row; row < num_rows; row += gridDim.x) {
+        out_tensor[row * width + col] = 0;
+    }
+}
+
+// Zeroes rows [start_index[0], self.size(0)) of `self`, viewed as a 2-D
+// (self.size(0), product-of-remaining-dims) matrix, and returns `self`.
+//
+//   self         contiguous tensor; its flattened row width must be a multiple
+//                of block_size. An empty tensor is returned untouched.
+//   start_index  non-empty int32 CUDA tensor; its data pointer is handed to the
+//                kernel, which reads element 0 as the first row to clear.
+//
+// Fails via TORCH_CHECK when `self` is not contiguous, when the row width is not
+// a multiple of block_size or needs more blocks than the device allows in
+// gridDim.y, or when `start_index` is empty or not a CUDA tensor.
+at::Tensor & mha_fill(at::Tensor &self, const at::Tensor &start_index) {
+    // Check layout before view() so callers get this message rather than view's own error.
+    TORCH_CHECK (self.is_contiguous(), "input not contiguous");
+    // Nothing to clear; also avoids an ambiguous view({0, -1}) and a zero-sized grid.
+    if (self.numel() == 0) {
+        return self;
+    }
+    // The kernel dereferences start_index on the device.
+    TORCH_CHECK (start_index.is_cuda() && start_index.numel() > 0,
+                 "start_index must be a non-empty CUDA tensor");
+
+    const int64_t max_tokens = self.size(0);
+    auto self_2d = self.view({max_tokens, -1});
+    const int64_t fcd_size = self_2d.size(1);
+    TORCH_CHECK (fcd_size % block_size == 0, "input size not aligned to block size");
+
+    const auto* dprops = at::cuda::getCurrentDeviceProperties();
+    // One block column per block_size-wide slice of a row.
+    const int64_t num_blk_y = fcd_size / block_size;
+    TORCH_CHECK (num_blk_y <= dprops->maxGridSize[1], "input row too wide for the fill grid");
+    // Aim for ctas_per_sm blocks per multiprocessor in total. The division is
+    // rounded up in integer arithmetic (std::ceil on an already-truncated
+    // integer quotient does nothing), which also keeps gridDim.x >= 1.
+    const int64_t target_blocks = (int64_t)dprops->multiProcessorCount * ctas_per_sm;
+    const int64_t num_blk_x = (target_blocks + num_blk_y - 1) / num_blk_y;
+    dim3 dim_grid((unsigned int)num_blk_x, (unsigned int)num_blk_y);
+    dim3 dim_block(block_size);
+
+    AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(
+        at::ScalarType::Half, at::ScalarType::BFloat16, self_2d.scalar_type(), "mha_padding_fill_", [&]() {
+            mha_fill_kernel<<<dim_grid, dim_block, 0, at::cuda::getCurrentCUDAStream()>>>(
+                self_2d.data_ptr<scalar_t>(), start_index.data_ptr<int32_t>(), max_tokens);
+            C10_CUDA_KERNEL_LAUNCH_CHECK();
+        });
+    return self;
+}
diff --git a/setup.py b/setup.py
@@ -554,6 +554,7 @@ def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int
             name="fmhalib",
             sources=[
                 "apex/contrib/csrc/fmha/fmha_api.cpp",
+                "apex/contrib/csrc/fmha/src/fmha_fill.cu",
                 "apex/contrib/csrc/fmha/src/fmha_noloop_reduce.cu",
                 "apex/contrib/csrc/fmha/src/fmha_fprop_fp16_128_64_kernel.sm80.cu",
                 "apex/contrib/csrc/fmha/src/fmha_fprop_fp16_256_64_kernel.sm80.cu",