pyg-team · rusty1s · May 31, 2023 · May 30, 2023 · May 30, 2023 · May 31, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added `metis` partitioning ([#229](https://github.com/pyg-team/pyg-lib/pull/229))
 - Enable `hetero_neighbor_sample` to work in parallel ([#211](https://github.com/pyg-team/pyg-lib/pull/211))
 ### Changed
+- Updated `cutlass` version for speed boosts in `segment_matmul` and `grouped_matmul` ([#235](https://github.com/pyg-team/pyg-lib/pull/235))
 - Drop nested tensor wrapper for `grouped_matmul` implementation ([#226](https://github.com/pyg-team/pyg-lib/pull/226))
 - Fixed TorchScript support in `grouped_matmul` ([#220](https://github.com/pyg-team/pyg-lib/pull/220))
 ### Removed

diff --git a/pyg_lib/csrc/ops/cuda/matmul_kernel.cu b/pyg_lib/csrc/ops/cuda/matmul_kernel.cu
@@ -147,8 +147,7 @@ void grouped_matmul_out_kernel(const at::TensorList input,
             float, 1, float, float>,                   //
         cutlass::gemm::threadblock::                   // Swizzling Operator
         GemmIdentityThreadblockSwizzle<8>,             //
-        2,                                             // Stages
-        cutlass::arch::OpMultiplyAdd                   // Operation
+        2                                              // Stages
         >::GemmKernel;
     run_grouped_gemm<GemmKernel_Volta>(input, other, out, segment);
   } else {
@@ -184,8 +183,7 @@ void grouped_matmul_out_kernel(const at::TensorList input,
                   float, 1, float, float>,                   //
               cutlass::gemm::threadblock::        // Swizzling Operator
               GemmIdentityThreadblockSwizzle<8>,  //
-              3,                                  // Stages
-              cutlass::arch::OpMultiplyAdd        // Operation
+              3                                   // Stages
               >::GemmKernel;
       int grouped_shared_mem =
           shared_memory_for_kernel<DefaultGemmKernel_TF32>();
@@ -217,8 +215,7 @@ void grouped_matmul_out_kernel(const at::TensorList input,
                     float, 1, float, float>,                   //
                 cutlass::gemm::threadblock::        // Swizzling Operator
                 GemmIdentityThreadblockSwizzle<8>,  //
-                3,                                  // Stages
-                cutlass::arch::OpMultiplyAdd        // Operation
+                3                                   // Stages
                 >::GemmKernel;
         run_grouped_gemm<SmallGemmKernel_TF32>(input, other, out, segment);
       }
@@ -246,8 +243,7 @@ void grouped_matmul_out_kernel(const at::TensorList input,
                   float, 1, float, float>,                   //
               cutlass::gemm::threadblock::        // Swizzling Operator
               GemmIdentityThreadblockSwizzle<8>,  //
-              3,                                  // Stages
-              cutlass::arch::OpMultiplyAdd        // Operation
+              3                                   // Stages
               >::GemmKernel;
       int grouped_shared_mem =
           shared_memory_for_kernel<DefaultGemmKernel_FP32>();
@@ -279,8 +275,7 @@ void grouped_matmul_out_kernel(const at::TensorList input,
                     float, 1, float, float>,                   //
                 cutlass::gemm::threadblock::        // Swizzling Operator
                 GemmIdentityThreadblockSwizzle<8>,  //
-                3,                                  // Stages
-                cutlass::arch::OpMultiplyAdd        // Operation
+                3                                   // Stages
                 >::GemmKernel;
         run_grouped_gemm<SmallGemmKernel_FP32>(input, other, out, segment);
       }

diff --git a/setup.cfg b/setup.cfg
@@ -16,7 +16,7 @@ classifiers=
 test=pytest
 
 [tool:pytest]
-addopts=--capture=no
+addopts=--capture=no --ignore=third_party
 
 [flake8]
 ignore=E731

diff --git a/third_party/cutlass b/third_party/cutlass