pyg-team · rusty1s · Jun 24, 2022 · May 30, 2022 · May 30, 2022 · May 30, 2022
diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "third_party/cutlass"]
+    path = third_party/cutlass
+    url = https://github.com/NVIDIA/cutlass
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ## [Unreleased]
 ### Added
+- Added `grouped_matmul` and `segment_matmul` CUDA implementations via `cutlass` ([#51](https://github.com/pyg-team/pyg-lib/pull/51))
 - Added `pyg::sampler::neighbor_sample` interface ([#54](https://github.com/pyg-team/pyg-lib/pull/54)
 - Added `pyg::sampler::Mapper` utility for mapping global to local node indices ([#45](https://github.com/pyg-team/pyg-lib/pull/45)
 - Added benchmark script ([#45](https://github.com/pyg-team/pyg-lib/pull/45)

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -19,6 +19,11 @@ if(WITH_CUDA)
   enable_language(CUDA)
   add_definitions(-DWITH_CUDA)
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
+
+  # Add the CUTLASS headers and the CUTLASS utility headers (both located in
+  # the `third_party/cutlass` submodule) to the include path.
+  set(CUTLASS_DIR third_party/cutlass/include)
+  include_directories(${CUTLASS_DIR})
+  set(CUTLASS_UTIL_DIR third_party/cutlass/tools/util/include)
+  include_directories(${CUTLASS_UTIL_DIR})
 endif()
 
 set(CSRC pyg_lib/csrc)

diff --git a/pyg_lib/csrc/ops/cuda/matmul_kernel.cu b/pyg_lib/csrc/ops/cuda/matmul_kernel.cu
@@ -0,0 +1,158 @@
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <torch/library.h>
+
+#include <cutlass/gemm/device/gemm_grouped.h>
+#include <cutlass/gemm/kernel/default_gemm_grouped.h>
+#include <cutlass/util/host_tensor.h>
+
+namespace pyg {
+namespace ops {
+
+namespace {
+
+// Computes `out[i] = input[i] @ other[i]` for every `i` through a single
+// CUTLASS grouped GEMM, writing the results into the pre-allocated `out`.
+//
+// `input`, `other` and `out` must hold the same number of tensors. The raw
+// `data_ptr<float>()` of each tensor is passed to CUTLASS together with packed
+// row-major leading dimensions, so tensors are expected to be contiguous
+// `float` tensors (not verified yet, see the TODOs below). The last two
+// dimensions of `other[i]` are interpreted as `[K, N]`.
+//
+// Raises a `c10::Error` (via `TORCH_CHECK`) if the list lengths differ, if
+// inner dimensions do not match, or if CUTLASS reports a failure.
+void grouped_matmul_out_kernel(const std::vector<at::Tensor>& input,
+                               const std::vector<at::Tensor>& other,
+                               const std::vector<at::Tensor>& out) {
+  // TODO (matthias) Check tensor devices.
+  // TODO (matthias) Check for contiguous memory.
+
+  // All lists are indexed in lock-step below, so guard against out-of-bounds
+  // access up-front.
+  const auto num_matrices = input.size();
+  TORCH_CHECK(other.size() == num_matrices && out.size() == num_matrices,
+              "Expected 'input', 'other' and 'out' to hold the same number "
+              "of tensors");
+  if (num_matrices == 0)
+    return;  // Nothing to compute.
+
+  // TODO (matthias) Allow for other types than `float`.
+  // TODO (matthias) Are these attributes correctly set?
+  using GemmKernel = typename cutlass::gemm::kernel::DefaultGemmGrouped<
+      float,                                         // Element A
+      cutlass::layout::RowMajor,                     // Layout A
+      cutlass::ComplexTransform::kNone,              //
+      8,                                             // Granularity A
+      float,                                         // Element B
+      cutlass::layout::RowMajor,                     // Layout B
+      cutlass::ComplexTransform::kNone,              //
+      8,                                             // Granularity B
+      float,                                         // Element C&D
+      cutlass::layout::RowMajor,                     // Layout C&D
+      float,                                         // Element Accumulator
+      cutlass::arch::OpClassTensorOp,                // Operator Class Tag
+      cutlass::arch::Sm80,                           // Architecture
+      cutlass::gemm::GemmShape<256, 128, 32>,        // Threadblock-level Tile
+      cutlass::gemm::GemmShape<64, 64, 32>,          // Warp-level Tile
+      cutlass::gemm::GemmShape<16, 8, 8>,            // Instruction-level Tile
+      cutlass::epilogue::thread::LinearCombination<  // Epilogue
+          float, 8, float, float>,                   //
+      cutlass::gemm::threadblock::                   // Swizzling Operator
+      GemmIdentityThreadblockSwizzle<8>,             //
+      2,                                             // Stages
+      cutlass::arch::OpMultiplyAdd                   // Operation
+      >::GemmKernel;
+
+  // Gather the data pointers of all operands on the host ...
+  std::vector<float*> ptr_A_host(num_matrices);
+  std::vector<float*> ptr_B_host(num_matrices);
+  std::vector<float*> ptr_C_host(num_matrices);
+
+  for (size_t i = 0; i < num_matrices; ++i) {
+    ptr_A_host[i] = input[i].data_ptr<float>();
+    ptr_B_host[i] = other[i].data_ptr<float>();
+    ptr_C_host[i] = out[i].data_ptr<float>();
+  }
+
+  // ... and copy the pointer arrays into device allocations.
+  cutlass::DeviceAllocation<float*> ptr_A;
+  ptr_A.reset(num_matrices);
+  ptr_A.copy_from_host(ptr_A_host.data());
+
+  cutlass::DeviceAllocation<float*> ptr_B;
+  ptr_B.reset(num_matrices);
+  ptr_B.copy_from_host(ptr_B_host.data());
+
+  cutlass::DeviceAllocation<float*> ptr_C;
+  ptr_C.reset(num_matrices);
+  ptr_C.copy_from_host(ptr_C_host.data());
+
+  // Per-problem sizes `(m, n, k)` and leading dimensions of A, B and C.
+  std::vector<cutlass::gemm::GemmCoord> all_problems(num_matrices);
+  std::vector<int64_t> ld_A_host(num_matrices);
+  std::vector<int64_t> ld_B_host(num_matrices);
+  std::vector<int64_t> ld_C_host(num_matrices);
+
+  for (size_t i = 0; i < num_matrices; ++i) {
+    auto m = input[i].size(0), k = input[i].size(1), n = out[i].size(1);
+    // Negative indices are used so that `other[i]` may carry leading
+    // dimensions in front of its `[K, N]` matrix.
+    TORCH_CHECK(input[i].size(-1) == other[i].size(-2), "Shape mismatch");
+    all_problems[i] = cutlass::gemm::GemmCoord(m, n, k);
+    ld_A_host[i] = GemmKernel::LayoutA::packed({m, k}).stride(0);
+    ld_B_host[i] = GemmKernel::LayoutB::packed({k, n}).stride(0);
+    ld_C_host[i] = GemmKernel::LayoutC::packed({m, n}).stride(0);
+  }
+
+  cutlass::DeviceAllocation<cutlass::gemm::GemmCoord> all_problems_device;
+  all_problems_device.reset(num_matrices);
+  all_problems_device.copy_from_host(all_problems.data());
+
+  cutlass::DeviceAllocation<int64_t> ld_A;
+  ld_A.reset(num_matrices);
+  ld_A.copy_from_host(ld_A_host.data());
+
+  cutlass::DeviceAllocation<int64_t> ld_B;
+  ld_B.reset(num_matrices);
+  ld_B.copy_from_host(ld_B_host.data());
+
+  cutlass::DeviceAllocation<int64_t> ld_C;
+  ld_C.reset(num_matrices);
+  ld_C.copy_from_host(ld_C_host.data());
+
+  // Epilogue parameters `(1.0, 0.0)` for the linear combination.
+  using EpilogueOutputOp = typename GemmKernel::Epilogue::OutputOp;
+  typename EpilogueOutputOp::Params epilogue_op(1.0, 0.0);
+
+  // `ptr_C`/`ld_C` are passed twice: once as source and once as destination.
+  using GemmGrouped = cutlass::gemm::device::GemmGrouped<GemmKernel>;
+  typename GemmGrouped::Arguments args(
+      all_problems_device.get(), num_matrices, /*threadblock_count=*/1024,
+      epilogue_op, ptr_A.get(), ptr_B.get(), ptr_C.get(), ptr_C.get(),
+      ld_A.get(), ld_B.get(), ld_C.get(), ld_C.get());
+
+  GemmGrouped gemm;
+  auto status = gemm.initialize(args);
+  TORCH_CHECK(status == cutlass::Status::kSuccess, "GroupedGEMM init failed");
+  status = gemm.run();
+  TORCH_CHECK(status == cutlass::Status::kSuccess, "GroupedGEMM run failed");
+}
+
+// CUDA implementation of `pyg::grouped_matmul`.
+//
+// Allocates one output of shape `[input[i].size(0), other[i].size(-1)]` per
+// pair `(input[i], other[i])` and fills all of them through
+// `grouped_matmul_out_kernel`. Raises if both lists differ in length.
+std::vector<at::Tensor> grouped_matmul_kernel(
+    const std::vector<at::Tensor>& input,
+    const std::vector<at::Tensor>& other) {
+  // `other` is indexed with positions of `input` below, so both lists need to
+  // be of equal length to avoid out-of-bounds access.
+  TORCH_CHECK(input.size() == other.size(),
+              "Expected 'input' and 'other' to hold the same number of "
+              "tensors");
+
+  std::vector<at::Tensor> out(input.size());
+  for (size_t i = 0; i < input.size(); ++i)
+    out[i] = input[i].new_empty({input[i].size(0), other[i].size(-1)});
+
+  grouped_matmul_out_kernel(input, other, out);
+
+  return out;
+}
+
+// CUDA implementation of `pyg::segment_matmul`.
+//
+// Splits `input` along its first dimension into segments whose lengths are
+// the differences of consecutive `ptr` entries, multiplies the `i`-th segment
+// with `other[i]`, and returns the results as one tensor of shape
+// `[input.size(0), other.size(-1)]`.
+//
+// Raises if the number of segments does not equal `other.size(0)`.
+at::Tensor segment_matmul_kernel(const at::Tensor& input,
+                                 const at::Tensor& ptr,
+                                 const at::Tensor& other) {
+  // Each segment is paired with exactly one slice of `other`; a mismatch
+  // would otherwise lead to lists of different lengths further down.
+  TORCH_CHECK(ptr.numel() - 1 == other.size(0),
+              "Number of segments in 'ptr' must match the first dimension "
+              "of 'other'");
+
+  // Segment lengths: `ptr[1:] - ptr[:-1]`.
+  auto size = ptr.narrow(/*dim=*/0, /*start=*/1, /*length=*/ptr.numel() - 1) -
+              ptr.narrow(/*dim=*/0, /*start=*/0, /*length=*/ptr.numel() - 1);
+  size = size.cpu();  // `at::split` requires CPU-allocated data.
+  // TODO (matthias) Allow for other types than `int64_t`.
+  // NOTE: `sizes` is a non-owning view into `size`, which therefore has to
+  // stay alive until the splits below have been performed.
+  auto sizes = at::IntArrayRef(size.data_ptr<int64_t>(), size.numel());
+
+  const auto out = input.new_empty({input.size(0), other.size(-1)});
+
+  // The pieces of `out` passed below are written to in-place by the kernel.
+  grouped_matmul_out_kernel(
+      input.split_with_sizes(/*split_size=*/sizes, /*dim=*/0),
+      other.split(/*split_size=*/1, /*dim=*/0),
+      out.split_with_sizes(/*split_size=*/sizes, /*dim=*/0));
+
+  return out;
+}
+
+}  // namespace
+
+// Registers the kernels defined above as the CUDA implementations of the
+// `pyg::grouped_matmul` and `pyg::segment_matmul` operators.
+TORCH_LIBRARY_IMPL(pyg, CUDA, m) {
+  m.impl(TORCH_SELECTIVE_NAME("pyg::grouped_matmul"),
+         TORCH_FN(grouped_matmul_kernel));
+  m.impl(TORCH_SELECTIVE_NAME("pyg::segment_matmul"),
+         TORCH_FN(segment_matmul_kernel));
+}
+
+}  // namespace ops
+}  // namespace pyg
diff --git a/pyg_lib/csrc/ops/matmul.cpp b/pyg_lib/csrc/ops/matmul.cpp
@@ -0,0 +1,42 @@
+#include "matmul.h"
+
+#include <ATen/core/dispatch/Dispatcher.h>
+#include <torch/library.h>
+
+namespace pyg {
+namespace ops {
+
+// Performs matrix multiplication across list of elements.
+//
+// Looks up the `pyg::grouped_matmul` operator through the dispatcher (throwing
+// if no such schema exists) and calls it with the given arguments. The typed
+// operator handle is stored in a function-local static, so the lookup only
+// happens on the first call.
+std::vector<at::Tensor> grouped_matmul(const std::vector<at::Tensor>& input,
+                                       const std::vector<at::Tensor>& other) {
+  // TODO (matthias) Add TensorArg definitions.
+  // TODO (matthias) Add autograd support.
+  // TODO (matthias) Add dispatcher support.
+  static auto op = c10::Dispatcher::singleton()
+                       .findSchemaOrThrow("pyg::grouped_matmul", "")
+                       .typed<decltype(grouped_matmul)>();
+  return op.call(input, other);
+}
+
+// Performs matrix multiplication according to segments.
+//
+// Looks up the `pyg::segment_matmul` operator through the dispatcher (throwing
+// if no such schema exists) and calls it with the given arguments. The typed
+// operator handle is stored in a function-local static, so the lookup only
+// happens on the first call.
+at::Tensor segment_matmul(const at::Tensor& input,
+                          const at::Tensor& ptr,
+                          const at::Tensor& other) {
+  // TODO (matthias) Add TensorArg definitions.
+  // TODO (matthias) Add autograd support.
+  static auto op = c10::Dispatcher::singleton()
+                       .findSchemaOrThrow("pyg::segment_matmul", "")
+                       .typed<decltype(segment_matmul)>();
+  return op.call(input, ptr, other);
+}
+
+// Declares the schemas of the `pyg::grouped_matmul` and `pyg::segment_matmul`
+// operators. Only the signatures are defined here; no implementation is
+// attached in this block.
+TORCH_LIBRARY_FRAGMENT(pyg, m) {
+  m.def(TORCH_SELECTIVE_SCHEMA(
+      "pyg::grouped_matmul(Tensor[] input, Tensor[] other) -> Tensor[]"));
+  m.def(
+      TORCH_SELECTIVE_SCHEMA("pyg::segment_matmul(Tensor input, Tensor ptr, "
+                             "Tensor other) -> Tensor"));
+}
+
+}  // namespace ops
+}  // namespace pyg
diff --git a/pyg_lib/csrc/ops/matmul.h b/pyg_lib/csrc/ops/matmul.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include <ATen/ATen.h>
+#include "pyg_lib/csrc/macros.h"
+
+namespace pyg {
+namespace ops {
+
+// Performs matrix multiplication across list of elements, i.e. returns one
+// result per pair `(input[i], other[i])`.
+// TODO (matthias) Import `out` argument.
+PYG_API std::vector<at::Tensor> grouped_matmul(
+    const std::vector<at::Tensor>& input,
+    const std::vector<at::Tensor>& other);
+
+// Performs matrix multiplication according to segments: `ptr` holds the
+// boundaries of segments along the first dimension of `input`, and the rows
+// of the `i`-th segment are multiplied with `other[i]`.
+// TODO (matthias) Import `out` argument.
+PYG_API at::Tensor segment_matmul(const at::Tensor& input,
+                                  const at::Tensor& ptr,
+                                  const at::Tensor& other);
+
+}  // namespace ops
+}  // namespace pyg
diff --git a/pyg_lib/ops/__init__.py b/pyg_lib/ops/__init__.py
@@ -0,0 +1,68 @@
+from typing import List
+
+import torch
+from torch import Tensor
+
+
+def grouped_matmul(inputs: List[Tensor], others: List[Tensor]) -> List[Tensor]:
+    r"""Performs dense-dense matrix multiplication according to groups,
+    utilizing dedicated kernels that effectively parallelize over groups.
+
+    .. code-block:: python
+
+        inputs = [torch.randn(5, 16), torch.randn(3, 32)]
+        others = [torch.randn(16, 32), torch.randn(32, 64)]
+
+        outs = pyg_lib.ops.grouped_matmul(inputs, others)
+        assert len(outs) == 2
+        assert outs[0].size() == (5, 32)
+        assert outs[0] == inputs[0] @ others[0]
+        assert outs[1].size() == (3, 64)
+        assert outs[1] == inputs[1] @ others[1]
+
+    Args:
+        inputs (List[torch.Tensor]): List of left operand 2D matrices of shapes
+            :obj:`[N_i, K_i]`.
+        others (List[torch.Tensor]): List of right operand 2D matrices of
+            shapes :obj:`[K_i, M_i]`.
+
+    Returns:
+        List[torch.Tensor]: List of 2D output matrices of shapes
+            :obj:`[N_i, M_i]`.
+    """
+    # Forwards to the `pyg::grouped_matmul` operator exposed via `torch.ops`.
+    return torch.ops.pyg.grouped_matmul(inputs, others)
+
+
+def segment_matmul(inputs: Tensor, ptr: Tensor, other: Tensor) -> Tensor:
+    r"""Performs dense-dense matrix multiplication according to segments along
+    the first dimension of :obj:`inputs` as given by :obj:`ptr`, utilizing
+    dedicated kernels that effectively parallelize over groups.
+
+    .. code-block:: python
+
+        inputs = torch.randn(8, 16)
+        ptr = torch.tensor([0, 5, 8])
+        other = torch.randn(2, 16, 32)
+
+        out = pyg_lib.ops.segment_matmul(inputs, ptr, other)
+        assert out.size() == (8, 32)
+        assert out[0:5] == inputs[0:5] @ other[0]
+        assert out[5:8] == inputs[5:8] @ other[1]
+
+    Args:
+        inputs (torch.Tensor): The left operand 2D matrix of shape
+            :obj:`[N, K]`.
+        ptr (torch.Tensor): Compressed vector of shape :obj:`[B + 1]`, holding
+            the boundaries of segments.
+            For best performance, given as a CPU tensor.
+        other (torch.Tensor): The right operand 3D tensor of shape
+            :obj:`[B, K, M]`.
+
+    Returns:
+        torch.Tensor: The 2D output matrix of shape :obj:`[N, M]`.
+    """
+    # Forwards to the `pyg::segment_matmul` operator exposed via `torch.ops`.
+    return torch.ops.pyg.segment_matmul(inputs, ptr, other)
+
+
+# Names exported by `from pyg_lib.ops import *`.
+__all__ = [
+    'grouped_matmul',
+    'segment_matmul',
+]
diff --git a/test/csrc/ops/test_matmul.cpp b/test/csrc/ops/test_matmul.cpp
@@ -0,0 +1,43 @@
+#include <ATen/ATen.h>
+#include <gtest/gtest.h>
+
+#include "pyg_lib/csrc/ops/matmul.h"
+
+#ifdef WITH_CUDA
+// Checks output shapes and values of `grouped_matmul` for two differently
+// shaped matrix pairs on CUDA. Currently skipped (see TODO).
+TEST(GroupedMatmulTest, BasicAssertions) {
+  // TODO (matthias) skip for now due to missing dispatcher support.
+  // `GTEST_SKIP` is used instead of a bare `return` so that the test is
+  // reported as skipped rather than silently counted as passed.
+  GTEST_SKIP() << "Skipped due to missing dispatcher support";
+  auto options = at::TensorOptions().device(at::kCUDA);
+
+  std::vector<at::Tensor> input{at::randn({5, 8}, options),
+                                at::randn({3, 12}, options)};
+  std::vector<at::Tensor> other{at::randn({8, 16}, options),
+                                at::randn({12, 32}, options)};
+
+  auto out = pyg::ops::grouped_matmul(input, other);
+  EXPECT_EQ(out[0].size(0), 5);
+  EXPECT_EQ(out[0].size(1), 16);
+  EXPECT_EQ(out[1].size(0), 3);
+  EXPECT_EQ(out[1].size(1), 32);
+  // Loose relative tolerance of 1e-01 when comparing against `at::matmul`.
+  EXPECT_TRUE(at::allclose(out[0], at::matmul(input[0], other[0]), 1e-01));
+  EXPECT_TRUE(at::allclose(out[1], at::matmul(input[1], other[1]), 1e-01));
+}
+#endif
+
+#ifdef WITH_CUDA
+// Checks that `segment_matmul` multiplies each row segment of the input (as
+// delimited by the boundary vector) with its own matrix, on CUDA.
+TEST(SegmentMatmulTest, BasicAssertions) {
+  const auto opts = at::TensorOptions().device(at::kCUDA);
+
+  const auto lhs = at::randn({8, 12}, opts);
+  const auto boundaries = at::tensor({0, 5, 8}, opts.dtype(at::kLong));
+  const auto weights = at::randn({2, 12, 16}, opts);
+
+  const auto result = pyg::ops::segment_matmul(lhs, boundaries, weights);
+  EXPECT_EQ(result.size(0), 8);
+  EXPECT_EQ(result.size(1), 16);
+
+  // First segment: rows [0, 5) are multiplied with `weights[0]`.
+  const auto expected_first = at::matmul(lhs.narrow(0, 0, 5), weights[0]);
+  EXPECT_TRUE(at::allclose(result.narrow(0, 0, 5), expected_first, 1e-01));
+
+  // Second segment: rows [5, 8) are multiplied with `weights[1]`.
+  const auto expected_second = at::matmul(lhs.narrow(0, 5, 3), weights[1]);
+  EXPECT_TRUE(at::allclose(result.narrow(0, 5, 3), expected_second, 1e-01));
+}
+#endif
diff --git a/third_party/cutlass b/third_party/cutlass