Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "third_party/cutlass"]
path = third_party/cutlass
url = https://github.com/NVIDIA/cutlass
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

## [Unreleased]
### Added
- Added `grouped_matmul` and `segment_matmul` CUDA implementations via `cutlass` ([#51](https://github.com/pyg-team/pyg-lib/pull/51))
- Added `pyg::sampler::neighbor_sample` interface ([#54](https://github.com/pyg-team/pyg-lib/pull/54))
- Added `pyg::sampler::Mapper` utility for mapping global to local node indices ([#45](https://github.com/pyg-team/pyg-lib/pull/45))
- Added benchmark script ([#45](https://github.com/pyg-team/pyg-lib/pull/45))
Expand Down
5 changes: 5 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@ if(WITH_CUDA)
enable_language(CUDA)
add_definitions(-DWITH_CUDA)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")

# Make the CUTLASS headers from the `third_party/cutlass` submodule visible to
# the CUDA sources (`#include <cutlass/...>`).
set(CUTLASS_DIR third_party/cutlass/include)
include_directories(${CUTLASS_DIR})
# Additional CUTLASS utility headers; presumably the location of
# `cutlass/util/host_tensor.h` -- TODO confirm against the submodule layout.
set(CUTLASS_UTIL_DIR third_party/cutlass/tools/util/include)
include_directories(${CUTLASS_UTIL_DIR})
endif()

set(CSRC pyg_lib/csrc)
Expand Down
158 changes: 158 additions & 0 deletions pyg_lib/csrc/ops/cuda/matmul_kernel.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <torch/library.h>

#include <cutlass/gemm/device/gemm_grouped.h>
#include <cutlass/gemm/kernel/default_gemm_grouped.h>
#include <cutlass/util/host_tensor.h>

namespace pyg {
namespace ops {

namespace {

// Computes `out[i] = input[i] @ other[i]` for all `i` through a single CUTLASS
// grouped GEMM launch on the current CUDA stream.
//
// Args:
//   input: Left operands, each a contiguous 2D `float` CUDA tensor of shape
//     `[M_i, K_i]`.
//   other: Right operands, each a contiguous `float` CUDA tensor whose last
//     two dimensions are `[K_i, N_i]` (only those two sizes are inspected).
//   out: Pre-allocated contiguous 2D `float` CUDA tensors of shape
//     `[M_i, N_i]` that receive the results.
//
// Fails via `TORCH_CHECK` if the lists differ in length, if a tensor is not a
// contiguous CUDA tensor, if shapes are inconsistent, or if CUTLASS reports an
// error. A dtype other than `float` is rejected by `data_ptr<float>()`.
void grouped_matmul_out_kernel(const std::vector<at::Tensor>& input,
                               const std::vector<at::Tensor>& other,
                               const std::vector<at::Tensor>& out) {
  const auto num_matrices = input.size();
  if (num_matrices == 0)
    return;  // Nothing to compute.

  // All three lists are indexed with the same `i` below, so a shorter list
  // would be read out of bounds.
  TORCH_CHECK(other.size() == num_matrices && out.size() == num_matrices,
              "Expected `input`, `other` and `out` to hold the same number of "
              "tensors");

  // TODO (matthias) Allow for other types than `float`.
  // TODO (matthias) Are these attributes correctly set?
  using GemmKernel = typename cutlass::gemm::kernel::DefaultGemmGrouped<
      float,                                         // Element A
      cutlass::layout::RowMajor,                     // Layout A
      cutlass::ComplexTransform::kNone,              //
      8,                                             // Granularity A
      float,                                         // Element B
      cutlass::layout::RowMajor,                     // Layout B
      cutlass::ComplexTransform::kNone,              //
      8,                                             // Granularity B
      float,                                         // Element C&D
      cutlass::layout::RowMajor,                     // Layout C&D
      float,                                         // Element Accumulator
      cutlass::arch::OpClassTensorOp,                // Operator Class Tag
      cutlass::arch::Sm80,                           // Architecture
      cutlass::gemm::GemmShape<256, 128, 32>,        // Threadblock-level Tile
      cutlass::gemm::GemmShape<64, 64, 32>,          // Warp-level Tile
      cutlass::gemm::GemmShape<16, 8, 8>,            // Instruction-level Tile
      cutlass::epilogue::thread::LinearCombination<  // Epilogue
          float, 8, float, float>,                   //
      cutlass::gemm::threadblock::                   // Swizzling Operator
      GemmIdentityThreadblockSwizzle<8>,             //
      2,                                             // Stages
      cutlass::arch::OpMultiplyAdd                   // Operation
      >::GemmKernel;

  // Host-side staging buffers: per-problem data pointers, problem sizes and
  // leading dimensions.
  std::vector<float*> ptr_A_host(num_matrices);
  std::vector<float*> ptr_B_host(num_matrices);
  std::vector<float*> ptr_C_host(num_matrices);
  std::vector<cutlass::gemm::GemmCoord> all_problems(num_matrices);
  std::vector<int64_t> ld_A_host(num_matrices);
  std::vector<int64_t> ld_B_host(num_matrices);
  std::vector<int64_t> ld_C_host(num_matrices);

  for (size_t i = 0; i < num_matrices; ++i) {
    const auto& a = input[i];
    const auto& b = other[i];
    const auto& c = out[i];

    // The kernel dereferences raw device pointers with packed row-major
    // leading dimensions, so reject anything that violates that layout.
    TORCH_CHECK(a.is_cuda() && b.is_cuda() && c.is_cuda(),
                "Expected all tensors to be CUDA tensors");
    TORCH_CHECK(a.is_contiguous() && b.is_contiguous() && c.is_contiguous(),
                "Expected all tensors to be contiguous");
    TORCH_CHECK(a.dim() == 2 && b.dim() >= 2 && c.dim() == 2,
                "Expected 2D `input`/`out` matrices and at least 2D `other` "
                "matrices");

    const auto m = a.size(0), k = a.size(1), n = c.size(1);
    TORCH_CHECK(a.size(-1) == b.size(-2), "Shape mismatch");
    TORCH_CHECK(c.size(0) == m && b.size(-1) == n, "Output shape mismatch");

    ptr_A_host[i] = a.data_ptr<float>();
    ptr_B_host[i] = b.data_ptr<float>();
    ptr_C_host[i] = c.data_ptr<float>();

    all_problems[i] = cutlass::gemm::GemmCoord(m, n, k);
    ld_A_host[i] = GemmKernel::LayoutA::packed({m, k}).stride(0);
    ld_B_host[i] = GemmKernel::LayoutB::packed({k, n}).stride(0);
    ld_C_host[i] = GemmKernel::LayoutC::packed({m, n}).stride(0);
  }

  // Mirror the staging buffers on the device.
  cutlass::DeviceAllocation<float*> ptr_A;
  ptr_A.reset(num_matrices);
  ptr_A.copy_from_host(ptr_A_host.data());

  cutlass::DeviceAllocation<float*> ptr_B;
  ptr_B.reset(num_matrices);
  ptr_B.copy_from_host(ptr_B_host.data());

  cutlass::DeviceAllocation<float*> ptr_C;
  ptr_C.reset(num_matrices);
  ptr_C.copy_from_host(ptr_C_host.data());

  cutlass::DeviceAllocation<cutlass::gemm::GemmCoord> all_problems_device;
  all_problems_device.reset(num_matrices);
  all_problems_device.copy_from_host(all_problems.data());

  cutlass::DeviceAllocation<int64_t> ld_A;
  ld_A.reset(num_matrices);
  ld_A.copy_from_host(ld_A_host.data());

  cutlass::DeviceAllocation<int64_t> ld_B;
  ld_B.reset(num_matrices);
  ld_B.copy_from_host(ld_B_host.data());

  cutlass::DeviceAllocation<int64_t> ld_C;
  ld_C.reset(num_matrices);
  ld_C.copy_from_host(ld_C_host.data());

  // Linear-combination epilogue constructed with (1.0, 0.0), i.e. the plain
  // product without accumulating into the previous contents of `out`.
  using EpilogueOutputOp = typename GemmKernel::Epilogue::OutputOp;
  typename EpilogueOutputOp::Params epilogue_op(1.0, 0.0);

  using GemmGrouped = cutlass::gemm::device::GemmGrouped<GemmKernel>;
  typename GemmGrouped::Arguments args(
      all_problems_device.get(), num_matrices, /*threadblock_count=*/1024,
      epilogue_op, ptr_A.get(), ptr_B.get(), ptr_C.get(), ptr_C.get(),
      ld_A.get(), ld_B.get(), ld_C.get(), ld_C.get());

  GemmGrouped gemm;
  auto status = gemm.initialize(args);
  TORCH_CHECK(status == cutlass::Status::kSuccess, "GroupedGEMM init failed");
  // Launch on PyTorch's current stream so the GEMM is ordered after the
  // kernels that produced the operands, instead of on the default stream.
  status = gemm.run(at::cuda::getCurrentCUDAStream().stream());
  TORCH_CHECK(status == cutlass::Status::kSuccess, "GroupedGEMM run failed");
}

// CUDA implementation of `pyg::grouped_matmul`: allocates one output matrix
// per `(input[i], other[i])` pair and fills all of them with
// `grouped_matmul_out_kernel`.
//
// Args:
//   input: Left operand matrices of shape `[N_i, K_i]`.
//   other: Right operand matrices of shape `[K_i, M_i]`; must contain as many
//     tensors as `input`.
//
// Returns:
//   One newly allocated tensor of shape `[N_i, M_i]` per pair.
std::vector<at::Tensor> grouped_matmul_kernel(
    const std::vector<at::Tensor>& input,
    const std::vector<at::Tensor>& other) {
  // Both lists are indexed in lock-step below; a shorter `other` would be
  // read out of bounds.
  TORCH_CHECK(input.size() == other.size(),
              "Expected `input` and `other` to hold the same number of "
              "tensors");

  const auto num_matrices = input.size();
  std::vector<at::Tensor> input_contig, other_contig, out;
  input_contig.reserve(num_matrices);
  other_contig.reserve(num_matrices);
  out.reserve(num_matrices);

  for (size_t i = 0; i < num_matrices; ++i) {
    // The kernel reads raw data pointers assuming a packed row-major layout.
    // `contiguous()` returns the tensor itself when it already is contiguous.
    input_contig.push_back(input[i].contiguous());
    other_contig.push_back(other[i].contiguous());
    out.push_back(input[i].new_empty({input[i].size(0), other[i].size(-1)}));
  }

  grouped_matmul_out_kernel(input_contig, other_contig, out);

  return out;
}

// CUDA implementation of `pyg::segment_matmul`: multiplies every row segment
// of `input` with its own matrix from `other`, i.e.
// `out[ptr[b]:ptr[b + 1]] = input[ptr[b]:ptr[b + 1]] @ other[b]`.
//
// Args:
//   input: 2D left operand of shape `[N, K]`.
//   ptr: 1D integral tensor of shape `[B + 1]` holding segment boundaries.
//   other: 3D right operand of shape `[B, K, M]`.
//
// Returns:
//   A newly allocated tensor of shape `[N, M]`.
at::Tensor segment_matmul_kernel(const at::Tensor& input,
                                 const at::Tensor& ptr,
                                 const at::Tensor& other) {
  TORCH_CHECK(input.dim() == 2, "Expected `input` to be a 2D tensor");
  TORCH_CHECK(other.dim() == 3, "Expected `other` to be a 3D tensor");
  TORCH_CHECK(ptr.dim() == 1 && ptr.numel() >= 1,
              "Expected `ptr` to be a non-empty 1D tensor");
  TORCH_CHECK(c10::isIntegralType(ptr.scalar_type(), /*includeBool=*/false),
              "Expected `ptr` to hold integral values");

  const auto num_segments = ptr.numel() - 1;
  // One matrix of `other` per segment; otherwise the per-segment lists handed
  // to the grouped kernel would differ in length.
  TORCH_CHECK(other.size(0) == num_segments,
              "Expected `other` to hold one matrix per segment of `ptr`");

  // Segment lengths: ptr[1:] - ptr[:-1].
  auto size = ptr.narrow(/*dim=*/0, /*start=*/1, /*length=*/num_segments) -
              ptr.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_segments);
  // `at::split` requires CPU-allocated data; the sizes are read as `int64_t`.
  size = size.cpu().to(at::kLong);
  auto sizes = at::IntArrayRef(size.data_ptr<int64_t>(), size.numel());

  const auto out = input.new_empty({input.size(0), other.size(-1)});

  // The grouped kernel reads raw data pointers assuming a packed row-major
  // layout, so split contiguous versions of the operands.
  const auto input_segments =
      input.contiguous().split_with_sizes(/*split_size=*/sizes, /*dim=*/0);
  if (num_segments == 0)
    return out;  // No segments, nothing to multiply.

  grouped_matmul_out_kernel(
      input_segments,
      other.contiguous().split(/*split_size=*/1, /*dim=*/0),
      out.split_with_sizes(/*split_size=*/sizes, /*dim=*/0));

  return out;
}

} // namespace

// Registers the kernels defined in this file as the CUDA implementations of
// the `pyg::grouped_matmul` and `pyg::segment_matmul` operators.
TORCH_LIBRARY_IMPL(pyg, CUDA, m) {
m.impl(TORCH_SELECTIVE_NAME("pyg::grouped_matmul"),
TORCH_FN(grouped_matmul_kernel));
m.impl(TORCH_SELECTIVE_NAME("pyg::segment_matmul"),
TORCH_FN(segment_matmul_kernel));
}

} // namespace ops
} // namespace pyg
42 changes: 42 additions & 0 deletions pyg_lib/csrc/ops/matmul.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#include "matmul.h"

#include <ATen/core/dispatch/Dispatcher.h>
#include <torch/library.h>

namespace pyg {
namespace ops {

// Multiplies `input[i]` with `other[i]` for every list entry by forwarding the
// call to whichever implementation is registered for `pyg::grouped_matmul`.
std::vector<at::Tensor> grouped_matmul(const std::vector<at::Tensor>& input,
                                       const std::vector<at::Tensor>& other) {
  // TODO (matthias) Add TensorArg definitions.
  // TODO (matthias) Add autograd support.
  // TODO (matthias) Add dispatcher support.

  // Function-local statics: the schema lookup runs on the first call only.
  static auto schema = c10::Dispatcher::singleton().findSchemaOrThrow(
      "pyg::grouped_matmul", "");
  static auto typed_op = schema.typed<decltype(grouped_matmul)>();

  return typed_op.call(input, other);
}

// Multiplies segments of `input` (delimited by `ptr`) with the matching
// entries of `other` by forwarding the call to whichever implementation is
// registered for `pyg::segment_matmul`.
at::Tensor segment_matmul(const at::Tensor& input,
                          const at::Tensor& ptr,
                          const at::Tensor& other) {
  // TODO (matthias) Add TensorArg definitions.
  // TODO (matthias) Add autograd support.

  // Function-local statics: the schema lookup runs on the first call only.
  static auto schema = c10::Dispatcher::singleton().findSchemaOrThrow(
      "pyg::segment_matmul", "");
  static auto typed_op = schema.typed<decltype(segment_matmul)>();

  return typed_op.call(input, ptr, other);
}

// Declares the operator schemas in the `pyg` library namespace. Backend
// implementations are attached elsewhere via `TORCH_LIBRARY_IMPL`; the wrapper
// functions above look these schemas up by name.
TORCH_LIBRARY_FRAGMENT(pyg, m) {
m.def(TORCH_SELECTIVE_SCHEMA(
"pyg::grouped_matmul(Tensor[] input, Tensor[] other) -> Tensor[]"));
m.def(
TORCH_SELECTIVE_SCHEMA("pyg::segment_matmul(Tensor input, Tensor ptr, "
"Tensor other) -> Tensor"));
}

} // namespace ops
} // namespace pyg
21 changes: 21 additions & 0 deletions pyg_lib/csrc/ops/matmul.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#pragma once

#include <ATen/ATen.h>
#include "pyg_lib/csrc/macros.h"

namespace pyg {
namespace ops {

// Performs matrix multiplication across list of elements.
// `input[i]` is multiplied with `other[i]`; the result holds one output tensor
// per pair.
// TODO (matthias) Import `out` argument.
PYG_API std::vector<at::Tensor> grouped_matmul(
const std::vector<at::Tensor>& input,
const std::vector<at::Tensor>& other);

// Performs matrix multiplication according to segments.
// `ptr` holds the boundaries of the segments of `input`; each segment is
// multiplied with its own entry of `other`.
// TODO (matthias) Import `out` argument.
PYG_API at::Tensor segment_matmul(const at::Tensor& input,
const at::Tensor& ptr,
const at::Tensor& other);

} // namespace ops
} // namespace pyg
68 changes: 68 additions & 0 deletions pyg_lib/ops/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
from typing import List

import torch
from torch import Tensor


def grouped_matmul(inputs: List[Tensor], others: List[Tensor]) -> List[Tensor]:
    r"""Performs dense-dense matrix multiplication according to groups,
    utilizing dedicated kernels that effectively parallelize over groups.

    .. code-block:: python

        inputs = [torch.randn(5, 16), torch.randn(3, 32)]
        others = [torch.randn(16, 32), torch.randn(32, 64)]

        outs = pyg_lib.ops.grouped_matmul(inputs, others)
        assert len(outs) == 2
        assert outs[0].size() == (5, 32)
        assert outs[1].size() == (3, 64)
        # Results match a regular matmul up to numerical tolerance:
        assert torch.allclose(outs[0], inputs[0] @ others[0], rtol=1e-1)
        assert torch.allclose(outs[1], inputs[1] @ others[1], rtol=1e-1)

    Args:
        inputs (List[torch.Tensor]): List of left operand 2D matrices of shapes
            :obj:`[N_i, K_i]`.
        others (List[torch.Tensor]): List of right operand 2D matrices of
            shapes :obj:`[K_i, M_i]`.

    Returns:
        List[torch.Tensor]: List of 2D output matrices of shapes
        :obj:`[N_i, M_i]`.
    """
    # Thin wrapper around the `pyg::grouped_matmul` operator.
    return torch.ops.pyg.grouped_matmul(inputs, others)


def segment_matmul(inputs: Tensor, ptr: Tensor, other: Tensor) -> Tensor:
    r"""Performs dense-dense matrix multiplication according to segments along
    the first dimension of :obj:`inputs` as given by :obj:`ptr`, utilizing
    dedicated kernels that effectively parallelize over groups.

    .. code-block:: python

        inputs = torch.randn(8, 16)
        ptr = torch.tensor([0, 5, 8])
        other = torch.randn(2, 16, 32)

        out = pyg_lib.ops.segment_matmul(inputs, ptr, other)
        assert out.size() == (8, 32)
        # Results match a regular matmul up to numerical tolerance:
        assert torch.allclose(out[0:5], inputs[0:5] @ other[0], rtol=1e-1)
        assert torch.allclose(out[5:8], inputs[5:8] @ other[1], rtol=1e-1)

    Args:
        inputs (torch.Tensor): The left operand 2D matrix of shape
            :obj:`[N, K]`.
        ptr (torch.Tensor): Compressed vector of shape :obj:`[B + 1]`, holding
            the boundaries of segments.
            For best performance, given as a CPU tensor.
        other (torch.Tensor): The right operand 3D tensor of shape
            :obj:`[B, K, M]`.

    Returns:
        torch.Tensor: The 2D output matrix of shape :obj:`[N, M]`.
    """
    # Thin wrapper around the `pyg::segment_matmul` operator.
    return torch.ops.pyg.segment_matmul(inputs, ptr, other)


__all__ = [
'grouped_matmul',
'segment_matmul',
]
43 changes: 43 additions & 0 deletions test/csrc/ops/test_matmul.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#include <ATen/ATen.h>
#include <gtest/gtest.h>

#include "pyg_lib/csrc/ops/matmul.h"

#ifdef WITH_CUDA
// Checks `grouped_matmul` on two differently shaped matrix pairs against
// `at::matmul`. Currently reported as skipped rather than silently passing.
TEST(GroupedMatmulTest, BasicAssertions) {
  // TODO (matthias) skip for now due to missing dispatcher support.
  GTEST_SKIP() << "Skipped due to missing dispatcher support.";

  auto options = at::TensorOptions().device(at::kCUDA);

  std::vector<at::Tensor> input{at::randn({5, 8}, options),
                                at::randn({3, 12}, options)};
  std::vector<at::Tensor> other{at::randn({8, 16}, options),
                                at::randn({12, 32}, options)};

  auto out = pyg::ops::grouped_matmul(input, other);
  EXPECT_EQ(out[0].size(0), 5);
  EXPECT_EQ(out[0].size(1), 16);
  EXPECT_EQ(out[1].size(0), 3);
  EXPECT_EQ(out[1].size(1), 32);
  // Compared with a relative tolerance of 1e-01.
  EXPECT_TRUE(at::allclose(out[0], at::matmul(input[0], other[0]), 1e-01));
  EXPECT_TRUE(at::allclose(out[1], at::matmul(input[1], other[1]), 1e-01));
}
#endif

#ifdef WITH_CUDA
// Checks `segment_matmul` on two segments (rows 0-4 and rows 5-7) against the
// per-segment `at::matmul` results.
TEST(SegmentMatmulTest, BasicAssertions) {
  const auto cuda_opts = at::TensorOptions().device(at::kCUDA);

  const auto input = at::randn({8, 12}, cuda_opts);
  const auto ptr = at::tensor({0, 5, 8}, cuda_opts.dtype(at::kLong));
  const auto other = at::randn({2, 12, 16}, cuda_opts);

  const auto out = pyg::ops::segment_matmul(input, ptr, other);
  EXPECT_EQ(out.size(0), 8);
  EXPECT_EQ(out.size(1), 16);

  // Reference results, one per segment.
  const auto expected_first = at::matmul(input.narrow(0, 0, 5), other[0]);
  const auto expected_second = at::matmul(input.narrow(0, 5, 3), other[1]);

  // Compared with a relative tolerance of 1e-01.
  EXPECT_TRUE(at::allclose(out.narrow(0, 0, 5), expected_first, 1e-01));
  EXPECT_TRUE(at::allclose(out.narrow(0, 5, 3), expected_second, 1e-01));
}
#endif
1 change: 1 addition & 0 deletions third_party/cutlass
Submodule cutlass added at 858c73