From 01a91970c2e9ba9484d034d46e6cf5a54b789c0c Mon Sep 17 00:00:00 2001
From: rusty1s
Date: Tue, 28 Jan 2025 11:49:23 +0000
Subject: [PATCH 1/6] update

---
 pyg_lib/csrc/classes/cuda/hash_map_impl.cu | 69 ++++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 pyg_lib/csrc/classes/cuda/hash_map_impl.cu

diff --git a/pyg_lib/csrc/classes/cuda/hash_map_impl.cu b/pyg_lib/csrc/classes/cuda/hash_map_impl.cu
new file mode 100644
index 000000000..101cdaf05
--- /dev/null
+++ b/pyg_lib/csrc/classes/cuda/hash_map_impl.cu
@@ -0,0 +1,69 @@
+#include
+#include
+
+#include "../hash_map_impl.h"
+
+namespace pyg {
+namespace classes {
+
+namespace {
+
+template <typename KeyType>
+struct CUDAHashMapImpl : HashMapImpl {
+ public:
+  using ValueType = int64_t;
+
+  CUDAHashMapImpl(const at::Tensor& key) {
+    KeyType constexpr empty_key_sentinel = -1;  // TODO
+    ValueType constexpr empty_value_sentinel = -1;
+
+    map_ = std::make_unique<cuco::static_map<KeyType, ValueType>>(
+        2 * key.numel(),  // loader_factor = 0.5
+        cuco::empty_key{empty_key_sentinel},
+        cuco::empty_value{empty_value_sentinel});
+
+    const auto options =
+        at::TensorOptions().device(key.device()).dtype(at::kLong);
+    const auto value = at::arange(key.numel(), options);
+    const auto key_data = key.data_ptr<KeyType>();
+    const auto value_data = value.data_ptr<ValueType>();
+
+    map_->insert(key_data, value_data, key.numel());
+  }
+
+  at::Tensor get(const at::Tensor& query) override {
+    const auto options =
+        at::TensorOptions().device(query.device()).dtype(at::kLong);
+    const auto out = at::empty({query.numel()}, options);
+    const auto query_data = query.data_ptr<KeyType>();
+    auto out_data = out.data_ptr();
+
+    map_->find(query_data, out_data, query.numel());
+
+    return out;
+  }
+
+ private:
+  std::unique_ptr<cuco::static_map<KeyType, ValueType>> map_;
+};
+
+// template struct CUDAHashMapImpl;
+// template struct CUDAHashMapImpl;
+// template struct CUDAHashMapImpl;
+// template struct CUDAHashMapImpl;
+// template struct CUDAHashMapImpl;
+// template struct CUDAHashMapImpl;
+// template struct CUDAHashMapImpl;
+// template struct CUDAHashMapImpl;
+
+struct CUDAHashMap : torch::CustomClassHolder {
+ public:
+  CUDAHashMap(const at::Tensor& key) {}
+
+  at::Tensor get(const at::Tensor& query) { return query; }
+};
+
+}  // namespace
+
+}  // namespace classes
+}  // namespace pyg

From a3f446c61b0b35993d1ff5e5385ac9760f95176b Mon Sep 17 00:00:00 2001
From: rusty1s
Date: Wed, 29 Jan 2025 14:48:17 +0000
Subject: [PATCH 2/6] update

---
 pyg_lib/csrc/classes/cuda/hash_map_impl.cu | 104 ++++++++++++++++-----
 test/csrc/classes/test_hash_map.cpp        |   1 +
 2 files changed, 84 insertions(+), 21 deletions(-)

diff --git a/pyg_lib/csrc/classes/cuda/hash_map_impl.cu b/pyg_lib/csrc/classes/cuda/hash_map_impl.cu
index 101cdaf05..55232db6a 100644
--- a/pyg_lib/csrc/classes/cuda/hash_map_impl.cu
+++ b/pyg_lib/csrc/classes/cuda/hash_map_impl.cu
@@ -1,69 +1,131 @@
 #include
+#include
 #include
-
-#include "../hash_map_impl.h"
 
 namespace pyg {
 namespace classes {
 
 namespace {
 
+#define DISPATCH_CASE_KEY(...)                          \
+  AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__)  \
+  AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__)    \
+  AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__)
+
+#define DISPATCH_KEY(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(TYPE, NAME, DISPATCH_CASE_KEY(__VA_ARGS__))
+
+struct HashMapImpl {
+  virtual ~HashMapImpl() = default;
+  virtual at::Tensor get(const at::Tensor& query) = 0;
+  virtual at::Tensor keys() = 0;
+};
+
 template <typename KeyType>
 struct CUDAHashMapImpl : HashMapImpl {
  public:
   using ValueType = int64_t;
 
   CUDAHashMapImpl(const at::Tensor& key) {
-    KeyType constexpr empty_key_sentinel = -1;  // TODO
+    KeyType constexpr empty_key_sentinel = std::numeric_limits<KeyType>::min();
     ValueType constexpr empty_value_sentinel = -1;
 
     map_ = std::make_unique<cuco::static_map<KeyType, ValueType>>(
-        2 * key.numel(),  // loader_factor = 0.5
+        2 * key.numel(),  // load_factor = 0.5
         cuco::empty_key{empty_key_sentinel},
         cuco::empty_value{empty_value_sentinel});
 
+    const auto key_data = key.data_ptr<KeyType>();
     const auto options =
-        at::TensorOptions().device(key.device()).dtype(at::kLong);
+        key.options().dtype(c10::CppTypeToScalarType<ValueType>::value);
     const auto value = at::arange(key.numel(), options);
-    const auto key_data = key.data_ptr<KeyType>();
     const auto value_data = value.data_ptr<ValueType>();
+    const auto zipped =
+        thrust::make_zip_iterator(thrust::make_tuple(key_data, value_data));
 
-    map_->insert(key_data, value_data, key.numel());
+    map_->insert(zipped, zipped + key.numel());
   }
 
   at::Tensor get(const at::Tensor& query) override {
     const auto options =
-        at::TensorOptions().device(query.device()).dtype(at::kLong);
+        query.options().dtype(c10::CppTypeToScalarType<ValueType>::value);
     const auto out = at::empty({query.numel()}, options);
     const auto query_data = query.data_ptr<KeyType>();
-    auto out_data = out.data_ptr();
+    auto out_data = out.data_ptr<ValueType>();
 
-    map_->find(query_data, out_data, query.numel());
+    map_->find(query_data, query_data + query.numel(), out_data);
 
     return out;
   }
 
+  at::Tensor keys() override {
+    // TODO This will not work in multi-GPU scenarios.
+    const auto options = at::TensorOptions().device(at::DeviceType::CUDA);
+    const auto size = static_cast<int64_t>(map_->size());
+    const auto key = at::empty(
+        {size}, options.dtype(c10::CppTypeToScalarType<KeyType>::value));
+    const auto value = at::empty(
+        {size}, options.dtype(c10::CppTypeToScalarType<ValueType>::value));
+    auto key_data = key.data_ptr<KeyType>();
+    auto value_data = value.data_ptr<ValueType>();
+
+    map_->retrieve_all(key_data, value_data);
+
+    return key.index_select(0, value.argsort());
+  }
+
  private:
   std::unique_ptr<cuco::static_map<KeyType, ValueType>> map_;
 };
 
-// template struct CUDAHashMapImpl;
-// template struct CUDAHashMapImpl;
-// template struct CUDAHashMapImpl;
-// template struct CUDAHashMapImpl;
-// template struct CUDAHashMapImpl;
-// template struct CUDAHashMapImpl;
-// template struct CUDAHashMapImpl;
-// template struct CUDAHashMapImpl;
-
 struct CUDAHashMap : torch::CustomClassHolder {
  public:
-  CUDAHashMap(const at::Tensor& key) {}
+  CUDAHashMap(const at::Tensor& key) {
+    at::TensorArg key_arg{key, "key", 0};
+    at::CheckedFrom c{"CUDAHashMap.init"};
+    at::checkDeviceType(c, key, at::DeviceType::CUDA);
+    at::checkDim(c, key_arg, 1);
+    at::checkContiguous(c, key_arg);
+
+    DISPATCH_KEY(key.scalar_type(), "cuda_hash_map_init", [&] {
+      map_ = std::make_unique<CUDAHashMapImpl<scalar_t>>(key);
+    });
+  }
 
-  at::Tensor get(const at::Tensor& query) { return query; }
+  at::Tensor get(const at::Tensor& query) {
+    at::TensorArg query_arg{query, "query", 0};
+    at::CheckedFrom c{"CUDAHashMap.get"};
+    at::checkDeviceType(c, query, at::DeviceType::CUDA);
+    at::checkDim(c, query_arg, 1);
+    at::checkContiguous(c, query_arg);
+
+    return map_->get(query);
+  }
+
+  at::Tensor keys() { return map_->keys(); }
+
+ private:
+  std::unique_ptr<HashMapImpl> map_;
 };
 
 }  // namespace
 
+TORCH_LIBRARY_FRAGMENT(pyg, m) {
+  m.class_<CUDAHashMap>("CUDAHashMap")
+      .def(torch::init<at::Tensor&>())
+      .def("get", &CUDAHashMap::get)
+      .def("keys", &CUDAHashMap::keys)
+      .def_pickle(
+          // __getstate__
+          [](const c10::intrusive_ptr<CUDAHashMap>& self) -> at::Tensor {
+            return self->keys();
+          },
+          // __setstate__
+          [](const at::Tensor& state) -> c10::intrusive_ptr<CUDAHashMap> {
+            return c10::make_intrusive<CUDAHashMap>(state);
+          });
+}
+
 }  // namespace classes
 }  // namespace pyg
diff --git a/test/csrc/classes/test_hash_map.cpp b/test/csrc/classes/test_hash_map.cpp
index 813fbe46a..a0109332d 100644
--- a/test/csrc/classes/test_hash_map.cpp
+++ b/test/csrc/classes/test_hash_map.cpp
@@ -1,5 +1,6 @@
 #include
 #include
+#include
 
 #include "pyg_lib/csrc/classes/hash_map.h"
 

From ba3fc7d2e1ab98560a29c5ed9045d3d20830391c Mon Sep 17 00:00:00 2001
From: rusty1s
Date: Wed, 29 Jan 2025 16:16:16 +0000
Subject: [PATCH 3/6] update

---
 test/csrc/classes/test_hash_map.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/csrc/classes/test_hash_map.cpp b/test/csrc/classes/test_hash_map.cpp
index a0109332d..813fbe46a 100644
--- a/test/csrc/classes/test_hash_map.cpp
+++ b/test/csrc/classes/test_hash_map.cpp
@@ -1,6 +1,5 @@
 #include
 #include
-#include
 
 #include "pyg_lib/csrc/classes/hash_map.h"
 

From 2fc606c1cf4d35cdd25dbfa6ce4a9a3dd26c85d2 Mon Sep 17 00:00:00 2001
From: rusty1s
Date: Sat, 1 Feb 2025 20:09:41 +0000
Subject: [PATCH 4/6] update

---
 pyg_lib/csrc/classes/cuda/hash_map_impl.cu | 24 ++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/pyg_lib/csrc/classes/cuda/hash_map_impl.cu b/pyg_lib/csrc/classes/cuda/hash_map_impl.cu
index 55232db6a..a8a2faa64 100644
--- a/pyg_lib/csrc/classes/cuda/hash_map_impl.cu
+++ b/pyg_lib/csrc/classes/cuda/hash_map_impl.cu
@@ -52,7 +52,7 @@ struct CUDAHashMapImpl : HashMapImpl {
         query.options().dtype(c10::CppTypeToScalarType<ValueType>::value);
     const auto out = at::empty({query.numel()}, options);
     const auto query_data = query.data_ptr<KeyType>();
-    auto out_data = out.data_ptr<ValueType>();
+    const auto out_data = out.data_ptr<ValueType>();
 
     map_->find(query_data, query_data + query.numel(), out_data);
 
     return out;
   }
@@ -61,14 +61,22 @@ struct CUDAHashMapImpl : HashMapImpl {
   at::Tensor keys() override {
     // TODO This will not work in multi-GPU scenarios.
-    const auto options = at::TensorOptions().device(at::DeviceType::CUDA);
+    const auto options = at::TensorOptions()
+                             .device(at::DeviceType::CUDA)
+                             .dtype(c10::CppTypeToScalarType<ValueType>::value);
     const auto size = static_cast<int64_t>(map_->size());
-    const auto key = at::empty(
-        {size}, options.dtype(c10::CppTypeToScalarType<KeyType>::value));
-    const auto value = at::empty(
-        {size}, options.dtype(c10::CppTypeToScalarType<ValueType>::value));
-    auto key_data = key.data_ptr<KeyType>();
-    auto value_data = value.data_ptr<ValueType>();
+
+    at::Tensor key;
+    if (std::is_same<KeyType, int16_t>::value) {
+      key = at::empty({size}, options.dtype(at::kShort));
+    } else if (std::is_same<KeyType, int32_t>::value) {
+      key = at::empty({size}, options.dtype(at::kInt));
+    } else {
+      key = at::empty({size}, options);
+    }
+    const auto value = at::empty({size}, options);
+    const auto key_data = key.data_ptr<KeyType>();
+    const auto value_data = value.data_ptr<ValueType>();
 
     map_->retrieve_all(key_data, value_data);
 
     return key.index_select(0, value.argsort());

From e658af1b9cdf233c0b7d685268706d5004ff690d Mon Sep 17 00:00:00 2001
From: rusty1s
Date: Sat, 1 Feb 2025 20:37:22 +0000
Subject: [PATCH 5/6] update

---
 CMakeLists.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ea26fefc3..5387aec43 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.15)
+cmake_minimum_required(VERSION 3.18)
 project(pyg)
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
@@ -43,6 +43,7 @@ if(WITH_CUDA)
   enable_language(CUDA)
   add_definitions(-DWITH_CUDA)
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr -allow-unsupported-compiler")
+  set(CMAKE_CUDA_ARCHITECTURES "60;70;75;80;86;90")
 
   if (NOT "$ENV{EXTERNAL_CUTLASS_INCLUDE_DIR}" STREQUAL "")
     include_directories($ENV{EXTERNAL_CUTLASS_INCLUDE_DIR})

From 08039b9ad852d8fa761cf3bbf4d9d9c0342b68ec Mon Sep 17 00:00:00 2001
From: rusty1s
Date: Sat, 1 Feb 2025 20:39:19 +0000
Subject: [PATCH 6/6] update

---
 .github/workflows/cuda/Linux-env.sh | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/cuda/Linux-env.sh b/.github/workflows/cuda/Linux-env.sh
index f519b347e..d048e6065 100755
--- a/.github/workflows/cuda/Linux-env.sh
+++ b/.github/workflows/cuda/Linux-env.sh
@@ -4,42 +4,42 @@ case ${1} in
   cu124)
     export FORCE_CUDA=1
     export PATH=/usr/local/cuda-12.4/bin:${PATH}
-    export TORCH_CUDA_ARCH_LIST="5.0+PTX;6.0;7.0;7.5;8.0;8.6;9.0"
+    export TORCH_CUDA_ARCH_LIST="6.0+PTX;7.0;7.5;8.0;8.6;9.0"
     ;;
   cu121)
     export FORCE_CUDA=1
     export PATH=/usr/local/cuda-12.1/bin:${PATH}
-    export TORCH_CUDA_ARCH_LIST="5.0+PTX;6.0;7.0;7.5;8.0;8.6;9.0"
+    export TORCH_CUDA_ARCH_LIST="6.0+PTX;7.0;7.5;8.0;8.6;9.0"
     ;;
   cu118)
     export FORCE_CUDA=1
     export PATH=/usr/local/cuda-11.8/bin:${PATH}
-    export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6;9.0"
+    export TORCH_CUDA_ARCH_LIST="6.0+PTX;7.0;7.5;8.0;8.6;9.0"
    ;;
   cu117)
     export FORCE_CUDA=1
     export PATH=/usr/local/cuda-11.7/bin:${PATH}
-    export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6"
+    export TORCH_CUDA_ARCH_LIST="6.0+PTX;7.0;7.5;8.0;8.6"
     ;;
   cu116)
     export FORCE_CUDA=1
     export PATH=/usr/local/cuda-11.6/bin:${PATH}
-    export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6"
+    export TORCH_CUDA_ARCH_LIST="6.0+PTX;7.0;7.5;8.0;8.6"
     ;;
   cu115)
     export FORCE_CUDA=1
     export PATH=/usr/local/cuda-11.5/bin:${PATH}
-    export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6"
+    export TORCH_CUDA_ARCH_LIST="6.0+PTX;7.0;7.5;8.0;8.6"
     ;;
   cu113)
     export FORCE_CUDA=1
     export PATH=/usr/local/cuda-11.3/bin:${PATH}
-    export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6"
+    export TORCH_CUDA_ARCH_LIST="6.0+PTX;7.0;7.5;8.0;8.6"
     ;;
   cu102)
     export FORCE_CUDA=1
     export PATH=/usr/local/cuda-10.2/bin:${PATH}
-    export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5"
+    export TORCH_CUDA_ARCH_LIST="6.0+PTX;7.0;7.5"
     ;;
   *)
     ;;
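
For readers who have not used cuco before, the standalone sketch below illustrates the build-and-query pattern that CUDAHashMapImpl wraps in the patches above: construct a cuco::static_map with sentinel key/value, bulk-insert zipped (key, value) pairs, and bulk-find a batch of queries. It is illustrative only and is not part of the patch series; the include paths, the host-side staging through thrust::device_vector, and the main() scaffolding are assumptions made to keep the example self-contained.

    // Minimal sketch of the cuco::static_map usage mirrored by CUDAHashMapImpl.
    // Illustrative only -- not part of the PR; header paths are assumed.
    #include <cuco/static_map.cuh>
    #include <thrust/device_vector.h>
    #include <thrust/iterator/zip_iterator.h>
    #include <thrust/sequence.h>
    #include <thrust/tuple.h>
    #include <cstdint>
    #include <limits>
    #include <vector>

    int main() {
      using Key = int64_t;
      using Value = int64_t;

      const std::vector<Key> h_keys{10, 20, 30, 40};
      thrust::device_vector<Key> keys(h_keys.begin(), h_keys.end());
      thrust::device_vector<Value> values(keys.size());
      thrust::sequence(values.begin(), values.end());  // 0, 1, 2, ... like at::arange

      // A capacity of 2x the key count corresponds to the load factor of 0.5
      // chosen via `2 * key.numel()` in CUDAHashMapImpl.
      cuco::static_map<Key, Value> map{
          2 * keys.size(),
          cuco::empty_key<Key>{std::numeric_limits<Key>::min()},
          cuco::empty_value<Value>{-1}};

      // Bulk insert of (key, value) pairs through a zip iterator, as in the patch.
      const auto zipped = thrust::make_zip_iterator(
          thrust::make_tuple(keys.begin(), values.begin()));
      map.insert(zipped, zipped + keys.size());

      // Bulk lookup; keys that were never inserted map to the empty-value sentinel (-1).
      const std::vector<Key> h_queries{30, 10, 99};
      thrust::device_vector<Key> queries(h_queries.begin(), h_queries.end());
      thrust::device_vector<Value> out(queries.size());
      map.find(queries.begin(), queries.end(), out.begin());
      return 0;
    }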