64-bit indexing Adam (#1765)

eqy · web-flow · commit 87c4debde800 · 2024-01-05T17:03:05.000+09:00
* all i want for christmas is larger binaries and longer compile times

* actually compare

* woops
diff --git a/csrc/multi_tensor_adam.cu b/csrc/multi_tensor_adam.cu
@@ -20,11 +20,11 @@ typedef enum{
 
 using MATH_T = float;
 
-template<typename T, typename FULL_T>
+template<typename T, typename FULL_T, typename index_t>
 struct AdamFunctor
 {
    __device__ __forceinline__ void operator()(
-    int chunk_size,
+    index_t chunk_size,
     volatile int* noop_gmem,
     TensorListMetadata<4>& tl,
     const float beta1,
@@ -40,13 +40,13 @@ struct AdamFunctor
     // if(*noop_gmem == 1)
     //   return;
 
-    int tensor_loc = tl.block_to_tensor[blockIdx.x];
+    index_t tensor_loc = tl.block_to_tensor[blockIdx.x];
 
     // potentially use to pass in list of scalar
     // int tensor_num = tl.start_tensor_this_launch + tensor_loc;
 
-    int chunk_idx = tl.block_to_chunk[blockIdx.x];
-    int n = tl.sizes[tensor_loc];
+    index_t chunk_idx = tl.block_to_chunk[blockIdx.x];
+    index_t n = tl.sizes[tensor_loc];
 
     T* g = (T*)tl.addresses[0][tensor_loc];
     g += chunk_idx*chunk_size;
@@ -63,7 +63,7 @@ struct AdamFunctor
     n -= chunk_idx*chunk_size;
 
     // see note in multi_tensor_scale_kernel.cu
-    for(int i_start = 0;
+    for(index_t i_start = 0;
             i_start < n && i_start < chunk_size;
             i_start += blockDim.x*ILP)
     {
@@ -378,26 +378,61 @@ void multi_tensor_adam_cuda(
     bias_correction2 = 1 - std::pow(beta2, step);
   }
 
-  // Assume single type across p,g,m1,m2 now
-  DISPATCH_DOUBLE_FLOAT_HALF_AND_BFLOAT(
-    tensor_lists[0][0].scalar_type(), 0, "adam",
-    multi_tensor_apply<4>(
-      BLOCK_SIZE,
-      chunk_size,
-      noop_flag,
-      tensor_lists,
-      AdamFunctor<scalar_t_0, float>(),
-      beta1,
-      beta2,
-      bias_correction1,
-      bias_correction2,
-      epsilon,
-      lr,
-      (adamMode_t) mode,
-      weight_decay); )
+  size_t max_size = 0;
+  bool requires_64bit_indexing = false;
+  for (auto it = tensor_lists.begin(); it != tensor_lists.end(); it++) {
+    for (auto it2 = it->begin(); it2 != it->end(); it2++) {
+      if (it2->numel() > max_size) {
+        max_size = it2->numel();
+	if (max_size >= INT_MAX) {
+          requires_64bit_indexing = true;
+	  break;
+        }
+      }
+    }
+    if (requires_64bit_indexing) {
+      break;
+    }
+  }
 
+  if (requires_64bit_indexing) {
+    // Assume single type across p,g,m1,m2 now
+    DISPATCH_DOUBLE_FLOAT_HALF_AND_BFLOAT(
+      tensor_lists[0][0].scalar_type(), 0, "adam",
+      multi_tensor_apply<4>(
+        (int64_t) BLOCK_SIZE,
+        (int64_t) chunk_size,
+        noop_flag,
+        tensor_lists,
+        AdamFunctor<scalar_t_0, float, int64_t>(),
+        beta1,
+        beta2,
+        bias_correction1,
+        bias_correction2,
+        epsilon,
+        lr,
+        (adamMode_t) mode,
+        weight_decay); )
+  } else {
+      // Assume single type across p,g,m1,m2 now
+      DISPATCH_DOUBLE_FLOAT_HALF_AND_BFLOAT(
+        tensor_lists[0][0].scalar_type(), 0, "adam",
+        multi_tensor_apply<4>(
+          BLOCK_SIZE,
+          chunk_size,
+          noop_flag,
+          tensor_lists,
+          AdamFunctor<scalar_t_0, float, int32_t>(),
+          beta1,
+          beta2,
+          bias_correction1,
+          bias_correction2,
+          epsilon,
+          lr,
+          (adamMode_t) mode,
+          weight_decay); )
+  }
   AT_CUDA_CHECK(cudaGetLastError());
-
 }
 
 void multi_tensor_adam_capturable_cuda(
diff --git a/csrc/multi_tensor_apply.cuh b/csrc/multi_tensor_apply.cuh
@@ -28,7 +28,7 @@ template<int n> struct TensorListMetadata
 
 template<typename T, typename U, typename... ArgTypes>
 __global__ void multi_tensor_apply_kernel(
-    int chunk_size,
+    int64_t chunk_size,
     volatile int* noop_flag,
     T tl,
     U callable,
@@ -40,8 +40,8 @@ __global__ void multi_tensor_apply_kernel(
 
 template<int depth, typename T, typename... ArgTypes>
 void multi_tensor_apply(
-  int block_size,
-  int chunk_size,
+  int64_t block_size,
+  int64_t chunk_size,
   const at::Tensor& noop_flag,
   const std::vector<std::vector<at::Tensor>>& tensor_lists,
   T callable,
@@ -85,9 +85,9 @@ void multi_tensor_apply(
       tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
     loc_tensor_info++;
 
-    int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1)/chunk_size;
+    auto chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1)/chunk_size;
 
-    for(int chunk = 0; chunk < chunks_this_tensor; chunk++)
+    for(auto chunk = 0; chunk < chunks_this_tensor; chunk++)
     {
       // std::cout << chunks_this_tensor << std::endl;
       tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
diff --git a/tests/L0/run_optimizers/test_adam.py b/tests/L0/run_optimizers/test_adam.py
@@ -232,6 +232,20 @@ def testNative(self):
             
             self.model_.load_state_dict(copy.deepcopy(self.model.state_dict()))
 
+    def testLargeTensor(self):
+        t = torch.zeros(2359332864, dtype=torch.half, device='cuda')
+        t2 = torch.zeros(2359332864, dtype=torch.half, device='cuda')
+        grad = torch.randn_like(t)
+        t.grad = grad
+        t2.grad = grad
+        params = [t]
+        params2 = [t2]
+        optimizer = apex.optimizers.FusedAdam(params, lr=self.lr)
+        optimizer.step()
+        optimizer2 = torch.optim.Adam(params2, lr=self.lr)
+        torch.testing.assert_close(t, t2)
+        torch.cuda.synchronize()
+
 
 if __name__ == '__main__':
     unittest.main()