Add multi_tensor_unscale_l2norm_cuda (#1727)

minitu · web-flow · commit 741bdf50825a · 2023-09-19T10:24:35.000-07:00
* Add multi_tensor_scale_l2norm

* Rename

* Add unit test

* Fix unit test

---------

Co-authored-by: Jaemin Choi <jaeminc@nvidia.com>
diff --git a/csrc/amp_C_frontend.cpp b/csrc/amp_C_frontend.cpp
@@ -46,6 +46,13 @@ std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_scale_cuda(
   float scale,
   at::optional<bool> per_tensor_python);
 
+std::tuple<at::Tensor, at::Tensor> multi_tensor_unscale_l2norm_cuda(  // L2 norm of inputs multiplied by inv_scale; inputs are not modified
+  int chunk_size,  // number of elements in a full chunk
+  at::Tensor noop_flag,  // flag tensor forwarded to the kernel
+  std::vector<std::vector<at::Tensor>> tensor_lists,  // input tensors are taken from tensor_lists[0]
+  at::Tensor inv_scale,  // tensor holding the multiplier; accessed as float data
+  at::optional<bool> per_tensor_python);  // treated as false when no value is given
+
 void multi_tensor_lamb_stage1_cuda(
     int chunk_size,
     at::Tensor noop_flag,
@@ -184,6 +191,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
         "Computes L2 norm for a list of contiguous tensors");
   m.def("multi_tensor_l2norm_scale", &multi_tensor_l2norm_scale_cuda,
         "Computes L2 norm for a list of contiguous tensors and does scaling");
+  m.def("multi_tensor_unscale_l2norm", &multi_tensor_unscale_l2norm_cuda,
+        "Computes L2 norm for a list of contiguous tensors after unscaling (unscaling is only performed for L2 norm computation, and tensors are not updated)");
   m.def("multi_tensor_lamb_stage1_cuda", &multi_tensor_lamb_stage1_cuda,
         "Computes update part of LAMB optimizer");
   m.def("multi_tensor_lamb_stage2_cuda", &multi_tensor_lamb_stage2_cuda,
diff --git a/csrc/multi_tensor_l2norm_kernel.cu b/csrc/multi_tensor_l2norm_kernel.cu
@@ -109,6 +109,91 @@ struct L2NormFunctor
   }
 };
 
+template<typename x_t>  // x_t: element type the input tensor data is read as
+struct UnscaleL2NormFunctor  // Per-block functor: sums (x * inv_scale)^2 over one chunk of one tensor; x is only read.
+{
+  __device__ __forceinline__ void operator()(
+    int chunk_size,  // number of elements in a full chunk
+    volatile int* noop_gmem,  // flag; set to 1 when this block's sum is not finite
+    TensorListMetadata<1>& tl,  // maps blockIdx.x to a (tensor, chunk) pair and holds sizes/addresses
+    const float* inv_scale,  // pointer to the multiplier applied to each element before squaring
+    float* output,  // per-block accumulators, indexed by blockIdx.x
+    float* output_per_tensor,  // per-(tensor, chunk) results; only written when per_tensor is true
+    bool per_tensor,
+    int max_chunks_per_tensor)  // row stride used when indexing output_per_tensor
+  {
+    // Deliberately no early exit on the noop flag: this kernel should propagate infs/nans.
+    // if(*noop_gmem == 1)
+    //   return;
+
+    int tensor_loc = tl.block_to_tensor[blockIdx.x];
+    int chunk_idx = tl.block_to_chunk[blockIdx.x];
+    int n = tl.sizes[tensor_loc];
+
+    x_t* x = (x_t*)tl.addresses[0][tensor_loc];
+    x += chunk_idx*chunk_size;  // move to the first element of this block's chunk
+
+    n -= chunk_idx*chunk_size;  // n is now the element count from the chunk start to the tensor end
+
+    __shared__ float s_vals[512];  // scratch buffer handed to reduce_block_into_lanes
+
+    float vals[ILP]; // Explicitly zeroed below; `= {0}` probably works too, but this is certain.
+    x_t r_x[ILP];
+    for(int i = 0; i < ILP; i++)
+    {
+      vals[i] = 0.f;
+      r_x[i] = 0;
+    }
+
+    // Aligned case gets its own code path to keep things simple: loads go through load_store.
+    if(n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(x))
+    {
+      for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
+      {
+        // load into r_x (load_store is defined elsewhere; presumably an ILP-wide vector copy)
+        load_store(r_x, x, 0 , i_start);
+#pragma unroll
+        for(int ii = 0; ii < ILP; ii++)
+        {
+          float next = static_cast<float>(r_x[ii]) * (*inv_scale);  // unscale in float
+          vals[ii] += next*next;
+        }
+      }
+    }
+    else  // general path: element-wise reads with a bounds check per element
+    {
+      for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
+      {
+#pragma unroll
+        for(int ii = 0; ii < ILP; ii++)
+        {
+          int i = i_start + threadIdx.x + ii*blockDim.x;
+          if(i < n && i < chunk_size)  // stay inside both the tensor and the chunk
+          {
+            float next = static_cast<float>(x[i]) * (*inv_scale);  // unscale in float
+            vals[ii] += next*next;
+          }
+        }
+      }
+    }
+
+    float val = 0.f;  // this thread's total over its ILP accumulators
+    for(int i = 0; i < ILP; i++)
+        val += vals[i];
+
+    float final = reduce_block_into_lanes(s_vals, val);  // presumably a block-wide sum; thread 0 consumes it below
+
+    if(threadIdx.x == 0)
+    {
+      if(!isfinite(final))
+        *noop_gmem = 1; // Blindly fire off a write.  These will race but that's ok.
+      output[blockIdx.x] += final;  // accumulates into this block's slot rather than overwriting it
+      if(per_tensor)
+        output_per_tensor[(tl.start_tensor_this_launch + tensor_loc)*max_chunks_per_tensor + chunk_idx] = final;
+    }
+  }
+};
+
 // Probably better to template, but since we are not likely to support other norm
 template<typename x_t>
 struct MaxNormFunctor
@@ -355,6 +440,73 @@ std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_cuda(
   return std::tuple<at::Tensor, at::Tensor>(ret, ret_per_tensor);
 }
 
+std::tuple<at::Tensor, at::Tensor> multi_tensor_unscale_l2norm_cuda(  // Host entry point; returns (ret, ret_per_tensor).
+  int chunk_size,  // number of elements in a full chunk
+  at::Tensor noop_flag,  // forwarded to multi_tensor_apply
+  std::vector<std::vector<at::Tensor>> tensor_lists,  // tensor_lists[0] holds the inputs
+  at::Tensor inv_scale,  // accessed as float data and handed to the functor as a pointer
+  at::optional<bool> per_tensor_python)  // missing value means false
+{
+  bool per_tensor = per_tensor_python.has_value() ? per_tensor_python.value() : false;
+
+  auto float_options = tensor_lists[0][0].options().dtype(at::kFloat);  // first input's options with dtype float
+  auto output = at::zeros({320}, float_options);  // NOTE(review): 320 presumably matches the max blocks per multi_tensor_apply launch - confirm
+
+  at::Tensor output_per_tensor;
+  at::Tensor ret_per_tensor;
+
+  int ntensors = tensor_lists[0].size();
+  int max_chunks_per_tensor = -1;  // stays -1 when per_tensor is false
+
+  if(per_tensor)
+  {
+    for(int t = 0; t < ntensors; t++)
+    {
+      int max_chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1)/chunk_size;  // ceil(numel / chunk_size)
+      if(max_chunks_this_tensor > max_chunks_per_tensor)
+        max_chunks_per_tensor = max_chunks_this_tensor;
+    }
+    output_per_tensor = at::zeros({ntensors*max_chunks_per_tensor}, float_options);  // one slot per (tensor, chunk)
+    ret_per_tensor = at::empty({ntensors}, float_options);  // one result per tensor
+  }
+  else
+  {
+    ret_per_tensor = at::empty({0}, float_options);  // empty placeholder returned when per_tensor is false
+  }
+
+  DISPATCH_FLOAT_HALF_AND_BFLOAT(tensor_lists[0][0].scalar_type(), 0, "multi_tensor_unscale_l2norm_cuda",
+    multi_tensor_apply<1>(
+      BLOCK_SIZE,
+      chunk_size,
+      noop_flag,
+      tensor_lists,
+      UnscaleL2NormFunctor<scalar_t_0>(),
+      inv_scale.DATA_PTR<float>(),
+      output.DATA_PTR<float>(),
+      per_tensor ? output_per_tensor.DATA_PTR<float>() : nullptr,
+      per_tensor,
+      max_chunks_per_tensor);)
+
+  AT_CUDA_CHECK(cudaGetLastError());  // surface launch errors from the call above
+  // AT_CUDA_CHECK(cudaDeviceSynchronize());
+
+  // This costs one more small kernel launch, but that is negligible end to end.
+  // I could get rid of it by hacking the functor + multi tensor harness with persistence
+  // logic, but keeping it simple for now.
+  auto ret = at::empty({1}, output.options());
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(output));
+  auto stream = at::cuda::getCurrentCUDAStream();
+  cleanup<<<per_tensor ? ntensors : 1, 512, 0, stream>>>(  // ntensors blocks when per_tensor, else a single block; 512 threads each
+    output.DATA_PTR<float>(),
+    per_tensor ? output_per_tensor.DATA_PTR<float>() : nullptr,
+    ret.DATA_PTR<float>(),
+    per_tensor ? ret_per_tensor.DATA_PTR<float>() : nullptr,
+    per_tensor,
+    max_chunks_per_tensor);
+
+  return std::tuple<at::Tensor, at::Tensor>(ret, ret_per_tensor);
+}
+
 
 // Compute and update grad norm
 // Here use a per tensor norm, and blend new norm(n) and old norm(gn) by
diff --git a/tests/L0/run_amp/test_multi_tensor_unscale_l2norm.py b/tests/L0/run_amp/test_multi_tensor_unscale_l2norm.py
@@ -0,0 +1,89 @@
+import unittest
+
+import functools as ft
+import itertools as it
+
+from apex import amp
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+from utils import common_init, HALF, FLOAT,\
+    ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT
+
+try:
+  import amp_C
+  from amp_C import multi_tensor_unscale_l2norm
+  from apex.multi_tensor_apply import MultiTensorApply
+  disabled = False
+except ImportError as err:
+  print("amp_C fused kernels unavailable, disabling TestMultiTensorApply.  ImportError was ", err)
+  disabled = True
+
+
+class TestMultiTensorUnscaleL2Norm(unittest.TestCase):
+
+    def setUp(self):
+        common_init(self)
+        self.val = 4.0
+        self.inv_scale = 0.5
+        self.inv_scale_cuda = torch.tensor([self.inv_scale], dtype=torch.float32, device='cuda')
+        self.overflow_buf = torch.zeros(1, dtype=torch.int, device='cuda')
+
+    def tearDown(self):
+        pass
+
+    # The tensor creation here is written for convenience, not speed.
+    def unscale_l2norm(self, sizea, sizeb, applier, repeat_tensors, in_type, per_tensor):
+        self.overflow_buf.zero_()
+        a = torch.full([sizea], self.val, dtype=torch.float32, device='cuda')
+        b = torch.full([sizeb], self.val, dtype=torch.float32, device='cuda')
+
+        in_list = []
+        for i in range(repeat_tensors):
+            in_list += [a.clone().to(in_type), b.clone().to(in_type)]
+
+        if per_tensor:
+            norm, norm_per_tensor = applier(multi_tensor_unscale_l2norm, self.overflow_buf, [in_list], self.inv_scale_cuda, True)
+            normab = torch.cat(((a * self.inv_scale).norm().view(1), (b * self.inv_scale).norm().view(1)))
+            norm_per_tensor = norm_per_tensor.view(-1, 2)
+        else:
+            norm, _ = applier(multi_tensor_unscale_l2norm, self.overflow_buf, [in_list], self.inv_scale_cuda, True)
+
+        reference = torch.full([(sizea + sizeb)*repeat_tensors], self.val * self.inv_scale, dtype=torch.float32, device='cuda').norm()
+
+        self.assertTrue(torch.allclose(norm, reference))
+        if per_tensor:
+            self.assertTrue(torch.allclose(norm_per_tensor, normab))
+        self.assertTrue(self.overflow_buf.item() == 0)
+
+    @unittest.skipIf(disabled, "amp_C is unavailable")
+    def test_fuzz(self):
+        input_size_pairs = (
+            (7777*77, 555*555),
+            (777, 555),
+            (555, 2048*32+1),
+            (2048*32+1, 555),
+            (555, 2048*32),
+            (2048*32, 555),
+            (33333, 555),
+            (555, 33333))
+        appliers = (
+            MultiTensorApply(2048*32),
+            MultiTensorApply(333),
+            MultiTensorApply(33333))
+        repeat_tensors = (
+            1,
+            55)
+
+        for sizea, sizeb in input_size_pairs:
+          for applier in appliers:
+            for repeat in repeat_tensors:
+              for in_type in (torch.float32, torch.float16):
+                for per_tensor in (False, True):
+                  self.unscale_l2norm(sizea, sizeb, applier, repeat, in_type, per_tensor)
+
+
+
+if __name__ == '__main__':  # run the tests when this file is executed directly
+    unittest.main()