NVIDIA
diff --git a/‎apex/optimizers/csrc/fused_adam_cuda.cpp‎ ‎csrc/fused_adam_cuda.cpp‎apex/optimizers/csrc/fused_adam_cuda.cpp renamed to csrc/fused_adam_cuda.cpp b/‎apex/optimizers/csrc/fused_adam_cuda.cpp‎ ‎csrc/fused_adam_cuda.cpp‎apex/optimizers/csrc/fused_adam_cuda.cpp renamed to csrc/fused_adam_cuda.cpp
diff --git a/‎…ptimizers/csrc/fused_adam_cuda_kernel.cu‎ ‎csrc/fused_adam_cuda_kernel.cu‎apex/optimizers/csrc/fused_adam_cuda_kernel.cu renamed to csrc/fused_adam_cuda_kernel.cu
Lines changed: 11 additions & 5 deletions b/‎…ptimizers/csrc/fused_adam_cuda_kernel.cu‎ ‎csrc/fused_adam_cuda_kernel.cu‎apex/optimizers/csrc/fused_adam_cuda_kernel.cu renamed to csrc/fused_adam_cuda_kernel.cu
Lines changed: 11 additions & 5 deletions
diff --git a/‎…x/normalization/csrc/layer_norm_cuda.cpp‎ ‎csrc/layer_norm_cuda.cpp‎apex/normalization/csrc/layer_norm_cuda.cpp renamed to csrc/layer_norm_cuda.cpp b/‎…x/normalization/csrc/layer_norm_cuda.cpp‎ ‎csrc/layer_norm_cuda.cpp‎apex/normalization/csrc/layer_norm_cuda.cpp renamed to csrc/layer_norm_cuda.cpp
diff --git a/‎…alization/csrc/layer_norm_cuda_kernel.cu‎ ‎csrc/layer_norm_cuda_kernel.cu‎apex/normalization/csrc/layer_norm_cuda_kernel.cu renamed to csrc/layer_norm_cuda_kernel.cu
Lines changed: 6 additions & 2 deletions b/‎…alization/csrc/layer_norm_cuda_kernel.cu‎ ‎csrc/layer_norm_cuda_kernel.cu‎apex/normalization/csrc/layer_norm_cuda_kernel.cu renamed to csrc/layer_norm_cuda_kernel.cu
Lines changed: 6 additions & 2 deletions
diff --git a/‎csrc/multi_tensor_apply.cuh‎
Lines changed: 2 additions & 2 deletions b/‎csrc/multi_tensor_apply.cuh‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎csrc/multi_tensor_scale_kernel.cu‎
Lines changed: 16 additions & 6 deletions b/‎csrc/multi_tensor_scale_kernel.cu‎
Lines changed: 16 additions & 6 deletions
diff --git a/‎csrc/type_shim.h‎
Lines changed: 14 additions & 0 deletions b/‎csrc/type_shim.h‎
Lines changed: 14 additions & 0 deletions
@@ -10,6 +10,8 @@
 #include "ATen/AccumulateType.h"
 #include <THC/THCGeneral.h>
 
+#include "type_shim.h"
+
 typedef enum{
     ADAM_MODE_0   =0, // eps under square root
     ADAM_MODE_1   =1  // eps outside square root
@@ -29,8 +31,8 @@ __global__ void adam_cuda_kernel(
         const float step_size,
         const size_t tsize,
         adamMode_t mode,
-        const float decay) {
-
+        const float decay)
+{
         //Assuming 2D grids and 2D blocks
         const int blockId = gridDim.x * blockIdx.y + blockIdx.x;
         const int threadsPerBlock = blockDim.x * blockDim.y;
@@ -67,7 +69,9 @@ void fused_adam_cuda(
         int step,
         int mode,
         int bias_correction,
-        float decay) {
+        float decay)
+{
+//        using namespace at;
 
         //Get tensor size
         int tsize = p.numel();
@@ -91,7 +95,8 @@ void fused_adam_cuda(
 //all other values should be fp32 for half gradients
             AT_ASSERTM(p.type().scalarType() == at::ScalarType::Float, "expected parameter to be of float type");
 //dispatch is done on the gradient type
-            AT_DISPATCH_FLOATING_TYPES_AND_HALF(g.type(), "adam_cuda_kernel", ([&] {
+            using namespace at; // prevents "toString is undefined" errors
+            AT_DISPATCH_FLOATING_TYPES_AND_HALF(TypeShim(g.type()), "adam_cuda_kernel", ([&] {
                 using accscalar_t = at::acc_type<scalar_t, true>;
                 adam_cuda_kernel<accscalar_t, scalar_t><<<blocks,threadsPerBlock, 0, stream>>>(
                         p.data<accscalar_t>(),
@@ -109,7 +114,8 @@ void fused_adam_cuda(
                         decay);
             }));
       } else {
-            AT_DISPATCH_FLOATING_TYPES(g.type(), "adam_cuda_kernel", ([&] {
+            using namespace at;
+            AT_DISPATCH_FLOATING_TYPES(TypeShim(g.type()), "adam_cuda_kernel", ([&] {
                 adam_cuda_kernel<scalar_t, scalar_t><<<blocks,threadsPerBlock, 0, stream>>>(
                         p.data<scalar_t>(),
                         NULL, //don't output p_copy for fp32, it's wasted write
 
@@ -6,6 +6,8 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 
+#include "type_shim.h"
+
 template<typename U> __device__
 void cuWelfordOnlineSum(
   const U curr,
@@ -675,7 +677,8 @@ void cuda_layer_norm(
     at::Tensor* beta,
     double epsilon)
 {
-    AT_DISPATCH_FLOATING_TYPES_AND_HALF(input->type(), "layer_norm_cuda_kernel", ([&] {
+    using namespace at;
+    AT_DISPATCH_FLOATING_TYPES_AND_HALF(TypeShim(input->type()), "layer_norm_cuda_kernel", ([&] {
         using accscalar_t = at::acc_type<scalar_t, true>;
         HostApplyLayerNorm(
             output->data<scalar_t>(),
@@ -772,7 +775,8 @@ void cuda_layer_norm_gradient(
     at::Tensor* grad_gamma,
     at::Tensor* grad_beta)
 {
-    AT_DISPATCH_FLOATING_TYPES_AND_HALF(input->type(), "cuComputeGradInput", ([&] {
+    using namespace at;
+    AT_DISPATCH_FLOATING_TYPES_AND_HALF(TypeShim(input->type()), "cuComputeGradInput", ([&] {
         using accscalar_t = at::acc_type<scalar_t, true>;
         HostLayerNormGradient(
 	    dout->data<scalar_t>(),
 
@@ -14,7 +14,7 @@
 constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30};
 constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320};
 
-template<int n> struct TensorList
+template<int n> struct TensorListMetadata
 {
   void* addresses[n][depth_to_max_tensors[n-1]];
   int sizes[depth_to_max_tensors[n-1]];
@@ -62,7 +62,7 @@ void multi_tensor_apply(
 
   int ntensors = tensor_lists[0].size();
 
-  TensorList<depth> tl;
+  TensorListMetadata<depth> tl;
 
   auto stream = at::cuda::getCurrentCUDAStream();
 
 
@@ -2,9 +2,15 @@
 #include <ATen/AccumulateType.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <ATen/cuda/Exceptions.h>
-#include "multi_tensor_apply.cuh"
+// Another possibility:
+// #include <torch/all.h>
 
 #include <assert.h>
+// Stringstream is a big hammer, but I want to rely on operator<< for dtype.
+#include <sstream>
+
+#include "type_shim.h"
+#include "multi_tensor_apply.cuh"
 
 #define BLOCK_SIZE 512
 #define ILP 4
@@ -15,7 +21,7 @@ struct ScaleFunctor
    __device__ __forceinline__ void operator()(
     int chunk_size,
     volatile int* noop_gmem,
-    TensorList<2>& tl,
+    TensorListMetadata<2>& tl,
     float scale)
   {
     __shared__ int noop_smem;
@@ -87,15 +93,17 @@ void multi_tensor_scale_cuda(
   std::vector<std::vector<at::Tensor>> tensor_lists,
   float scale)
 {
+  using namespace at;
   // The output (downscaled) type is always float.
   // If build times suffer, think about where to put this dispatch,
   // and what logic should be moved out of multi_tensor_apply.
-  AT_DISPATCH_FLOATING_TYPES_AND_HALF(tensor_lists[0][0].type(),
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(TypeShim(tensor_lists[0][0].type()),
      "multi_tensor_scale_cuda",
      [&]
      {
        // using accscalar_t = acc_type<scalar_t, true>;
-       switch(tensor_lists[1][0].type().scalarType())
+       switch(tensor_lists[1][0].scalar_type())
        {
          case at::ScalarType::Half:
            multi_tensor_apply<2>(
@@ -116,8 +124,10 @@ void multi_tensor_scale_cuda(
              scale);
            break;
          default:
-           AT_ERROR("multi_tensor_scale_cuda not implemented for output type = ",
-                    tensor_lists[1][0].type().toString());
+           std::stringstream ss;
+           ss << "multi_tensor_scale_cuda not implemented for output type = "
+              << tensor_lists[1][0].dtype();
+           AT_ERROR(ss.str().c_str());
        }
      });
 
 
@@ -0,0 +1,14 @@
+#include <ATen/ATen.h>
+
+// Forward/backward compatibility hack around
+// https://github.com/pytorch/pytorch/commit/3aeb78079bcd68282fe9117088e138b77318e288
+// pending more future-proof guidance from upstream.
+struct TypeShim
+{
+  // Non-owning reference to the wrapped type; the shim must not outlive it.
+  const at::Type& payload;
+  // Implicit (non-explicit) converting constructor from an at::Type reference.
+  TypeShim(const at::Type& type) : payload(type) {}
+  // Enable trivial conversion to a const at::Type& for pre-3aeb78.
+  // const-qualified so the conversion also works through a const TypeShim.
+  operator const at::Type&() const { return payload; }
+  // Enable dispatch switch statements to take *this directly for post-3aeb78.
+  // Reads the scalar type from the wrapped at::Type on every conversion.
+  operator at::ScalarType() const { return payload.scalarType(); }
+};