Null000
diff --git a/‎tensorflow/core/kernels/BUILD‎
Lines changed: 1 addition & 0 deletions b/‎tensorflow/core/kernels/BUILD‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎tensorflow/core/kernels/conv_2d.h‎
Lines changed: 20 additions & 0 deletions b/‎tensorflow/core/kernels/conv_2d.h‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎tensorflow/core/kernels/conv_ops_gpu_3.cu.cc‎
Lines changed: 59 additions & 15 deletions b/‎tensorflow/core/kernels/conv_ops_gpu_3.cu.cc‎
Lines changed: 59 additions & 15 deletions
diff --git a/‎tensorflow/core/kernels/transpose_functor.h‎
Lines changed: 7 additions & 18 deletions b/‎tensorflow/core/kernels/transpose_functor.h‎
Lines changed: 7 additions & 18 deletions
diff --git a/‎tensorflow/core/kernels/transpose_functor_cpu.cc‎
Lines changed: 45 additions & 11 deletions b/‎tensorflow/core/kernels/transpose_functor_cpu.cc‎
Lines changed: 45 additions & 11 deletions
@@ -1105,6 +1105,7 @@ tf_kernel_library(
     visibility = ["//visibility:private"],
     deps = [
         "//tensorflow/core:framework",
+        "//tensorflow/core/kernels:conv_ops",
         "//third_party/eigen3",
     ],
     alwayslink = 0,
 
@@ -256,6 +256,26 @@ struct NCHWToNHWC {
                   typename TTypes<T, NDIMS>::Tensor out);
 };
 
+// Functor that converts the 3-D tensor at `in`, whose extents are given by
+// `input_dims` as [dim0, dim1, dim2], into [dim0, dim2, dim1] written to `out`.
+// Only the call operator is declared here; the body comes from a per-device
+// specialization (the GPU one narrows each extent to int before launching).
+template <typename Device, typename T>
+struct SwapDimension1And2InTensor3 {
+  void operator()(const Device& d, const T* in,
+                  const gtl::ArraySlice<int64>& input_dims, T* out);
+};
+
+// Functor that converts the 3-D tensor at `in`, whose extents are given by
+// `input_dims` as [dim0, dim1, dim2], into [dim2, dim1, dim0] written to `out`.
+// Only the call operator is declared here; the body comes from a per-device
+// specialization (the GPU one narrows each extent to int before launching).
+template <typename Device, typename T>
+struct SwapDimension0And2InTensor3 {
+  void operator()(const Device& d, const T* in,
+                  const gtl::ArraySlice<int64>& input_dims, T* out);
+};
+
 // Reverses the effect of TransformFilter above.
 template <typename Device, typename T, int NDIMS>
 struct ReverseTransformFilter {
 
@@ -126,9 +126,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index<IndexCount> FlatToTensorIndex(
 
 // A Cuda custom kernel that swaps dimension-0 and dimension-2 of a 3D tensor.
 template <typename T>
-__global__ void SwapDimension0And2InTensor3(int nthreads, const T* input,
-                                            Dimension<3> input_dims,
-                                            T* output) {
+__global__ void SwapDimension0And2InTensor3Simple(int nthreads, const T* input,
+                                                  Dimension<3> input_dims,
+                                                  T* output) {
   Dimension<3> output_dims;
   output_dims[0] = input_dims[2];
   output_dims[1] = input_dims[1];
@@ -152,9 +152,9 @@ __global__ void SwapDimension0And2InTensor3(int nthreads, const T* input,
 
 // A Cuda custom kernel that swaps dimension-1 and dimension-2 of a 3D tensor.
 template <typename T>
-__global__ void SwapDimension1And2InTensor3(int nthreads, const T* input,
-                                            Dimension<3> input_dims,
-                                            T* output) {
+__global__ void SwapDimension1And2InTensor3Simple(int nthreads, const T* input,
+                                                  Dimension<3> input_dims,
+                                                  T* output) {
   Dimension<3> output_dims;
   output_dims[0] = input_dims[0];
   output_dims[1] = input_dims[2];
@@ -348,9 +348,9 @@ struct TransformFilter<GPUDevice, T, int, NDIMS> {
     combined_dims[1] = in.dimension(NDIMS - 2);  // input filters
     combined_dims[2] = in.dimension(NDIMS - 1);  // output filters
     CudaLaunchConfig config = GetCudaLaunchConfig(out.size(), d);
-    SwapDimension0And2InTensor3<
-        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        config.virtual_thread_count, in.data(), combined_dims, out.data());
+    SwapDimension0And2InTensor3Simple<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            config.virtual_thread_count, in.data(), combined_dims, out.data());
   }
 };
 
@@ -368,9 +368,9 @@ struct ReverseTransformFilter<GPUDevice, T, NDIMS> {
       combined_dims[2] *= in.dimension(i);
     }
     CudaLaunchConfig config = GetCudaLaunchConfig(out.size(), d);
-    SwapDimension0And2InTensor3<
-        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        config.virtual_thread_count, in.data(), combined_dims, out.data());
+    SwapDimension0And2InTensor3Simple<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            config.virtual_thread_count, in.data(), combined_dims, out.data());
   }
 };
 
@@ -442,12 +442,44 @@ void RunSwapDimension1And2InTensor3(const GPUDevice& d, const T* input,
   } else {
     int total_element_count = input_dims[0] * input_dims[1] * input_dims[2];
     CudaLaunchConfig config = GetCudaLaunchConfig(total_element_count, d);
-    SwapDimension1And2InTensor3<
-        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        config.virtual_thread_count, input, input_dims, output);
+    SwapDimension1And2InTensor3Simple<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            config.virtual_thread_count, input, input_dims, output);
   }
 }
 
+// GPU specialization that exchanges dimensions 1 and 2 of a 3-D tensor by
+// casting the int64 extents to int and calling RunSwapDimension1And2InTensor3.
+template <typename T>
+struct SwapDimension1And2InTensor3<GPUDevice, T> {
+  typedef GPUDevice Device;
+  void operator()(const Device& d, const T* in,
+                  const gtl::ArraySlice<int64>& combined_dims, T* out) {
+    Dimension<3> dims_as_int;
+    for (int axis = 0; axis < 3; ++axis)
+      dims_as_int[axis] = static_cast<int>(combined_dims[axis]);
+    RunSwapDimension1And2InTensor3(d, in, dims_as_int, out);
+  }
+};
+
+// GPU specialization that exchanges dimensions 0 and 2 of a 3-D tensor. The
+// launch of SwapDimension0And2InTensor3Simple is sized from the element count.
+template <typename T>
+struct SwapDimension0And2InTensor3<GPUDevice, T> {
+  typedef GPUDevice Device;
+  void operator()(const Device& d, const T* in,
+                  const gtl::ArraySlice<int64>& combined_dims, T* out) {
+    Dimension<3> dims_as_int;
+    for (int axis = 0; axis < 3; ++axis)
+      dims_as_int[axis] = static_cast<int>(combined_dims[axis]);
+    size_t elem_count = combined_dims[0] * combined_dims[1] * combined_dims[2];
+    CudaLaunchConfig launch = GetCudaLaunchConfig(elem_count, d);
+    SwapDimension0And2InTensor3Simple<T>
+        <<<launch.block_count, launch.thread_per_block, 0, d.stream()>>>(
+            launch.virtual_thread_count, in, dims_as_int, out);
+  }
+};
+
 // A GPU helper functor that converts NHWC TensorFlow data format to
 // NCHW format that is accepted by Cudnn.
 template <typename T, int NDIMS>
@@ -497,6 +529,18 @@ template struct functor::ShuffleAndReverse<GPUDevice, Eigen::half, 4,
 template struct functor::TransformDepth<GPUDevice, float, int>;
 template struct functor::TransformDepth<GPUDevice, Eigen::half, int>;
 
+template struct functor::SwapDimension1And2InTensor3<GPUDevice, uint8>;
+template struct functor::SwapDimension1And2InTensor3<GPUDevice, uint16>;
+template struct functor::SwapDimension1And2InTensor3<GPUDevice, uint32>;
+template struct functor::SwapDimension1And2InTensor3<GPUDevice, uint64>;
+template struct functor::SwapDimension1And2InTensor3<GPUDevice, float4>;
+
+template struct functor::SwapDimension0And2InTensor3<GPUDevice, uint8>;
+template struct functor::SwapDimension0And2InTensor3<GPUDevice, uint16>;
+template struct functor::SwapDimension0And2InTensor3<GPUDevice, uint32>;
+template struct functor::SwapDimension0And2InTensor3<GPUDevice, uint64>;
+template struct functor::SwapDimension0And2InTensor3<GPUDevice, float4>;
+
 // For 2d ops.
 template struct functor::TransformFilter<GPUDevice, float, int, 4>;
 template struct functor::TransformFilter<GPUDevice, Eigen::half, int, 4>;
 
@@ -111,25 +111,14 @@ template <typename Device, typename T, int NDIMS>
 void TransposeUsingEigen(const Device& d, const Tensor& in,
                          const gtl::ArraySlice<int32> perm, Tensor* out);
 
-template <typename Device, typename T>
-void Transpose(const Device& d, const Tensor& in,
-               const gtl::ArraySlice<int32> perm, Tensor* out) {
-  switch (in.dims()) {
-    case 2:
-      TransposeUsingEigen<Device, T, 2>(d, in, perm, out);
-      break;
-    case 3:
-      TransposeUsingEigen<Device, T, 3>(d, in, perm, out);
-      break;
-    case 4:
-      TransposeUsingEigen<Device, T, 4>(d, in, perm, out);
-      break;
-    default:
-      TransposeSimple<Device, T>(d, in, perm, out);
-      break;
-  }
-}
 }  // namespace internal
+// Declared only; run() is defined by per-device specializations of Transpose.
+template <typename Device, typename T>
+struct Transpose {
+  static void run(const Device& d, const Tensor& in,
+                  const gtl::ArraySlice<int32> perm, Tensor* out);
+};
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_KERNELS_TRANSPOSE_FUNCTOR_H_
@@ -61,11 +61,38 @@ void TransposeUsingEigen(const Device& d, const Tensor& in,
 
 }  // end namespace internal
 
-typedef Eigen::ThreadPoolDevice Device;
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+// CPU specialization of the Transpose functor. Ranks 2, 3, 4 and 5 are routed
+// to internal::TransposeUsingEigen with the rank as a template argument; every
+// other rank is handled by internal::TransposeSimple.
+template <typename T>
+struct Transpose<CPUDevice, T> {
+  static void run(const CPUDevice& d, const Tensor& in,
+                  const gtl::ArraySlice<int32> perm, Tensor* out) {
+    // The rank is a runtime value, so pick the matching instantiation by hand.
+    const int rank = in.dims();
+    if (rank == 2) {
+      internal::TransposeUsingEigen<CPUDevice, T, 2>(d, in, perm, out);
+    } else if (rank == 3) {
+      internal::TransposeUsingEigen<CPUDevice, T, 3>(d, in, perm, out);
+    } else if (rank == 4) {
+      internal::TransposeUsingEigen<CPUDevice, T, 4>(d, in, perm, out);
+    } else if (rank == 5) {
+      internal::TransposeUsingEigen<CPUDevice, T, 5>(d, in, perm, out);
+    } else {
+      // No fixed-rank instantiation is provided here for this rank.
+      internal::TransposeSimple<CPUDevice, T>(d, in, perm, out);
+    }
+  }
+};
 
+// TODO(yangzihao): Merge this code with its GPU counterpart to reduce code
+// duplication.
 template <>
-Status DoTranspose<Device>(const Device& d, const Tensor& in,
-                           const gtl::ArraySlice<int32> perm, Tensor* out) {
+Status DoTranspose<CPUDevice>(const CPUDevice& d, const Tensor& in,
+                              const gtl::ArraySlice<int32> perm, Tensor* out) {
+  typedef CPUDevice Device;
   CHECK_GE(in.dims(), 2);
   CHECK_EQ(in.dims(), out->dims());
   CHECK_EQ(in.dims(), perm.size());
@@ -76,7 +103,7 @@ Status DoTranspose<Device>(const Device& d, const Tensor& in,
     case DT_QINT8:
     case DT_QUINT8:
     case DT_UINT8:
-      internal::Transpose<Device, uint8>(d, in, perm, out);
+      Transpose<Device, uint8>::run(d, in, perm, out);
       break;
 
     case DT_BFLOAT16:
@@ -85,27 +112,27 @@ Status DoTranspose<Device>(const Device& d, const Tensor& in,
     case DT_QINT16:
     case DT_QUINT16:
     case DT_UINT16:
-      internal::Transpose<Device, uint16>(d, in, perm, out);
+      Transpose<Device, uint16>::run(d, in, perm, out);
       break;
 
     case DT_FLOAT:
     case DT_INT32:
     case DT_QINT32:
-      internal::Transpose<Device, uint32>(d, in, perm, out);
+      Transpose<Device, uint32>::run(d, in, perm, out);
       break;
 
     case DT_COMPLEX64:
     case DT_DOUBLE:
     case DT_INT64:
-      internal::Transpose<Device, uint64>(d, in, perm, out);
+      Transpose<Device, uint64>::run(d, in, perm, out);
       break;
 
     case DT_COMPLEX128:
-      internal::Transpose<Device, complex128>(d, in, perm, out);
+      Transpose<Device, complex128>::run(d, in, perm, out);
       break;
 
     case DT_STRING:
-      internal::Transpose<Device, string>(d, in, perm, out);
+      Transpose<Device, string>::run(d, in, perm, out);
       break;
 
     default:
@@ -117,6 +144,14 @@ Status DoTranspose<Device>(const Device& d, const Tensor& in,
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
 
+template <typename T>
+struct Transpose<SYCLDevice, T> {
+  static void run(const SYCLDevice& d, const Tensor& in,
+                  const gtl::ArraySlice<int32> perm, Tensor* out) {
+    // TODO: SYCL transpose is not implemented yet; `out` is left unwritten.
+  }
+};
+
 template <>
 Status DoTranspose<SYCLDevice>(const SYCLDevice& d, const Tensor& in,
                            const gtl::ArraySlice<int32> perm, Tensor* out) {
@@ -125,11 +160,10 @@ Status DoTranspose<SYCLDevice>(const SYCLDevice& d, const Tensor& in,
   CHECK_EQ(in.dims(), perm.size());
   CHECK_EQ(in.dtype(), out->dtype());
   switch (in.dtype()) {
-
     case DT_FLOAT:
     case DT_DOUBLE:
     case DT_INT32:
-      internal::Transpose<SYCLDevice, uint32>(d, in, perm, out);
+      internal::Transpose<SYCLDevice, uint32>::run(d, in, perm, out);
       break;
 
     default: