5
5
6
6
namespace tf {
7
7
8
/**
@brief the default number of threads per block for a 1D vector of N elements

Picks a full block of 256 threads whenever the input is large enough
(N >= 256) to occupy one; for smaller inputs it drops to 128 so the
launch does not consist mostly of idle, bounds-guarded threads.
*/
constexpr size_t cuda_default_threads_per_block(size_t N) {
  if (N >= 256) {
    return 256;
  }
  return 128;
}
14
+
8
15
/* *
9
16
@class cudaFlow
10
17
@@ -224,84 +231,48 @@ class cudaFlow {
224
231
// ------------------------------------------------------------------------
225
232
226
233
/* *
227
- @brief applies a functor to each element in the range
234
+ @brief applies a callable to each element in the range
228
235
229
236
@tparam T result type
230
- @tparam F functor type
237
+ @tparam F callable type
231
238
232
239
@param data pointer to the starting address of the data array
233
240
@param N number of elements in the data array
234
- @param functor the functor to apply to each element in the data array
241
+ @param callable the callable to apply to each element in the data array
235
242
236
243
This method is equivalent to the parallel execution of the following loop on a GPU:
237
244
238
245
@code{.cpp}
239
246
for(size_t i=0; i<N; i++) {
240
- functor (data[i]);
247
+ callable (data[i]);
241
248
}
242
249
@endcode
243
250
*/
244
251
template <typename T, typename F>
245
- cudaTask for_each (T* data, size_t N, F&& functor );
252
+ cudaTask for_each (T* data, size_t N, F&& callable );
246
253
247
254
/* *
248
- @brief applies a functor to a source range and stores the result in a target ange
255
+ @brief applies a callable to a source range and stores the result in a target range
249
256
250
257
@tparam T result type
251
- @tparam F functor type
258
+ @tparam F callable type
252
259
@tparam S source types
253
260
254
261
@param tgt pointer to the starting address of the target range
255
262
@param N number of elements in the range
256
- @param functor the functor to apply to each element in the range
263
+ @param callable the callable to apply to each element in the range
257
264
@param srcs pointers to the starting addresses of source ranges
258
265
259
266
This method is equivalent to the parallel execution of the following loop on a GPU:
260
267
261
268
@code{.cpp}
262
269
for(size_t i=0; i<N; i++) {
263
- tgt[i] = functor (src1[i], src2[i], src3[i], ...);
270
+ tgt[i] = callable (src1[i], src2[i], src3[i], ...);
264
271
}
265
272
@endcode
266
273
*/
267
274
template <typename T, typename F, typename ... S>
268
- cudaTask transform (T* tgt, size_t N, F&& functor, S*... srcs);
269
-
270
- // ------------------------------------------------------------------------
271
- // common arithmetic operations
272
- // ------------------------------------------------------------------------
273
-
274
- /* *
275
- @brief performs element-wise add operation over a list of vectors
276
-
277
- @tparam T result type
278
- @tparam S source data types
279
-
280
- @param res pointer to the result vector
281
- @param N number of elements to add for each vector
282
- @param srcs the list of vectors to add
283
-
284
- Performs element-wise add operation over a list of vectors, @c srcs,
285
- and stores the result in the vector, @c res
286
- */
287
- template <typename T, typename ... S>
288
- cudaTask add (T* res, size_t N, const S*... srcs);
289
-
290
- /* *
291
- @brief performs element-wise multiplication over a list of vectors
292
-
293
- @tparam T result type
294
- @tparam S source data types
295
-
296
- @param res pointer to the result vector
297
- @param N number of elements to add for each vector
298
- @param srcs the list of vectors to add
299
-
300
- Performs element-wise multiplication over a list of vectors, @c srcs,
301
- and stores the result in the vector, @c res
302
- */
303
- template <typename T, typename ... S>
304
- cudaTask multiply (T* res, size_t N, const S*... srcs);
275
+ cudaTask transform (T* tgt, size_t N, F&& callable, S*... srcs);
305
276
306
277
private:
307
278
@@ -585,56 +556,6 @@ inline cudaTask cudaFlow::memcpy(void* tgt, const void* src, size_t bytes) {
585
556
return cudaTask (node);
586
557
}
587
558
588
- // Function: add
589
- template <typename T, typename ... Us>
590
- cudaTask cudaFlow::add (T* res, size_t N, const Us*... srcs) {
591
- auto node = _graph.emplace_back (nstd::in_place_type_t <cudaNode::Kernel>{},
592
- [res, N, srcs...] (cudaGraph_t& graph, cudaGraphNode_t& node) {
593
-
594
- cudaKernelNodeParams p;
595
- void * arguments[] = { (void *)&res, (void *)&N, (void *)(&srcs)... };
596
- p.func = (void *)cuda_add<T, Us...>;
597
- p.gridDim = (N+256 )/256 ;
598
- p.blockDim = 256 ;
599
- p.sharedMemBytes = 0 ;
600
- p.kernelParams = arguments;
601
- p.extra = nullptr ;
602
-
603
- TF_CHECK_CUDA (
604
- ::cudaGraphAddKernelNode (&node, graph, nullptr , 0 , &p),
605
- "failed to create a cudaGraph node of add task"
606
- );
607
- }
608
- );
609
-
610
- return cudaTask (node);
611
- }
612
-
613
- // Function: multiply
614
- template <typename T, typename ... Us>
615
- cudaTask cudaFlow::multiply (T* res, size_t N, const Us*... srcs) {
616
- auto node = _graph.emplace_back (nstd::in_place_type_t <cudaNode::Kernel>{},
617
- [res, N, srcs...] (cudaGraph_t& graph, cudaGraphNode_t& node) {
618
-
619
- cudaKernelNodeParams p;
620
- void * arguments[] = { (void *)&res, (void *)&N, (void *)(&srcs)... };
621
- p.func = (void *)cuda_multiply<T, Us...>;
622
- p.gridDim = (N+256 )/256 ;
623
- p.blockDim = 256 ;
624
- p.sharedMemBytes = 0 ;
625
- p.kernelParams = arguments;
626
- p.extra = nullptr ;
627
-
628
- TF_CHECK_CUDA (
629
- ::cudaGraphAddKernelNode (&node, graph, nullptr , 0 , &p),
630
- "failed to create a cudaGraph node of multiply task"
631
- );
632
- }
633
- );
634
-
635
- return cudaTask (node);
636
- }
637
-
638
559
// Function: for_each
639
560
template <typename T, typename F>
640
561
cudaTask cudaFlow::for_each (T* data, size_t N, F&& functor) {
@@ -643,9 +564,10 @@ cudaTask cudaFlow::for_each(T* data, size_t N, F&& functor) {
643
564
644
565
cudaKernelNodeParams p;
645
566
void * arguments[] = { (void *)&data, (void *)&N, (void *)(&f) };
567
+ auto threads_per_block = cuda_default_threads_per_block (N);
646
568
p.func = (void *)cuda_for_each<T, F>;
647
- p.gridDim = (N+256 )/ 256 ;
648
- p.blockDim = 256 ;
569
+ p.gridDim = (N+threads_per_block- 1 )/threads_per_block ;
570
+ p.blockDim = threads_per_block ;
649
571
p.sharedMemBytes = 0 ;
650
572
p.kernelParams = arguments;
651
573
p.extra = nullptr ;
@@ -667,16 +589,17 @@ cudaTask cudaFlow::transform(T* tgt, size_t N, F&& functor, S*... srcs) {
667
589
668
590
cudaKernelNodeParams p;
669
591
void * arguments[] = { (void *)&tgt, (void *)&N, (void *)(&f), (void *)(&srcs)... };
592
+ auto threads_per_block = cuda_default_threads_per_block (N);
670
593
p.func = (void *)cuda_transform<T, F, S...>;
671
- p.gridDim = (N+256 )/ 256 ;
672
- p.blockDim = 256 ;
594
+ p.gridDim = (N+threads_per_block- 1 )/threads_per_block ;
595
+ p.blockDim = threads_per_block ;
673
596
p.sharedMemBytes = 0 ;
674
597
p.kernelParams = arguments;
675
598
p.extra = nullptr ;
676
599
677
600
TF_CHECK_CUDA (
678
601
::cudaGraphAddKernelNode (&node, graph, nullptr , 0 , &p),
679
- "failed to create a cudaGraph node of for_each task"
602
+ "failed to create a cudaGraph node of transform task"
680
603
);
681
604
}
682
605
);
0 commit comments