
Commit 61860c3

added cuda algorithm for_each and transform
1 parent 2158bb0 commit 61860c3

File tree

6 files changed: +534 -107 lines changed


CMakeLists.txt

Lines changed: 6 additions & 6 deletions
@@ -106,7 +106,7 @@ if(CMAKE_CUDA_COMPILER AND TF_BUILD_CUDA)
     error_settings
     BEFORE
     INTERFACE
-    $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-Wall,-Wextra,-Wfatal-errors>
+    $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda -Xcompiler=-Wall,-Wextra,-Wfatal-errors>
   )
 endif(CMAKE_CUDA_COMPILER AND TF_BUILD_CUDA)

@@ -322,11 +322,11 @@ target_link_libraries(
 endif(CMAKE_CUDA_COMPILER AND TF_BUILD_CUDA)

 #### TensorFrame Project
-set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${TF_EXAMPLE_DIR}/tensorframe)
-add_executable(add ${TF_EXAMPLE_DIR}/tensorframe/add.cpp)
-target_link_libraries(
-  add TensorFrame Threads::Threads tf::default_settings
-)
+#set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${TF_EXAMPLE_DIR}/tensorframe)
+#add_executable(add ${TF_EXAMPLE_DIR}/tensorframe/add.cpp)
+#target_link_libraries(
+#  add TensorFrame Threads::Threads tf::default_settings
+#)

 #### TaskflowDSL project
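The added --extended-lambda flag lets nvcc accept __device__ lambdas written in host code, which is what allows plain lambdas to be passed as callables to the new cudaFlow algorithms. A minimal sketch of the kind of call the flag enables (hypothetical user code, not part of this commit; the function and buffer names are assumptions):

#include <taskflow/cuda/cuda_flow.hpp>  // include path assumed

// negate every element of a device buffer through the new cudaFlow::for_each;
// nvcc rejects the __device__ lambda below unless --extended-lambda is passed
void negate_all(tf::cudaFlow& cf, float* gpu_data, size_t N) {
  cf.for_each(gpu_data, N, [] __device__ (float& x) { x = -x; });
}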

taskflow/cuda/cuda_flow.hpp

Lines changed: 24 additions & 101 deletions
@@ -5,6 +5,13 @@
 
 namespace tf {
 
+/**
+@brief the default number of threads per block for a 1D vector of N elements
+*/
+constexpr size_t cuda_default_threads_per_block(size_t N) {
+  return N >= 256 ? 256 : 128;
+}
+
 /**
 @class cudaFlow
 
@@ -224,84 +231,48 @@ class cudaFlow {
   // ------------------------------------------------------------------------
 
   /**
-  @brief applies a functor to each element in the range
+  @brief applies a callable to each element in the range
 
   @tparam T result type
-  @tparam F functor type
+  @tparam F callable type
 
   @param data pointer to the starting address of the data array
   @param N number of elements in the data array
-  @param functor the functor to apply to each element in the data array
+  @param callable the callable to apply to each element in the data array
 
   This method is equivalent to the parallel execution of the following loop on a GPU:
 
   @code{.cpp}
   for(size_t i=0; i<N; i++) {
-    functor(data[i]);
+    callable(data[i]);
   }
   @endcode
   */
   template <typename T, typename F>
-  cudaTask for_each(T* data, size_t N, F&& functor);
+  cudaTask for_each(T* data, size_t N, F&& callable);
 
   /**
-  @brief applies a functor to a source range and stores the result in a target range
+  @brief applies a callable to a source range and stores the result in a target range
 
   @tparam T result type
-  @tparam F functor type
+  @tparam F callable type
   @tparam S source types
 
   @param tgt pointer to the starting address of the target range
   @param N number of elements in the range
-  @param functor the functor to apply to each element in the range
+  @param callable the callable to apply to each element in the range
   @param srcs pointers to the starting addresses of source ranges
 
   This method is equivalent to the parallel execution of the following loop on a GPU:
 
   @code{.cpp}
   for(size_t i=0; i<N; i++) {
-    tgt[i] = functor(src1[i], src2[i], src3[i], ...);
+    tgt[i] = callable(src1[i], src2[i], src3[i], ...);
   }
   @endcode
   */
   template <typename T, typename F, typename... S>
-  cudaTask transform(T* tgt, size_t N, F&& functor, S*... srcs);
-
-  // ------------------------------------------------------------------------
-  // common arithmetic operations
-  // ------------------------------------------------------------------------
-
-  /**
-  @brief performs element-wise add operation over a list of vectors
-
-  @tparam T result type
-  @tparam S source data types
-
-  @param res pointer to the result vector
-  @param N number of elements to add for each vector
-  @param srcs the list of vectors to add
-
-  Performs element-wise add operation over a list of vectors, @c srcs,
-  and stores the result in the vector, @c res
-  */
-  template <typename T, typename... S>
-  cudaTask add(T* res, size_t N, const S*... srcs);
-
-  /**
-  @brief performs element-wise multiplication over a list of vectors
-
-  @tparam T result type
-  @tparam S source data types
-
-  @param res pointer to the result vector
-  @param N number of elements to add for each vector
-  @param srcs the list of vectors to add
-
-  Performs element-wise multiplication over a list of vectors, @c srcs,
-  and stores the result in the vector, @c res
-  */
-  template <typename T, typename... S>
-  cudaTask multiply(T* res, size_t N, const S*... srcs);
+  cudaTask transform(T* tgt, size_t N, F&& callable, S*... srcs);
 
   private:

@@ -585,56 +556,6 @@ inline cudaTask cudaFlow::memcpy(void* tgt, const void* src, size_t bytes) {
   return cudaTask(node);
 }
 
-// Function: add
-template <typename T, typename... Us>
-cudaTask cudaFlow::add(T* res, size_t N, const Us*... srcs) {
-  auto node = _graph.emplace_back(nstd::in_place_type_t<cudaNode::Kernel>{},
-    [res, N, srcs...] (cudaGraph_t& graph, cudaGraphNode_t& node) {
-
-      cudaKernelNodeParams p;
-      void* arguments[] = { (void*)&res, (void*)&N, (void*)(&srcs)... };
-      p.func = (void*)cuda_add<T, Us...>;
-      p.gridDim = (N+256)/256;
-      p.blockDim = 256;
-      p.sharedMemBytes = 0;
-      p.kernelParams = arguments;
-      p.extra = nullptr;
-
-      TF_CHECK_CUDA(
-        ::cudaGraphAddKernelNode(&node, graph, nullptr, 0, &p),
-        "failed to create a cudaGraph node of add task"
-      );
-    }
-  );
-
-  return cudaTask(node);
-}
-
-// Function: multiply
-template <typename T, typename... Us>
-cudaTask cudaFlow::multiply(T* res, size_t N, const Us*... srcs) {
-  auto node = _graph.emplace_back(nstd::in_place_type_t<cudaNode::Kernel>{},
-    [res, N, srcs...] (cudaGraph_t& graph, cudaGraphNode_t& node) {
-
-      cudaKernelNodeParams p;
-      void* arguments[] = { (void*)&res, (void*)&N, (void*)(&srcs)... };
-      p.func = (void*)cuda_multiply<T, Us...>;
-      p.gridDim = (N+256)/256;
-      p.blockDim = 256;
-      p.sharedMemBytes = 0;
-      p.kernelParams = arguments;
-      p.extra = nullptr;
-
-      TF_CHECK_CUDA(
-        ::cudaGraphAddKernelNode(&node, graph, nullptr, 0, &p),
-        "failed to create a cudaGraph node of multiply task"
-      );
-    }
-  );
-
-  return cudaTask(node);
-}
-
 // Function: for_each
 template <typename T, typename F>
 cudaTask cudaFlow::for_each(T* data, size_t N, F&& functor) {
@@ -643,9 +564,10 @@ cudaTask cudaFlow::for_each(T* data, size_t N, F&& functor) {
 
       cudaKernelNodeParams p;
       void* arguments[] = { (void*)&data, (void*)&N, (void*)(&f) };
+      auto threads_per_block = cuda_default_threads_per_block(N);
       p.func = (void*)cuda_for_each<T, F>;
-      p.gridDim = (N+256)/256;
-      p.blockDim = 256;
+      p.gridDim = (N+threads_per_block-1)/threads_per_block;
+      p.blockDim = threads_per_block;
       p.sharedMemBytes = 0;
       p.kernelParams = arguments;
       p.extra = nullptr;
@@ -667,16 +589,17 @@ cudaTask cudaFlow::transform(T* tgt, size_t N, F&& functor, S*... srcs) {
 
      cudaKernelNodeParams p;
      void* arguments[] = { (void*)&tgt, (void*)&N, (void*)(&f), (void*)(&srcs)... };
+      auto threads_per_block = cuda_default_threads_per_block(N);
       p.func = (void*)cuda_transform<T, F, S...>;
-      p.gridDim = (N+256)/256;
-      p.blockDim = 256;
+      p.gridDim = (N+threads_per_block-1)/threads_per_block;
+      p.blockDim = threads_per_block;
       p.sharedMemBytes = 0;
       p.kernelParams = arguments;
       p.extra = nullptr;
 
       TF_CHECK_CUDA(
         ::cudaGraphAddKernelNode(&node, graph, nullptr, 0, &p),
-        "failed to create a cudaGraph node of for_each task"
+        "failed to create a cudaGraph node of transform task"
       );
     }
   );
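Taken together, the new algorithms can be used roughly as follows. This is a hypothetical usage sketch, not code from this commit: the functors, buffer names, and surrounding setup are assumptions, and with the --extended-lambda flag above the functors could equally be written as __device__ lambdas.

#include <taskflow/taskflow.hpp>
#include <taskflow/cuda/cuda_flow.hpp>   // include path assumed

// device-callable functors
struct Scale {
  float s;
  __device__ void operator()(float& x) const { x *= s; }                 // used by for_each
};
struct Sum {
  __device__ float operator()(float a, float b) const { return a + b; }  // used by transform
};

// gpu_x, gpu_y, gpu_z: device buffers of N floats, assumed already allocated
void build_gpu_work(tf::Taskflow& taskflow,
                    float* gpu_x, float* gpu_y, float* gpu_z, size_t N) {
  taskflow.emplace([=](tf::cudaFlow& cf) {
    // gpu_x[i] *= 2, one GPU thread per element
    tf::cudaTask scale = cf.for_each(gpu_x, N, Scale{2.0f});
    // gpu_z[i] = gpu_x[i] + gpu_y[i]
    tf::cudaTask sum = cf.transform(gpu_z, N, Sum{}, gpu_x, gpu_y);
    scale.precede(sum);   // order the two kernels within the CUDA graph
  });
}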

taskflow/cuda/cuda_ops.hpp

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
+#pragma once
+
+#include "cuda_graph.hpp"
+
+namespace tf {
+
+// ----------------------------------------------------------------------------
+// for_each
+// ----------------------------------------------------------------------------
+
+// Kernel: for_each
+template <typename T, typename F>
+__global__ void cuda_for_each(T* data, size_t N, F functor) {
+  size_t i = blockIdx.x*blockDim.x + threadIdx.x;
+  if (i < N) {
+    functor(data[i]);
+  }
+}
+
+// ----------------------------------------------------------------------------
+// transform
+// ----------------------------------------------------------------------------
+
+// Kernel: transform
+template <typename T, typename F, typename... S>
+__global__ void cuda_transform(T* data, size_t N, F functor, S*... src) {
+  size_t i = blockIdx.x*blockDim.x + threadIdx.x;
+  if (i < N) {
+    data[i] = functor(src[i]...);
+  }
+}
+
+} // end of namespace tf -----------------------------------------------------
+
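The cudaFlow methods above launch these kernels with the usual ceiling-division geometry. For illustration only, an equivalent direct launch outside a CUDA graph would look like the following sketch (hypothetical code, not part of this commit; the Negate functor and buffer name are assumptions):

#include <taskflow/cuda/cuda_ops.hpp>    // include path assumed
#include <taskflow/cuda/cuda_flow.hpp>   // for cuda_default_threads_per_block

struct Negate {
  __device__ void operator()(float& x) const { x = -x; }
};

// gpu_data: device buffer of N floats, assumed already allocated
void negate_on_device(float* gpu_data, size_t N) {
  size_t threads = tf::cuda_default_threads_per_block(N);  // 256 if N >= 256, else 128
  size_t blocks  = (N + threads - 1) / threads;            // e.g., N = 1000 -> 256 threads, 4 blocks
  tf::cuda_for_each<<<blocks, threads>>>(gpu_data, N, Negate{});
}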

taskflow/tensorframe.hpp

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+#pragma once
+
+#include "./tensorframe/tensorframe.hpp"
+
+namespace tf {
+
+} // end of namespace tf -----------------------------------------------------
+
Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
+#pragma once
+
+#include "tensor.hpp"
+
+namespace tf {
+
+// TODO
+template <typename T>
+void tensor_add (Tensor<T>& res, Tensor<T>& lhs, Tensor<T>& rhs) {
+
+  if(res._shape != lhs._shape || lhs._shape != rhs._shape) {
+    TF_THROW("tensor shapes do not match!");
+  }
+
+  // case 1: all tensors have data in memory
+  if(res._storage_level == MEMORY &&
+     lhs._storage_level == MEMORY &&
+     rhs._storage_level == MEMORY) {
+
+    return;
+  }
+
+  // case 2: TODO
+
+}
+
+} // end of namespace tf -----------------------------------------------------
