5
5
6
6
namespace tf {
7
7
8
/**
@brief the default number of threads per block for a 1D vector of N elements

Picks a full block of 256 threads whenever the input is large enough
(N >= 256) to occupy one; for smaller inputs it drops to 128 so the
launch does not consist mostly of idle, bounds-guarded threads.
*/
constexpr size_t cuda_default_threads_per_block(size_t N) {
  if (N >= 256) {
    return 256;
  }
  return 128;
}
14
+
8
15
/* *
9
16
@class cudaFlow
10
17
@@ -224,84 +231,48 @@ class cudaFlow {
224
231
// ------------------------------------------------------------------------
225
232
226
233
/* *
227
- @brief applies a functor to each element in the range
234
+ @brief applies a callable to each element in the range
228
235
229
236
@tparam T result type
230
- @tparam F functor type
237
+ @tparam F callable type
231
238
232
239
@param data pointer to the starting address of the data array
233
240
@param N number of elements in the data array
234
- @param functor the functor to apply to each element in the data array
241
+ @param callable the callable to apply to each element in the data array
235
242
236
243
This method is equivalent to the parallel execution of the following loop on a GPU:
237
244
238
245
@code{.cpp}
239
246
for(size_t i=0; i<N; i++) {
240
- functor (data[i]);
247
+ callable (data[i]);
241
248
}
242
249
@endcode
243
250
*/
244
251
template <typename T, typename F>
245
- cudaTask for_each (T* data, size_t N, F&& functor );
252
+ cudaTask for_each (T* data, size_t N, F&& callable );
246
253
247
254
/* *
248
- @brief applies a functor to a source range and stores the result in a target ange
255
+ @brief applies a callable to a source range and stores the result in a target range
249
256
250
257
@tparam T result type
251
- @tparam F functor type
258
+ @tparam F callable type
252
259
@tparam S source types
253
260
254
261
@param tgt pointer to the starting address of the target range
255
262
@param N number of elements in the range
256
- @param functor the functor to apply to each element in the range
263
+ @param callable the callable to apply to each element in the range
257
264
@param srcs pointers to the starting addresses of source ranges
258
265
259
266
This method is equivalent to the parallel execution of the following loop on a GPU:
260
267
261
268
@code{.cpp}
262
269
for(size_t i=0; i<N; i++) {
263
- tgt[i] = functor (src1[i], src2[i], src3[i], ...);
270
+ tgt[i] = callable (src1[i], src2[i], src3[i], ...);
264
271
}
265
272
@endcode
266
273
*/
267
274
template <typename T, typename F, typename ... S>
268
- cudaTask transform (T* tgt, size_t N, F&& functor, S*... srcs);
269
-
270
- // ------------------------------------------------------------------------
271
- // common arithmetic operations
272
- // ------------------------------------------------------------------------
273
-
274
- /* *
275
- @brief performs element-wise add operation over a list of vectors
276
-
277
- @tparam T result type
278
- @tparam S source data types
279
-
280
- @param res pointer to the result vector
281
- @param N number of elements to add for each vector
282
- @param srcs the list of vectors to add
283
-
284
- Performs element-wise add operation over a list of vectors, @c srcs,
285
- and stores the result in the vector, @c res
286
- */
287
- template <typename T, typename ... S>
288
- cudaTask add (T* res, size_t N, const S*... srcs);
289
-
290
- /* *
291
- @brief performs element-wise multiplication over a list of vectors
292
-
293
- @tparam T result type
294
- @tparam S source data types
295
-
296
- @param res pointer to the result vector
297
- @param N number of elements to add for each vector
298
- @param srcs the list of vectors to add
299
-
300
- Performs element-wise multiplication over a list of vectors, @c srcs,
301
- and stores the result in the vector, @c res
302
- */
303
- template <typename T, typename ... S>
304
- cudaTask multiply (T* res, size_t N, const S*... srcs);
275
+ cudaTask transform (T* tgt, size_t N, F&& callable, S*... srcs);
305
276
306
277
private:
307
278
@@ -585,56 +556,6 @@ inline cudaTask cudaFlow::memcpy(void* tgt, const void* src, size_t bytes) {
585
556
return cudaTask (node);
586
557
}
587
558
588
- // Function: add
589
- template <typename T, typename ... Us>
590
- cudaTask cudaFlow::add (T* res, size_t N, const Us*... srcs) {
591
- auto node = _graph.emplace_back (nstd::in_place_type_t <cudaNode::Kernel>{},
592
- [res, N, srcs...] (cudaGraph_t& graph, cudaGraphNode_t& node) {
593
-
594
- cudaKernelNodeParams p;
595
- void * arguments[] = { (void *)&res, (void *)&N, (void *)(&srcs)... };
596
- p.func = (void *)cuda_add<T, Us...>;
597
- p.gridDim = (N+256 )/256 ;
598
- p.blockDim = 256 ;
599
- p.sharedMemBytes = 0 ;
600
- p.kernelParams = arguments;
601
- p.extra = nullptr ;
602
-
603
- TF_CHECK_CUDA (
604
- ::cudaGraphAddKernelNode (&node, graph, nullptr , 0 , &p),
605
- "failed to create a cudaGraph node of add task"
606
- );
607
- }
608
- );
609
-
610
- return cudaTask (node);
611
- }
612
-
613
- // Function: multiply
614
- template <typename T, typename ... Us>
615
- cudaTask cudaFlow::multiply (T* res, size_t N, const Us*... srcs) {
616
- auto node = _graph.emplace_back (nstd::in_place_type_t <cudaNode::Kernel>{},
617
- [res, N, srcs...] (cudaGraph_t& graph, cudaGraphNode_t& node) {
618
-
619
- cudaKernelNodeParams p;
620
- void * arguments[] = { (void *)&res, (void *)&N, (void *)(&srcs)... };
621
- p.func = (void *)cuda_multiply<T, Us...>;
622
- p.gridDim = (N+256 )/256 ;
623
- p.blockDim = 256 ;
624
- p.sharedMemBytes = 0 ;
625
- p.kernelParams = arguments;
626
- p.extra = nullptr ;
627
-
628
- TF_CHECK_CUDA (
629
- ::cudaGraphAddKernelNode (&node, graph, nullptr , 0 , &p),
630
- "failed to create a cudaGraph node of multiply task"
631
- );
632
- }
633
- );
634
-
635
- return cudaTask (node);
636
- }
637
-
638
559
// Function: for_each
639
560
template <typename T, typename F>
640
561
cudaTask cudaFlow::for_each (T* data, size_t N, F&& functor) {
@@ -643,9 +564,10 @@ cudaTask cudaFlow::for_each(T* data, size_t N, F&& functor) {
643
564
644
565
cudaKernelNodeParams p;
645
566
void * arguments[] = { (void *)&data, (void *)&N, (void *)(&f) };
567
+ auto threads_per_block = cuda_default_threads_per_block (N);
646
568
p.func = (void *)cuda_for_each<T, F>;
647
- p.gridDim = (N+256 )/ 256 ;
648
- p.blockDim = 256 ;
569
+ p.gridDim = (N+threads_per_block- 1 )/threads_per_block ;
570
+ p.blockDim = threads_per_block ;
649
571
p.sharedMemBytes = 0 ;
650
572
p.kernelParams = arguments;
651
573
p.extra = nullptr ;
@@ -667,16 +589,17 @@ cudaTask cudaFlow::transform(T* tgt, size_t N, F&& functor, S*... srcs) {
667
589
668
590
cudaKernelNodeParams p;
669
591
void * arguments[] = { (void *)&tgt, (void *)&N, (void *)(&f), (void *)(&srcs)... };
592
+ auto threads_per_block = cuda_default_threads_per_block (N);
670
593
p.func = (void *)cuda_transform<T, F, S...>;
671
- p.gridDim = (N+256 )/ 256 ;
672
- p.blockDim = 256 ;
594
+ p.gridDim = (N+threads_per_block- 1 )/threads_per_block ;
595
+ p.blockDim = threads_per_block ;
673
596
p.sharedMemBytes = 0 ;
674
597
p.kernelParams = arguments;
675
598
p.extra = nullptr ;
676
599
677
600
TF_CHECK_CUDA (
678
601
::cudaGraphAddKernelNode (&node, graph, nullptr , 0 , &p),
679
- "failed to create a cudaGraph node of for_each task"
602
+ "failed to create a cudaGraph node of transform task"
680
603
);
681
604
}
682
605
);
0 commit comments