#include <cuda/std/atomic>

#include <iostream>
#include <random>
#include <thread>
#include <vector>
77
/*
 Reduces N integers from input into *result by splitting [0, N) into
 contiguous per-worker chunks and atomically accumulating every element.
 Compiled for both host (std::thread workers) and device (CUDA threads).

 tId        - this worker's index in [0, numThreads)
 numThreads - total number of cooperating workers
 N          - number of elements in input
 result     - accumulator; caller must initialize it to 0 before use

 NOTE(review): one fetch_add per element contends on a single address —
 fine as a teaching baseline, but a per-warp/per-block partial reduction
 with one atomic per block is the performant pattern.
*/
__host__ __device__ void reduceAtomic(int tId, int numThreads, int N, const int* input, cuda::std::atomic<int>* result)
{
    // Surplus workers (possible on the GPU, where the grid is rounded up
    // to whole blocks) have nothing to do.
    if (tId >= N)
        return;

    // Even partition of [0, N); the last worker absorbs the remainder.
    // NOTE(review): these two lines fall in a gap of the visible source and
    // are reconstructed from the myEnd formula below — confirm upstream.
    int perThread = N / numThreads;
    int myStart = perThread * tId;
    int myEnd = (tId == numThreads - 1) ? N : myStart + perThread;

    for (int i = myStart; i < myEnd; i++)
        result->fetch_add(input[i]);
}
2320
// Kernel entry point for the atomic reduction: each CUDA thread computes
// its flat global index and processes exactly one element (passing N as
// numThreads makes reduceAtomic's chunk size 1). Launch with at least
// ceil(N / blockDim.x) blocks; surplus threads exit inside reduceAtomic.
__global__ void launchReductionGPU(int N, const int* input, cuda::std::atomic<int>* result)
{
    const int globalId = blockDim.x * blockIdx.x + threadIdx.x;
    reduceAtomic(globalId, N, N, input, result);
}
26+
/*
 Host-side counterpart to the GPU launch: spawns NUM_THREADS std::threads
 that cooperatively reduce mNumbers[0..N) into *result via reduceAtomic,
 then joins them all before returning (so *result is complete on return).

 NUM_THREADS - compile-time worker count
 N           - number of elements in mNumbers
 result      - accumulator; caller must zero-initialize it
*/
template <unsigned int NUM_THREADS>
__host__ void launchReductionCPU(int N, int* mNumbers, cuda::std::atomic<int>* result)
{
    std::vector<std::thread> threads;
    threads.reserve(NUM_THREADS);
    // emplace_back constructs each thread in place (the original default-
    // constructed NUM_THREADS threads and assigned over them) and the
    // unsigned loop index fixes the int-vs-size_t comparison.
    for (unsigned int i = 0; i < NUM_THREADS; i++)
        threads.emplace_back(reduceAtomic, static_cast<int>(i), static_cast<int>(NUM_THREADS), N, mNumbers, result);
    for (std::thread& t : threads)
        t.join();
}
2936
3037int main ()
@@ -34,35 +41,24 @@ int main()
3441 of the standard in C++20. Soon!
3542 */
3643 constexpr int N = 1 <<16 ;
37- constexpr int cpuNumThreads = 4 ;
38- constexpr int gpuBlockSize = 256 ;
39-
40- int * mNumbers ;
41- cudaMallocManaged ((void **)&mNumbers , sizeof (int ) * N);
4244
4345 std::default_random_engine eng (42 );
4446 std::uniform_int_distribution<int > dist (10 , 42 );
4547
46- cuda::std::atomic<int >* mResults ;
47- cudaMallocManaged ((void **)&mResults , 2 * sizeof (cuda::std::atomic<int >));
48- new (mResults ) cuda::std::atomic<int >(0 );
49- new (mResults +1 ) cuda::std::atomic<int >(0 );
50-
48+ int * mNumbers ;
49+ cudaMallocManaged ((void **)&mNumbers , sizeof (int ) * N);
5150 for (int i = 0 ; i < N; i++)
5251 mNumbers [i] = dist (eng);
5352
54- std::vector<std::thread> threads (cpuNumThreads);
55- for (int i = 0 ; i < threads.size (); i++)
56- threads[i] = std::thread (reduceBasic, i, cpuNumThreads, N, mNumbers , &mResults [0 ]);
57-
58- for (std::thread& t : threads)
59- t.join ();
60-
61- std::cout << " Reduction result CPU: " << mResults [0 ] << " \n " << std::endl;
53+ cuda::std::atomic<int >* mResults ;
54+ cudaMallocManaged ((void **)&mResults , 2 * sizeof (cuda::std::atomic<int >));
55+ mResults [0 ] = mResults [1 ] = 0 ;
6256
63- launchReduceBasic<<<(N + gpuBlockSize - 1 ) / gpuBlockSize, gpuBlockSize>>> (N, mNumbers , &mResults [1 ]);
57+ launchReductionCPU<4 >(N, mNumbers , &mResults [0 ]);
58+ launchReductionGPU<<<(N + 255 ) / 256 , 256 >>> (N, mNumbers , &mResults [1 ]);
6459 cudaDeviceSynchronize ();
6560
61+ std::cout << " Reduction result CPU: " << mResults [0 ] << " \n " << std::endl;
6662 std::cout << " Reduction result GPU: " << mResults [1 ] << " \n " << std::endl;
6763
6864 cudaFree (mNumbers );
0 commit comments