Commit c4325e6

Author: Bernhard Kerbl
Improved sample 18
1 parent 9272ff1

File tree

1 file changed: +22 -26 lines

18_StandardLibrary/src/main.cu

Lines changed: 22 additions & 26 deletions
@@ -5,7 +5,7 @@
 #include <random>
 #include <cuda/std/atomic>

-__host__ __device__ void reduceBasic(int tId, int numThreads, int N, const int* input, cuda::std::atomic<int>* result)
+__host__ __device__ void reduceAtomic(int tId, int numThreads, int N, const int* input, cuda::std::atomic<int>* result)
 {
     if (tId >= N)
         return;
@@ -15,16 +15,23 @@ __host__ __device__ void reduceBasic(int tId, int numThreads, int N, const int*
     int myEnd = (tId == numThreads - 1) ? N : myStart + perThread;

     for (int i = myStart; i < myEnd; i++)
-    {
-        int val = input[i];
-        result->fetch_add(val);
-    }
+        result->fetch_add(input[i]);
 }

-__global__ void launchReduceBasic(int N, const int* input, cuda::std::atomic<int>* result)
+__global__ void launchReductionGPU(int N, const int* input, cuda::std::atomic<int>* result)
 {
     int tId = blockIdx.x * blockDim.x + threadIdx.x;
-    reduceBasic(tId, N, N, input, result);
+    reduceAtomic(tId, N, N, input, result);
+}
+
+template<unsigned int NUM_THREADS>
+__host__ void launchReductionCPU(int N, int* mNumbers, cuda::std::atomic<int>* result)
+{
+    std::vector<std::thread> threads(NUM_THREADS);
+    for (int i = 0; i < threads.size(); i++)
+        threads[i] = std::thread(reduceAtomic, i, NUM_THREADS, N, mNumbers, result);
+    for (std::thread& t : threads)
+        t.join();
 }

 int main()
@@ -34,35 +41,24 @@ int main()
     of the standard in C++20. Soon!
     */
     constexpr int N = 1<<16;
-    constexpr int cpuNumThreads = 4;
-    constexpr int gpuBlockSize = 256;
-
-    int* mNumbers;
-    cudaMallocManaged((void**)&mNumbers, sizeof(int) * N);

     std::default_random_engine eng(42);
     std::uniform_int_distribution<int> dist(10, 42);

-    cuda::std::atomic<int>* mResults;
-    cudaMallocManaged((void**)&mResults, 2 * sizeof(cuda::std::atomic<int>));
-    new (mResults) cuda::std::atomic<int>(0);
-    new (mResults+1) cuda::std::atomic<int>(0);
-
+    int* mNumbers;
+    cudaMallocManaged((void**)&mNumbers, sizeof(int) * N);
     for (int i = 0; i < N; i++)
         mNumbers[i] = dist(eng);

-    std::vector<std::thread> threads(cpuNumThreads);
-    for (int i = 0; i < threads.size(); i++)
-        threads[i] = std::thread(reduceBasic, i, cpuNumThreads, N, mNumbers, &mResults[0]);
-
-    for (std::thread& t : threads)
-        t.join();
-
-    std::cout << "Reduction result CPU: " << mResults[0] << "\n" << std::endl;
+    cuda::std::atomic<int>* mResults;
+    cudaMallocManaged((void**)&mResults, 2 * sizeof(cuda::std::atomic<int>));
+    mResults[0] = mResults[1] = 0;

-    launchReduceBasic<<<(N + gpuBlockSize - 1) / gpuBlockSize, gpuBlockSize>>>(N, mNumbers, &mResults[1]);
+    launchReductionCPU<4>(N, mNumbers, &mResults[0]);
+    launchReductionGPU<<<(N + 255) / 256, 256>>>(N, mNumbers, &mResults[1]);
     cudaDeviceSynchronize();

+    std::cout << "Reduction result CPU: " << mResults[0] << "\n" << std::endl;
     std::cout << "Reduction result GPU: " << mResults[1] << "\n" << std::endl;

     cudaFree(mNumbers);
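
The pattern this commit converges on, a single __host__ __device__ routine that both std::thread workers on the CPU (launchReductionCPU) and a CUDA kernel on the GPU (launchReductionGPU) funnel into, with the totals held in managed cuda::std::atomic<int> slots, can be exercised on its own. The sketch below is a minimal, self-contained illustration of that idea and is not part of the sample: the names countUp and countUpKernel are hypothetical, and it assumes a toolchain that ships libcu++ (<cuda/std/atomic>, CUDA 10.2 or newer) and a device with managed-memory support. As in the sample, the host threads are joined before the kernel launches, so host and device never update the counter at the same time.

#include <cstdio>
#include <thread>
#include <vector>
#include <cuda/std/atomic>

// Shared between host and device: every caller adds its own contribution
// (tId + 1) to the counter, the same fetch_add pattern used by reduceAtomic.
__host__ __device__ void countUp(int tId, cuda::std::atomic<int>* counter)
{
    counter->fetch_add(tId + 1);
}

__global__ void countUpKernel(cuda::std::atomic<int>* counter)
{
    countUp(blockIdx.x * blockDim.x + threadIdx.x, counter);
}

int main()
{
    // Managed memory makes the same atomic object visible to CPU and GPU code.
    cuda::std::atomic<int>* counter;
    cudaMallocManaged((void**)&counter, sizeof(cuda::std::atomic<int>));
    *counter = 0; // plain assignment, mirroring mResults[0] = mResults[1] = 0 in the sample

    // Host side: a handful of std::threads call the shared function...
    std::vector<std::thread> threads(4);
    for (int i = 0; i < (int)threads.size(); i++)
        threads[i] = std::thread(countUp, i, counter);
    for (std::thread& t : threads)
        t.join();

    // ...then the device does the same through a kernel launch.
    countUpKernel<<<1, 32>>>(counter);
    cudaDeviceSynchronize();

    // 4 host threads add 1+2+3+4 = 10; 32 GPU threads add 1+...+32 = 528.
    std::printf("Counter: %d (expected %d)\n", counter->load(), 10 + 528);

    cudaFree(counter);
    return 0;
}

Compiled with nvcc (for example, nvcc -std=c++17 sketch.cu), this should print 538; the single atomic serializes every update, which keeps the code short at the cost of contention.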
