8 changes: 4 additions & 4 deletions 06_MemoryBasics/src/main.cu
Author comment:

Sorry, the line 76 modification, cudaMemcpyToSymbol(&cFoo, &bar, sizeof(int));, may be wrong.
Please ignore that line.

@@ -23,14 +23,14 @@ __global__ void WriteGlobalMemory(int* __restrict dOutPtr)
*dOutPtr = dFoo * dFoo;
}

-__device__ void WriteAndPrintSharedMemory(int* sFoo)
+__device__ void WriteAndPrintSharedMemory(int* __restrict sFoo)
{
// Write a computed result to shared memory for other threads to see
sFoo[threadIdx.x] = 42 * (threadIdx.x + 1);
// We make sure that no thread prints while the other still writes (parallelism!)
__syncwarp();
// Print own computed result and result by neighbor
-printf("ThreadID: %d, sFoo[0]: %d, sFoo[1]: %d\n", threadIdx.x, sFoo[0], sFoo[1]);
+printf("ThreadID: %d, sFoo[%d]: %d \n", threadIdx.x, threadIdx.x, sFoo[threadIdx.x]);
}

__global__ void WriteAndPrintSharedMemoryFixed()
@@ -73,7 +73,7 @@ int main()
GPU memory. Can be updated with cudaMemcpyToSymbol.
This syntax is unusual, but this is how it should be
*/
-cudaMemcpyToSymbol(cFoo, &bar, sizeof(int));
+cudaMemcpyToSymbol(&cFoo, &bar, sizeof(int));
ReadConstantMemory<<<1, 1>>>();
cudaDeviceSynchronize();
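
Side note on the hunk above: as the author's own comment says, changing cudaMemcpyToSymbol(cFoo, ...) to cudaMemcpyToSymbol(&cFoo, ...) looks wrong. The templated host overload cudaMemcpyToSymbol(const T& symbol, ...) expects the device symbol itself; taking the address of a __constant__ variable on the host does not name a valid symbol. A minimal sketch of the original, working usage (the kernel body and the value 42 are assumptions; only cFoo and bar match the sample):

```cuda
#include <cstdio>

// Device-side constant memory symbol, written from the host.
__constant__ int cFoo;

__global__ void ReadConstantMemory()
{
    printf("cFoo = %d\n", cFoo);
}

int main()
{
    int bar = 42;
    // Pass the symbol itself, not its address: cFoo names a device
    // object, so &cFoo taken on the host is not a valid symbol.
    cudaMemcpyToSymbol(cFoo, &bar, sizeof(int));
    ReadConstantMemory<<<1, 1>>>();
    cudaDeviceSynchronize();
    return 0;
}
```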

@@ -140,4 +140,4 @@ a syncwarp, so that other threads may fail to see it. You might need a block
size larger than 32 threads for this to happen and you may have to let the writing
thread do some "fake" work to delay its write to shared memory. Or it may work
immediately :) A solution should be provided by the following code sample.
-*/
+*/
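
The exercise above asks what can happen when a shared-memory write is not followed by a syncwarp. A minimal sketch of one way to provoke the effect (kernel and variable names are hypothetical, not the course's solution): one thread writes, every thread reads, and with the synchronization commented out, threads outside the writer's warp may read a stale value.

```cuda
#include <cstdio>

__global__ void WriteWithoutSync()
{
    __shared__ int sFoo;
    if (threadIdx.x == 0)
    {
        // Some "fake" work to delay the write, as the exercise suggests.
        int v = 42;
        for (int i = 0; i < 1000; i++)
            v = (v * 7) % 1000 + 42;
        sFoo = v;
    }
    // __syncthreads();  // without this, the write may not be visible yet
    printf("ThreadID: %d, sFoo: %d\n", threadIdx.x, sFoo);
}

int main()
{
    // More than one warp (64 > 32), so some threads run in a different
    // warp than the writer and may observe uninitialized shared memory.
    WriteWithoutSync<<<1, 64>>>();
    cudaDeviceSynchronize();
    return 0;
}
```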
6 changes: 3 additions & 3 deletions 08_Reductions/src/main.cu
@@ -201,8 +201,8 @@ __global__ void reduceFinal(const float* __restrict input, int N)

__shared__ float data[BLOCK_SIZE];
// Already combine two values upon load from global memory.
-data[threadIdx.x] = id < N / 2 ? input[id] : 0;
-data[threadIdx.x] += id + N/2 < N ? input[id + N / 2] : 0;
+data[threadIdx.x] = id < N ? input[id] : 0;
+data[threadIdx.x] += (id + N < 2*N) ? input[id + N] : 0;

for (int s = blockDim.x / 2; s > 16; s /= 2)
{
@@ -312,4 +312,4 @@ Can you observe any difference in terms of speed / computed results?
2) Do you have any other ideas how the reduction could be improved?
Making it even faster should be quite challenging, but if you have
some suggestions, try them out and see how they affect performance!
-*/
+*/
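
A side note on the reduceFinal hunk: the new guard (id + N < 2*N) simplifies to id < N, the same condition as the line before it, and input[id + N] is only in bounds if the buffer actually holds 2*N elements, so the change appears to reinterpret N as half the input length. For reference, here is a sketch of the original "combine two values on load" idiom over an N-element input; the output step (atomicAdd into out) and the plain tail of the loop are simplifying assumptions, since the sample itself switches to warp-level primitives once s <= 16.

```cuda
#define BLOCK_SIZE 256

__global__ void reduceFinalSketch(const float* __restrict input, float* out, int N)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    __shared__ float data[BLOCK_SIZE];

    // Each thread sums two elements while loading -- input[id] and
    // input[id + N/2] -- halving the number of reduction steps needed.
    data[threadIdx.x] = id < N / 2 ? input[id] : 0;
    data[threadIdx.x] += id + N / 2 < N ? input[id + N / 2] : 0;
    __syncthreads();

    // Tree reduction in shared memory, kept as a plain loop here.
    for (int s = blockDim.x / 2; s > 0; s /= 2)
    {
        if (threadIdx.x < s)
            data[threadIdx.x] += data[threadIdx.x + s];
        __syncthreads();
    }

    if (threadIdx.x == 0)
        atomicAdd(out, data[0]);
}
```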