Review notes. This is free text ahead of the first file header; patch tools
skip leading text they do not recognise as part of a diff.

What this patch changes

  06_MemoryBasics/src/main.cu
    * WriteAndPrintSharedMemory(): the sFoo parameter is qualified with
      __restrict, and every thread now prints only its own slot,
      sFoo[threadIdx.x]. The comment above the printf is updated in the same
      hunk so that it no longer claims the neighbor's value is printed.
    * Adds the missing newline at end of file.

  08_Reductions/src/main.cu
    * Adds the missing newline at end of file.

What was removed from the earlier revision of this patch, and why

    * main(): cudaMemcpyToSymbol(cFoo, ...) was changed to
      cudaMemcpyToSymbol(&cFoo, ...). The surrounding comment in that file
      already says the symbol is passed as-is ("This syntax is unusual, but
      this is how it should be"). NOTE(review): the CUDA C++ overload takes
      the symbol by reference, so passing &cFoo is expected to fail symbol
      lookup; confirm against the Runtime API reference. The hunk is dropped,
      the call stays cudaMemcpyToSymbol(cFoo, &bar, sizeof(int)).

    * reduceFinal(): the two guarded loads were changed to
          id < N           ? input[id]     : 0
          (id + N < 2*N)   ? input[id + N] : 0
      The second condition is algebraically the same as id < N, so it does not
      bound the index id + N at all. The lines it replaced guard
      input[id + N / 2] with id + N/2 < N, i.e. they treat N as the element
      count; under that meaning input[id + N] is past the end for every id
      that passes the guard. No launch or call-site change accompanied the
      hunk, so it is dropped and the original guards stay in place.

NOTE(review): this patch was rebuilt from a whitespace-collapsed copy. Line
breaks and indentation of context lines are reconstructed; if it does not
apply cleanly, regenerate it with git diff from a tree that has the changes
above. The "index" header lines are omitted because the post-image blob hashes
of the earlier revision no longer describe the result.

diff --git a/06_MemoryBasics/src/main.cu b/06_MemoryBasics/src/main.cu
--- a/06_MemoryBasics/src/main.cu
+++ b/06_MemoryBasics/src/main.cu
@@ -23,14 +23,14 @@ __global__ void WriteGlobalMemory(int* __restrict dOutPtr)
     *dOutPtr = dFoo * dFoo;
 }
 
-__device__ void WriteAndPrintSharedMemory(int* sFoo)
+__device__ void WriteAndPrintSharedMemory(int* __restrict sFoo)
 {
     // Write a computed result to shared memory for other threads to see
     sFoo[threadIdx.x] = 42 * (threadIdx.x + 1);
     // We make sure that no thread prints while the other still writes (parallelism!)
     __syncwarp();
-    // Print own computed result and result by neighbor
-    printf("ThreadID: %d, sFoo[0]: %d, sFoo[1]: %d\n", threadIdx.x, sFoo[0], sFoo[1]);
+    // Print own computed result
+    printf("ThreadID: %d, sFoo[%d]: %d \n", threadIdx.x, threadIdx.x, sFoo[threadIdx.x]);
 }
 
 __global__ void WriteAndPrintSharedMemoryFixed()
@@ -140,4 +140,4 @@ a syncwarp, so that other threads may fail to see it. You might need a block size
 larger than 32 threads for this to happen and you may have to let the writing thread
 do some "fake" work to delay its write to shared memory. Or it may work immediately :)
 A solution should be provided by the following code sample.
-*/
\ No newline at end of file
+*/
diff --git a/08_Reductions/src/main.cu b/08_Reductions/src/main.cu
--- a/08_Reductions/src/main.cu
+++ b/08_Reductions/src/main.cu
@@ -312,4 +312,4 @@ Can you observe any difference in terms of speed / computed results?
 2) Do you have any other ideas how the reduction could be improved?
 Making it even faster should be quite challenging, but if you have some suggestions, try
 them out and see how they affect performance!
-*/
\ No newline at end of file
+*/