Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 2d0abbe

Browse files
authored
Merge pull request PacktPublishing#14 from dleunji/master
Fix tiling index and typo
2 parents: a233a86 + 97d38f4; commit: 2d0abbe

File tree

1 file changed

+2
-2
lines changed
  • Chapter07/07_parallel_programming_pattern/01_sgemm_optimization

1 file changed

+2
-2
lines changed

Chapter07/07_parallel_programming_pattern/01_sgemm_optimization/sgemm.cu

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -44,7 +44,7 @@ __global__ void sgemm_kernel_v2(const float *A, const float *B, float *C, int M,
4444
for (int k = 0; k < K; k += BLOCK_DIM)
4545
{
4646
s_tile_A[tid_y][tid_x] = A[ (bid_y + tid_y) * K + tid_x + k ]; // Get sub-matrix from A
47-
s_tile_B[tid_y][tid_x] = B[ (k*BLOCK_DIM + tid_y) * N + bid_x + tid_x ]; // Get sub-matrix from B
47+
s_tile_B[tid_y][tid_x] = B[ (k + tid_y) * N + bid_x + tid_x ]; // Get sub-matrix from B
4848

4949
__syncthreads();
5050

@@ -122,7 +122,7 @@ int main(int c, char *argv[])
122122

123123
// copy initial value for gpu memory
124124
cudaMemcpy(d_A, A, M * K * sizeof(float), cudaMemcpyHostToDevice);
125-
cudaMemcpy(d_B, A, K * N * sizeof(float), cudaMemcpyHostToDevice);
125+
cudaMemcpy(d_B, B, K * N * sizeof(float), cudaMemcpyHostToDevice);
126126

127127
// do operation
128128
dim3 blockDim(BLOCK_DIM, BLOCK_DIM);

0 commit comments

Comments
 (0)