Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 1922de3

Browse files
authored
Merge pull request PacktPublishing#9 from techkang/patch-1
Thanks for pointing out the bug. Fixed some of the indexing code to use proper vertical block-wise indexing.
2 parents cee5610 + fe09ee0 commit 1922de3

File tree

1 file changed

+6
-4
lines changed
  • Chapter07/07_parallel_programming_pattern/01_sgemm_optimization

1 file changed

+6
-4
lines changed

Chapter07/07_parallel_programming_pattern/01_sgemm_optimization/sgemm.cu

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,13 +44,15 @@ __global__ void sgemm_kernel_v2(const float *A, const float *B, float *C, int M,
4444
for (int k = 0; k < K; k += BLOCK_DIM)
4545
{
4646
s_tile_A[tid_y][tid_x] = A[ (bid_y + tid_y) * K + tid_x + k ]; // Get sub-matrix from A
47-
s_tile_B[tid_y][tid_x] = B[ k * N + bid_x + tid_x ]; // Get sub-matrix from B
47+
s_tile_B[tid_y][tid_x] = B[ (k*BLOCK_DIM + tid_y) * N + bid_x + tid_x ]; // Get sub-matrix from B
4848

4949
__syncthreads();
5050

5151
// compute gemm operation with tiles
5252
for (int e = 0; e < BLOCK_DIM; e++)
5353
element_c += s_tile_A[tid_y][e] * s_tile_B[e][tid_x];
54+
55+
__syncthreads();
5456
}
5557

5658
C[(bid_y + tid_y) * N + (bid_x + tid_x)] = \
@@ -59,9 +61,9 @@ __global__ void sgemm_kernel_v2(const float *A, const float *B, float *C, int M,
5961

6062
void sgemm_gold(const float *A, const float *B, float *C, int M, int N, int K, float alpha, float beta)
6163
{
62-
float element_c = 0.f;
6364
for (int row = 0; row < M; row++) {
6465
for (int col = 0; col < N; col++) {
66+
float element_c = 0.f;
6567
for (int e = 0; e < K; e++) {
6668
element_c += A[row * K + e] * B[e * N + col];
6769
}
@@ -73,7 +75,7 @@ void sgemm_gold(const float *A, const float *B, float *C, int M, int N, int K, f
7375
void random_init(float *data, int length)
7476
{
7577
for (int i = 0; i < length; i++) {
76-
data[i] = (rand() & 0xFF) / (float)RAND_MAX;
78+
data[i] = (rand() & 0xFFFF) / (float)RAND_MAX;
7779
}
7880
}
7981

@@ -167,4 +169,4 @@ int main(int c, char *argv[])
167169
free(C_gpu);
168170

169171
return 0;
170-
}
172+
}

0 commit comments

Comments
 (0)