@@ -44,13 +44,15 @@ __global__ void sgemm_kernel_v2(const float *A, const float *B, float *C, int M,
4444 for (int k = 0 ; k < K; k += BLOCK_DIM)
4545 {
4646 s_tile_A[tid_y][tid_x] = A[ (bid_y + tid_y) * K + tid_x + k ]; // Get sub-matrix from A
47- s_tile_B[tid_y][tid_x] = B[ k * N + bid_x + tid_x ]; // Get sub-matrix from B
47+ s_tile_B[tid_y][tid_x] = B[ (k*BLOCK_DIM + tid_y) * N + bid_x + tid_x ]; // Get sub-matrix from B
4848
4949 __syncthreads ();
5050
5151 // compute gemm operation with tiles
5252 for (int e = 0 ; e < BLOCK_DIM; e++)
5353 element_c += s_tile_A[tid_y][e] * s_tile_B[e][tid_x];
54+
55+ __syncthreads ();
5456 }
5557
5658 C[(bid_y + tid_y) * N + (bid_x + tid_x)] = \
@@ -59,9 +61,9 @@ __global__ void sgemm_kernel_v2(const float *A, const float *B, float *C, int M,
5961
6062void sgemm_gold (const float *A, const float *B, float *C, int M, int N, int K, float alpha, float beta)
6163{
62- float element_c = 0 .f ;
6364 for (int row = 0 ; row < M; row++) {
6465 for (int col = 0 ; col < N; col++) {
66+ float element_c = 0 .f ;
6567 for (int e = 0 ; e < K; e++) {
6668 element_c += A[row * K + e] * B[e * N + col];
6769 }
@@ -73,7 +75,7 @@ void sgemm_gold(const float *A, const float *B, float *C, int M, int N, int K, f
/*
 * Fill data[0 .. length-1] with pseudo-random non-negative floats.
 *
 * Each element is the low 16 bits of rand() divided by RAND_MAX, so every
 * value lies in [0, 0xFFFF / RAND_MAX]. The upper bound therefore depends on
 * the platform's RAND_MAX; it is not normalised to [0, 1).
 *
 * data   - destination buffer holding at least `length` floats; a NULL
 *          pointer makes the call a no-op.
 * length - number of elements to write; values <= 0 write nothing.
 *
 * The sequence is whatever rand() produces for the current seed; this
 * function never calls srand() itself.
 */
void random_init(float *data, int length)
{
    /* Nothing to fill: avoid dereferencing a NULL buffer. */
    if (data == NULL || length <= 0) {
        return;
    }

    for (int i = 0; i < length; i++) {
        /* Keep 16 bits of entropy, then scale by RAND_MAX. */
        data[i] = (rand() & 0xFFFF) / (float)RAND_MAX;
    }
}
7981
@@ -167,4 +169,4 @@ int main(int c, char *argv[])
167169 free (C_gpu);
168170
169171 return 0 ;
170- }
172+ }
0 commit comments