Merge pull request PacktPublishing#9 from techkang/patch-1

haanjack · web-flow · commit 1922de3f1b41 · 2020-08-03T17:11:14.000+09:00
Thanks for pointing out the bug.
Fixed the index calculation so that tiles of B are loaded with proper vertical block-wise indexing.
diff --git a/Chapter07/07_parallel_programming_pattern/01_sgemm_optimization/sgemm.cu b/Chapter07/07_parallel_programming_pattern/01_sgemm_optimization/sgemm.cu
@@ -44,13 +44,15 @@ __global__ void sgemm_kernel_v2(const float *A, const float *B, float *C, int M,
     for (int k = 0; k < K; k += BLOCK_DIM)
     {
         s_tile_A[tid_y][tid_x] = A[ (bid_y + tid_y) * K + tid_x + k ]; // Get sub-matrix from A
-        s_tile_B[tid_y][tid_x] = B[ k * N + bid_x + tid_x ]; // Get sub-matrix from B
+        s_tile_B[tid_y][tid_x] = B[ (k*BLOCK_DIM + tid_y) * N + bid_x + tid_x ]; // Get sub-matrix from B
 
         __syncthreads();
 
         // compute gemm operation with tiles
         for (int e = 0; e < BLOCK_DIM; e++)
             element_c += s_tile_A[tid_y][e] * s_tile_B[e][tid_x];
+	    
+	__syncthreads();
     }
 
     C[(bid_y + tid_y) * N + (bid_x + tid_x)] = \
@@ -59,9 +61,9 @@ __global__ void sgemm_kernel_v2(const float *A, const float *B, float *C, int M,
 
 void sgemm_gold(const float *A, const float *B, float *C, int M, int N, int K, float alpha, float beta)
 {
-    float element_c = 0.f;
     for (int row = 0; row < M; row++) {
         for (int col = 0; col < N; col++) {
+	    float element_c = 0.f;
             for (int e = 0; e < K; e++) {
                 element_c += A[row * K + e] * B[e * N + col];
 	        }
@@ -73,7 +75,7 @@ void sgemm_gold(const float *A, const float *B, float *C, int M, int N, int K, f
 void random_init(float *data, int length)
 {
     for (int i = 0; i < length; i++) {
-        data[i] = (rand() & 0xFF) / (float)RAND_MAX;
+        data[i] = (rand() & 0xFFFF) / (float)RAND_MAX;
     }
 }
 
@@ -167,4 +169,4 @@ int main(int c, char *argv[])
     free(C_gpu);
 
     return 0;
-}
+}

Original file line number	Diff line number	Diff line change
`@@ -44,13 +44,15 @@ __global__ void sgemm_kernel_v2(const float *A, const float *B, float *C, int M,`
`44`	`44`	`for (int k = 0; k < K; k += BLOCK_DIM)`
`45`	`45`	`{`
`46`	`46`	`s_tile_A[tid_y][tid_x] = A[ (bid_y + tid_y) * K + tid_x + k ]; // Get sub-matrix from A`
`47`		`- s_tile_B[tid_y][tid_x] = B[ k * N + bid_x + tid_x ]; // Get sub-matrix from B`
	`47`	`+ s_tile_B[tid_y][tid_x] = B[ (k*BLOCK_DIM + tid_y) * N + bid_x + tid_x ]; // Get sub-matrix from B`
`48`	`48`
`49`	`49`	`__syncthreads();`
`50`	`50`
`51`	`51`	`// compute gemm operation with tiles`
`52`	`52`	`for (int e = 0; e < BLOCK_DIM; e++)`
`53`	`53`	`element_c += s_tile_A[tid_y][e] * s_tile_B[e][tid_x];`
	`54`	`+`
	`55`	`+ __syncthreads();`
`54`	`56`	`}`
`55`	`57`
`56`	`58`	`C[(bid_y + tid_y) * N + (bid_x + tid_x)] = \`
`@@ -59,9 +61,9 @@ __global__ void sgemm_kernel_v2(const float *A, const float *B, float *C, int M,`
`59`	`61`
`60`	`62`	`void sgemm_gold(const float *A, const float *B, float *C, int M, int N, int K, float alpha, float beta)`
`61`	`63`	`{`
`62`		`- float element_c = 0.f;`
`63`	`64`	`for (int row = 0; row < M; row++) {`
`64`	`65`	`for (int col = 0; col < N; col++) {`
	`66`	`+ float element_c = 0.f;`
`65`	`67`	`for (int e = 0; e < K; e++) {`
`66`	`68`	`element_c += A[row * K + e] * B[e * N + col];`
`67`	`69`	`}`
`@@ -73,7 +75,7 @@ void sgemm_gold(const float *A, const float *B, float *C, int M, int N, int K, f`
`73`	`75`	`void random_init(float *data, int length)`
`74`	`76`	`{`
`75`	`77`	`for (int i = 0; i < length; i++) {`
`76`		`- data[i] = (rand() & 0xFF) / (float)RAND_MAX;`
	`78`	`+ data[i] = (rand() & 0xFFFF) / (float)RAND_MAX;`
`77`	`79`	`}`
`78`	`80`	`}`
`79`	`81`
`@@ -167,4 +169,4 @@ int main(int c, char *argv[])`
`167`	`169`	`free(C_gpu);`
`168`	`170`
`169`	`171`	`return 0;`
`170`		`-}`
	`172`	`+}`