@@ -61,7 +61,7 @@ int main()
6161 In this sample, we demonstrate the very common application of
6262 matrix-matrix multiplication. To illustrate the benefits of
6363 tensor cores, we run it in four different ways:
64- 1) Using tiling and shared memory (CUDA Progrraming Guide)
64+ 1) Using tiling and shared memory (CUDA Programming Guide)
6565 2) Using tiling and tensor cores
6666 3) Using CUBLAS without tensor cores
6767 4) Using CUBLAS with tensor cores
@@ -73,7 +73,11 @@ int main()
7373 */
7474
7575 constexpr unsigned int DIM = 4096 ;
76- std::cout << " Multiplying two " << DIM << " x " << DIM << " matrices on GPU\n " << std::endl;
76+ std::cout << " Timing " << DIM << " x " << DIM << " matrix-matrix multiplication on GPU with 4 different methods:" << std::endl;
77+ std::cout << " 1) Reference (CUDA Programming Guide)" << std::endl;
78+ std::cout << " 2) Tensor cores (naive)" << std::endl;
79+ std::cout << " 3) Pedantic (CUBLAS)" << std::endl;
80+ std::cout << " 4) Tensor cores (CUBLAS)\n " << std::endl;
7781
7882 // To use CUBLAS functions, we initiate a handle once
7983 cublasHandle_t handle;
@@ -114,7 +118,7 @@ int main()
114118 Tensor Cores, and two CUBLAS methods, one with Tensor Cores disabled,
115119 the other enabled.
116120 */
117- enum class METHOD { REF, TENSOR, CUBLAS_NO_TENSOR, CUBLAS};
121+ enum class METHOD { REF = 1 , TENSOR = 2 , CUBLAS_NO_TENSOR = 3 , CUBLAS = 4 };
118122 for (METHOD m : {METHOD::REF, METHOD::TENSOR, METHOD::CUBLAS_NO_TENSOR, METHOD::CUBLAS})
119123 {
120124 // Initialize the output matrix
@@ -162,7 +166,7 @@ int main()
162166 // Synchronize and report run time of each individual technique
163167 float ms;
164168 cudaEventElapsedTime (&ms, start, end);
165- std::cout << ms << " ms\n " << std::endl;
169+ std::cout << ( int )m << " ) " << ms << " ms" << std::endl;
166170 }
167171 // Destroy acquired CUBLAS handle
168172 cublasDestroy (handle);
0 commit comments