@@ -8,7 +8,6 @@ __global__ void SlowKernel()
88{
99 const long long int start = clock64 ();
1010 while ((clock64 () - start) < 1'000'000'000LL );
11-
1211}
1312
1413__device__ int dFoo;
@@ -32,8 +31,8 @@ int main()
3231 Using events to measure time and communicate across streams.
3332
3433 Expected output:
35- 1) Longer duration measured with chrono, since it includes
36- CPU-side work, which is not captured by CUDA events.
34+ 1) Unrealistically short time with chrono measurements without syncing,
35+ similar times for chrono with syncing and when using CUDA events.
3736 2) foo: 42
3837 */
3938 using namespace std ::chrono_literals;
@@ -50,34 +49,45 @@ int main()
5049
5150 // Record start directly before first relevant GPU command
5251 cudaEventRecord (start);
53- // Launch a light-weight GPU kernel
54- SetFoo<<<1 ,1 >>> (1 );
55- // Simulate some heavy CPU work
56- std::this_thread::sleep_for (2s);
57- // Launch another light-weight GPU kernel
52+ // Launch a light-weight GPU kernel and heavy GPU kernel
5853 SetFoo<<<1 ,1 >>> (0 );
54+ SlowKernel<<<1 ,1 >>> ();
5955 // Record end directly after last relevant GPU command
6056 cudaEventRecord (end);
61- /*
62- Synchronize, for two different reasons: first, to get
63- an adequate time measurement on the CPU after the GPU
64- has finished running. Second, to make sure the end
65- event has taken place when we call cudaEventElapsedTime
66- below. If we only needed to wait for the event, we
67- could use cudaEventSynchronize instead.
68- */
57+ // Also measure CPU time after last GPU command, without syncing
58+ auto afterNoSync = std::chrono::system_clock::now ();
59+
60+ // Synchronize CPU and GPU
6961 cudaDeviceSynchronize ();
70- auto after = std::chrono::system_clock::now ();
62+ // Measure CPU time after last GPU command, with syncing
63+ auto afterSync = std::chrono::system_clock::now ();
64+
65+ // Print measured CPU time without synchronization
66+ float msCPUNoSync = 1000 .f * duration_cast<duration<float >>(afterNoSync - before).count ();
67+ std::cout << " Measured time (chrono, no sync): " << msCPUNoSync << " ms\n " ;
68+
69+ // Print measured CPU time with synchronization
70+ float msCPUSync = 1000 .f * duration_cast<duration<float >>(afterSync - before).count ();
71+ std::cout << " Measured time (chrono, sync): " << msCPUSync << " ms\n " ;
7172
72- // Print measured CPU time - includes work done by the CPU inbetween
73- float msCPU = 1000 .f * duration_cast<duration<float >>(after - before).count ();
74- std::cout << " Measured time (chrono): " << msCPU << " ms\n " ;
75-
7673 // Print measured GPU time - duration of GPU tasks only
7774 float msGPU;
7875 cudaEventElapsedTime (&msGPU, start, end);
7976 std::cout << " Measured time (CUDA events): " << msGPU << " ms\n " ;
8077
78+ /*
79+ The difference between the two methods, CPU timing and events, is
80+ important when writing more complex projects: kernels are being
81+ launched asynchronously. The launch returns immediately so the CPU
82+ can progress with other jobs. This means that to get a proper timing,
83+ we always have to synchronize CPU and GPU before measuring current time
84+ with chrono. With CUDA events, we can insert them into streams before
85+ and after the actions we want to measure. We can have multiple events
86+ inserted at many different points. We still have to synchronize, but
87+ only when we eventually want to ACCESS the measurements on the CPU
88+ (e.g., once for all timings at the end of a frame to get a report).
89+ */
90+
8191 // Clean up events
8292 cudaEventDestroy (start);
8393 cudaEventDestroy (end);
0 commit comments