
Commit 2994a78: updated cudaFlow
1 parent 9477b83


54 files changed: +607 / -1070 lines

CMakeLists.txt

Lines changed: 0 additions & 1 deletion
@@ -732,7 +732,6 @@ target_link_libraries(
 )
 target_include_directories(cuda_basics PRIVATE ${TF_3RD_PARTY_DIR}/doctest)
 set(TF_UTEST_CUDA_BASICS ${TF_UTEST_DIR}/cuda/cuda_basics)
-add_test(cuda_basics.builder ${TF_UTEST_CUDA_BASICS} -tc=Builder)
 add_test(cuda_basics.worker_id.1C1G ${TF_UTEST_CUDA_BASICS} -tc=WorkerID.1C1G)
 add_test(cuda_basics.worker_id.1C2G ${TF_UTEST_CUDA_BASICS} -tc=WorkerID.1C2G)
 add_test(cuda_basics.worker_id.1C3G ${TF_UTEST_CUDA_BASICS} -tc=WorkerID.1C3G)

docs/chapter6.html

Lines changed: 3 additions & 3 deletions
@@ -114,7 +114,7 @@ <h1><a class="anchor" id="C6_configure_the_number_of_gpu_workers"></a>
<h1><a class="anchor" id="C6_run_a_cudaflow_on_multiple_gpus"></a>
Run a cudaFlow on Multiple GPUs</h1>
<p>You can run a cudaFlow on multiple GPUs by explicitly associating a cudaFlow or a kernel task with a CUDA device. A CUDA device is an integer number in the range of <code>[0, N)</code> representing the identifier of a GPU, where <code>N</code> is the number of GPUs in a system. The code below creates a cudaFlow that runs on the GPU device 2 through <code>my_stream</code>.</p>
- old line 117 (removed):
<div class="fragment"><div class="line">taskflow.<a class="code" href="classtf_1_1FlowBuilder.html#a796e29175380f70246cf2a5639adc437">emplace</a>([](<a class="code" href="classtf_1_1cudaFlow.html">tf::cudaFlow</a>&amp; cf) {</div><div class="line"> cf.<a class="code" href="classtf_1_1cudaFlow.html#ad8c0664e4dc3748f043eaa31b69c11cc">device</a>(2);</div><div class="line"> cf.<a class="code" href="classtf_1_1cudaFlow.html#a5ccc24918db4d63c50f26b68d17fd452">stream</a>(my_stream); <span class="comment">// by default, a cudaFlow runs on a per-worker stream managed by the executor</span></div><div class="line"> <span class="comment">// adding more cudaTasks below (all tasks are placed on GPU 2 unless specified explicitly)</span></div><div class="line">});</div></div><!-- fragment --><p>You can place a kernel on a device explicitly through the method <a class="el" href="classtf_1_1cudaFlow.html#a4a839dbaa01237a440edfebe8faf4e5b" title="creates a kernel task on a device ">tf::cudaFlow::kernel_on</a> that takes the device identifier in the first argument.</p>
+ new line 117 (added):
<div class="fragment"><div class="line">taskflow.<a class="code" href="classtf_1_1FlowBuilder.html#a796e29175380f70246cf2a5639adc437">emplace</a>([](<a class="code" href="classtf_1_1cudaFlow.html">tf::cudaFlow</a>&amp; cf) {</div><div class="line"> cf.<a class="code" href="classtf_1_1cudaFlow.html#ad8c0664e4dc3748f043eaa31b69c11cc">device</a>(2);</div><div class="line"> cf.stream(my_stream); <span class="comment">// by default, a cudaFlow runs on a per-worker stream managed by the executor</span></div><div class="line"> <span class="comment">// adding more cudaTasks below (all tasks are placed on GPU 2 unless specified explicitly)</span></div><div class="line">});</div></div><!-- fragment --><p>You can place a kernel on a device explicitly through the method <a class="el" href="classtf_1_1cudaFlow.html#a4a839dbaa01237a440edfebe8faf4e5b" title="creates a kernel task on a device ">tf::cudaFlow::kernel_on</a> that takes the device identifier in the first argument.</p>
<div class="fragment"><div class="line"> 1: #include &lt;taskflow/taskflow.hpp&gt;</div><div class="line"> 2: </div><div class="line"> 3: <span class="comment">// saxpy (single-precision A·X Plus Y) kernel</span></div><div class="line"> 4: __global__ <span class="keywordtype">void</span> saxpy(<span class="keywordtype">int</span> n, <span class="keywordtype">int</span> a, <span class="keywordtype">int</span> *x, <span class="keywordtype">int</span> *y, <span class="keywordtype">int</span> *z) {</div><div class="line"> 5: <span class="keywordtype">int</span> i = blockIdx.x*blockDim.x + threadIdx.x;</div><div class="line"> 6: <span class="keywordflow">if</span> (i &lt; n) {</div><div class="line"> 7: z[i] = a*x[i] + y[i];</div><div class="line"> 8: }</div><div class="line"> 9: }</div><div class="line">10:</div><div class="line">11: <span class="keywordtype">int</span> main() {</div><div class="line">12:</div><div class="line">13: <span class="keyword">const</span> <span class="keywordtype">unsigned</span> N = 1&lt;&lt;20;</div><div class="line">14: </div><div class="line">15: <span class="keywordtype">int</span>* dx {<span class="keyword">nullptr</span>};</div><div class="line">16: <span class="keywordtype">int</span>* dy {<span class="keyword">nullptr</span>};</div><div class="line">17: <span class="keywordtype">int</span>* z1 {<span class="keyword">nullptr</span>};</div><div class="line">18: <span class="keywordtype">int</span>* z2 {<span class="keyword">nullptr</span>};</div><div class="line">19: </div><div class="line">20: cudaMallocManaged(&amp;dx, N*<span class="keyword">sizeof</span>(<span class="keywordtype">int</span>)); <span class="comment">// create a unified memory block for x</span></div><div class="line">21: cudaMallocManaged(&amp;dy, N*<span class="keyword">sizeof</span>(<span class="keywordtype">int</span>)); <span class="comment">// create a unified memory block for y</span></div><div class="line">22: cudaMallocManaged(&amp;z1, N*<span class="keyword">sizeof</span>(<span class="keywordtype">int</span>)); <span class="comment">// result of saxpy task 1</span></div><div class="line">23: cudaMallocManaged(&amp;z2, N*<span class="keyword">sizeof</span>(<span class="keywordtype">int</span>)); <span class="comment">// result of saxpy task 2</span></div><div class="line">24: </div><div class="line">25: <span class="keywordflow">for</span>(<span class="keywordtype">unsigned</span> i=0; i&lt;N; ++i) {</div><div class="line">26: dx[i] = 1;</div><div class="line">27: dy[i] = 2;</div><div class="line">28: }</div><div class="line">29:</div><div class="line">30: <a class="code" href="classtf_1_1Taskflow.html">tf::Taskflow</a> taskflow;</div><div class="line">31: <a class="code" href="classtf_1_1Executor.html">tf::Executor</a> executor;</div><div class="line">32: </div><div class="line">33: taskflow.<a class="code" href="classtf_1_1FlowBuilder.html#a796e29175380f70246cf2a5639adc437">emplace</a>([&amp;](<a class="code" href="classtf_1_1cudaFlow.html">tf::cudaFlow</a>&amp; cf){</div><div class="line">34: <span class="comment">// launch the cudaFlow on GPU 0</span></div><div class="line">35: cf.<a class="code" href="classtf_1_1cudaFlow.html#ad8c0664e4dc3748f043eaa31b69c11cc">device</a>(0);</div><div class="line">36:</div><div class="line">37: <span class="comment">// launch the first saxpy kernel on GPU 1</span></div><div class="line">38: cf.<a class="code" href="classtf_1_1cudaFlow.html#a4a839dbaa01237a440edfebe8faf4e5b">kernel_on</a>(1, (N+255)/256, 256, 0, saxpy, N, 2, dx, dy, 
z1);</div><div class="line">39:</div><div class="line">40: <span class="comment">// launch the second saxpy kernel on GPU 3</span></div><div class="line">41: cf.<a class="code" href="classtf_1_1cudaFlow.html#a4a839dbaa01237a440edfebe8faf4e5b">kernel_on</a>(3, (N+255)/256, 256, 0, saxpy, N, 2, dx, dy, z2);</div><div class="line">42: });</div><div class="line">43:</div><div class="line">44: executor.<a class="code" href="classtf_1_1Executor.html#a81f35d5b0a20ac0646447eb80d97c0aa">run</a>(taskflow).wait();</div><div class="line">45:</div><div class="line">46: cudaFree(dx);</div><div class="line">47: cudaFree(dy);</div><div class="line">48: </div><div class="line">49: <span class="comment">// verify the solution; max_error should be zero</span></div><div class="line">50: <span class="keywordtype">int</span> max_error = 0;</div><div class="line">51: <span class="keywordflow">for</span> (<span class="keywordtype">size_t</span> i = 0; i &lt; N; i++) {</div><div class="line">52: max_error = <a class="codeRef" doxygen="/home/tsung-wei/Code/taskflow/doxygen/cppreference-doxygen-web.tag.xml:http://en.cppreference.com/w/" href="http://en.cppreference.com/w/cpp/algorithm/max.html">std::max</a>(max_error, abs(z1[i]-4));</div><div class="line">53: max_error = <a class="codeRef" doxygen="/home/tsung-wei/Code/taskflow/doxygen/cppreference-doxygen-web.tag.xml:http://en.cppreference.com/w/" href="http://en.cppreference.com/w/cpp/algorithm/max.html">std::max</a>(max_error, abs(z2[i]-4));</div><div class="line">54: }</div><div class="line">55: <a class="codeRef" doxygen="/home/tsung-wei/Code/taskflow/doxygen/cppreference-doxygen-web.tag.xml:http://en.cppreference.com/w/" href="http://en.cppreference.com/w/cpp/io/basic_ostream.html">std::cout</a> &lt;&lt; <span class="stringliteral">&quot;saxpy finished with max error: &quot;</span> &lt;&lt; max_error &lt;&lt; <span class="charliteral">&#39;\n&#39;</span>;</div><div class="line">56: }</div></div><!-- fragment --><p>Debrief:</p>
<ul>
<li>Lines 3-9 define a CUDA saxpy kernel that stores the result to z </li>
@@ -136,8 +136,8 @@ <h1><a class="anchor" id="C6_run_a_cudaflow_on_multiple_gpus"></a>
<div class="fragment"><div class="line">taskflow.<a class="code" href="classtf_1_1FlowBuilder.html#a796e29175380f70246cf2a5639adc437">emplace</a>([&amp;](<a class="code" href="classtf_1_1cudaFlow.html">tf::cudaFlow</a>&amp; cf){</div><div class="line"> cf.<a class="code" href="classtf_1_1cudaFlow.html#aee1fa4aff12a41737ea585fa2e106a35">fill</a>(target, 1234, count);</div><div class="line">});</div></div><!-- fragment --><p>Similar concept applies to <a class="el" href="classtf_1_1cudaFlow.html#ad37637606f0643f360e9eda1f9a6e559" title="creates a memcpy task ">cudaFlow::memcpy</a> and <a class="el" href="classtf_1_1cudaFlow.html#af03e04771b655f9e629eb4c22e19b19f" title="creates a copy task ">cudaFlow::copy</a> as well.</p>
<div class="fragment"><div class="line">taskflow.<a class="code" href="classtf_1_1FlowBuilder.html#a796e29175380f70246cf2a5639adc437">emplace</a>([&amp;](<a class="code" href="classtf_1_1cudaFlow.html">tf::cudaFlow</a>&amp; cf){</div><div class="line"> <a class="code" href="classtf_1_1cudaTask.html">tf::cudaTask</a> memcpy_target = cf.<a class="code" href="classtf_1_1cudaFlow.html#ad37637606f0643f360e9eda1f9a6e559">memcpy</a>(target, source, <span class="keyword">sizeof</span>(<span class="keywordtype">int</span>) * count);</div><div class="line"> <a class="code" href="classtf_1_1cudaTask.html">tf::cudaTask</a> same_as_above = cf.<a class="code" href="classtf_1_1cudaFlow.html#af03e04771b655f9e629eb4c22e19b19f">copy</a>(target, source, count);</div><div class="line">});</div></div><!-- fragment --><h1><a class="anchor" id="C6_LaunchcudaFlowRepeatedly"></a>
Iterate a cudaFlow</h1>
- old line 139 (removed):
<p>You can create a cudaFlow once and launch it multiple times using <a class="el" href="classtf_1_1cudaFlow.html#a1eeebb4bbd6436a3145ff950ce282ac4" title="repeats the execution of the cudaFlow by n times ">cudaFlow::repeat</a> or <a class="el" href="classtf_1_1cudaFlow.html#adbd46a1ef9f5ae9e0848ccbefa1e65ee" title="assigns a predicate to loop the cudaFlow until the predicate is satisfied ">cudaFlow::predicate</a>, given that the graph parameters remain <em>unchanged</em> across all iterations.</p>
- old line 140 (removed):
<div class="fragment"><div class="line">taskflow.<a class="code" href="classtf_1_1FlowBuilder.html#a796e29175380f70246cf2a5639adc437">emplace</a>([&amp;] (<a class="code" href="classtf_1_1cudaFlow.html">tf::cudaFlow</a>&amp; cf) {</div><div class="line"> <span class="comment">// construct the GPU task dependency graph ...</span></div><div class="line"> </div><div class="line"> <span class="comment">// launch the cudaFlow 10 times</span></div><div class="line"> cf.<a class="code" href="classtf_1_1cudaFlow.html#a1eeebb4bbd6436a3145ff950ce282ac4">repeat</a>(10);</div><div class="line"></div><div class="line"> <span class="comment">// equivalently</span></div><div class="line"> cf.<a class="code" href="classtf_1_1cudaFlow.html#adbd46a1ef9f5ae9e0848ccbefa1e65ee">predicate</a>([n=10] () <span class="keyword">mutable</span> { <span class="keywordflow">return</span> n-- == 0; });</div><div class="line">});</div></div><!-- fragment --><p>The executor iterate the execution of the cudaFlow until the predicate evaluates to <code>true</code>.</p>
+ new line 139 (added):
<p>You can create a cudaFlow once and launch it multiple times using cudaFlow::repeat or cudaFlow::predicate, given that the graph parameters remain <em>unchanged</em> across all iterations.</p>
+ new line 140 (added):
<div class="fragment"><div class="line">taskflow.<a class="code" href="classtf_1_1FlowBuilder.html#a796e29175380f70246cf2a5639adc437">emplace</a>([&amp;] (<a class="code" href="classtf_1_1cudaFlow.html">tf::cudaFlow</a>&amp; cf) {</div><div class="line"> <span class="comment">// construct the GPU task dependency graph ...</span></div><div class="line"> </div><div class="line"> <span class="comment">// launch the cudaFlow 10 times</span></div><div class="line"> cf.repeat(10);</div><div class="line"></div><div class="line"> <span class="comment">// equivalently</span></div><div class="line"> cf.predicate([n=10] () <span class="keyword">mutable</span> { <span class="keywordflow">return</span> n-- == 0; });</div><div class="line">});</div></div><!-- fragment --><p>The executor iterate the execution of the cudaFlow until the predicate evaluates to <code>true</code>.</p>
<h1><a class="anchor" id="C6_Granularity"></a>
Granularity</h1>
<p>Creating a cudaFlow has certain overhead, which means fined-grained tasking such as one GPU operation per cudaFlow may not give you any performance gain. You should aggregate as many GPU operations as possible in a cudaFlow to launch the entire graph once instead of separate calls. For example, the following code creates the saxpy task graph at a very fine-grained level using one cudaFlow per GPU operation.</p>
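For readability, the multi-GPU saxpy example embedded in documentation line 118 above is reproduced here as plain C++/CUDA with the Doxygen markup stripped. It assumes the cudaFlow API documented at this revision (tf::cudaFlow::device, tf::cudaFlow::kernel_on) and a machine with at least four GPUs; the extra standard includes are added only to keep the snippet self-contained.

#include <taskflow/taskflow.hpp>
#include <algorithm>   // std::max (added for self-containment)
#include <cstdlib>     // abs (added for self-containment)
#include <iostream>    // std::cout (added for self-containment)

// saxpy (single-precision A·X Plus Y) kernel
__global__ void saxpy(int n, int a, int *x, int *y, int *z) {
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) {
    z[i] = a*x[i] + y[i];
  }
}

int main() {

  const unsigned N = 1<<20;

  int* dx {nullptr};
  int* dy {nullptr};
  int* z1 {nullptr};
  int* z2 {nullptr};

  cudaMallocManaged(&dx, N*sizeof(int));  // create a unified memory block for x
  cudaMallocManaged(&dy, N*sizeof(int));  // create a unified memory block for y
  cudaMallocManaged(&z1, N*sizeof(int));  // result of saxpy task 1
  cudaMallocManaged(&z2, N*sizeof(int));  // result of saxpy task 2

  for(unsigned i=0; i<N; ++i) {
    dx[i] = 1;
    dy[i] = 2;
  }

  tf::Taskflow taskflow;
  tf::Executor executor;

  taskflow.emplace([&](tf::cudaFlow& cf){
    // launch the cudaFlow on GPU 0
    cf.device(0);

    // launch the first saxpy kernel on GPU 1
    cf.kernel_on(1, (N+255)/256, 256, 0, saxpy, N, 2, dx, dy, z1);

    // launch the second saxpy kernel on GPU 3
    cf.kernel_on(3, (N+255)/256, 256, 0, saxpy, N, 2, dx, dy, z2);
  });

  executor.run(taskflow).wait();

  cudaFree(dx);
  cudaFree(dy);

  // verify the solution; max_error should be zero
  int max_error = 0;
  for (size_t i = 0; i < N; i++) {
    max_error = std::max(max_error, abs(z1[i]-4));
    max_error = std::max(max_error, abs(z2[i]-4));
  }
  std::cout << "saxpy finished with max error: " << max_error << '\n';
}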
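Likewise, the short fill and memcpy/copy fragments from the second hunk (documentation lines 136 and 137), shown without markup. The names target, source, and count come from the chapter and are assumed to denote a destination pointer, a source pointer, and an element count:

taskflow.emplace([&](tf::cudaFlow& cf){
  // create a fill task that writes the value 1234 over count items at target
  cf.fill(target, 1234, count);
});

taskflow.emplace([&](tf::cudaFlow& cf){
  // byte-wise memcpy of sizeof(int)*count bytes, and the equivalent typed copy of count ints
  tf::cudaTask memcpy_target = cf.memcpy(target, source, sizeof(int) * count);
  tf::cudaTask same_as_above = cf.copy(target, source, count);
});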
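The "Iterate a cudaFlow" fragment (documentation lines 139 and 140), also shown without markup. Note that this commit removes the cross-reference links for cudaFlow::repeat and cudaFlow::predicate, so the snippet reflects the documentation as written at this revision rather than a guaranteed current API:

taskflow.emplace([&] (tf::cudaFlow& cf) {
  // construct the GPU task dependency graph ...

  // launch the cudaFlow 10 times
  cf.repeat(10);

  // equivalently, loop until the predicate returns true
  cf.predicate([n=10] () mutable { return n-- == 0; });
});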
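Finally, a minimal sketch of the granularity advice in the last context line: aggregate several GPU operations into one cudaFlow rather than creating one cudaFlow per operation. This is not the chapter's own fine-grained example (which falls outside this hunk); it is a hypothetical fragment that assumes host buffers hx, hy, hz and device buffers dx, dy, dz of N ints already exist, reuses the saxpy kernel from the listing above, and uses the tf::cudaTask::precede dependency call:

taskflow.emplace([&](tf::cudaFlow& cf) {
  // one cudaFlow holds the whole copy-compute-copy graph, so the executor
  // launches it once instead of paying the cudaFlow setup cost per operation
  tf::cudaTask h2d_x  = cf.copy(dx, hx, N);   // host-to-device copy of x
  tf::cudaTask h2d_y  = cf.copy(dy, hy, N);   // host-to-device copy of y
  tf::cudaTask kernel = cf.kernel_on(0, (N+255)/256, 256, 0, saxpy, N, 2, dx, dy, dz);
  tf::cudaTask d2h_z  = cf.copy(hz, dz, N);   // device-to-host copy of the result
  h2d_x.precede(kernel);    // the kernel waits for both input copies
  h2d_y.precede(kernel);
  kernel.precede(d2h_z);    // the result copy waits for the kernel
});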
