<div class="fragment"><div class="line"> 1: #include <taskflow/taskflow.hpp></div>
<div class="line"> 2: </div>
<div class="line"> 3: <span class="comment">// saxpy (single-precision A·X Plus Y) kernel</span></div>
<div class="line"> 4: __global__ <span class="keywordtype">void</span> saxpy(<span class="keywordtype">int</span> n, <span class="keywordtype">int</span> a, <span class="keywordtype">int</span> *x, <span class="keywordtype">int</span> *y, <span class="keywordtype">int</span> *z) {</div>
<div class="line"> 5: <span class="keywordtype">int</span> i = blockIdx.x*blockDim.x + threadIdx.x;</div>
<div class="line"> 6: <span class="keywordflow">if</span> (i < n) {</div>
<div class="line"> 7: z[i] = a*x[i] + y[i];</div>
<div class="line"> 8: }</div>
<div class="line"> 9: }</div>
<div class="line">10:</div>
<div class="line">11: <span class="keywordtype">int</span> main() {</div>
<div class="line">12:</div>
<div class="line">13: <span class="keyword">const</span> <span class="keywordtype">unsigned</span> N = 1<<20;</div>
<div class="line">14: </div>
<div class="line">15: <span class="keywordtype">int</span>* dx {<span class="keyword">nullptr</span>};</div>
<div class="line">16: <span class="keywordtype">int</span>* dy {<span class="keyword">nullptr</span>};</div>
<div class="line">17: <span class="keywordtype">int</span>* z1 {<span class="keyword">nullptr</span>};</div>
<div class="line">18: <span class="keywordtype">int</span>* z2 {<span class="keyword">nullptr</span>};</div>
<div class="line">19: </div>
<div class="line">20: cudaMallocManaged(&dx, N*<span class="keyword">sizeof</span>(<span class="keywordtype">int</span>)); <span class="comment">// create a unified memory block for x</span></div>
<div class="line">21: cudaMallocManaged(&dy, N*<span class="keyword">sizeof</span>(<span class="keywordtype">int</span>)); <span class="comment">// create a unified memory block for y</span></div>
<div class="line">22: cudaMallocManaged(&z1, N*<span class="keyword">sizeof</span>(<span class="keywordtype">int</span>)); <span class="comment">// result of saxpy task 1</span></div>
<div class="line">23: cudaMallocManaged(&z2, N*<span class="keyword">sizeof</span>(<span class="keywordtype">int</span>)); <span class="comment">// result of saxpy task 2</span></div>
<div class="line">24: </div>
<div class="line">25: <span class="keywordflow">for</span>(<span class="keywordtype">unsigned</span> i=0; i<N; ++i) {</div>
<div class="line">26: dx[i] = 1;</div>
<div class="line">27: dy[i] = 2;</div>
<div class="line">28: }</div>
<div class="line">29:</div>
<div class="line">30: <a class="code" href="classtf_1_1Taskflow.html">tf::Taskflow</a> taskflow;</div>
<div class="line">31: <a class="code" href="classtf_1_1Executor.html">tf::Executor</a> executor;</div>
<div class="line">32: </div>
<div class="line">33: taskflow.<a class="code" href="classtf_1_1FlowBuilder.html#a796e29175380f70246cf2a5639adc437">emplace</a>([&](<a class="code" href="classtf_1_1cudaFlow.html">tf::cudaFlow</a>& cf){</div>
<div class="line">34: <span class="comment">// launch the cudaFlow on GPU 0</span></div>
<div class="line">35: cf.<a class="code" href="classtf_1_1cudaFlow.html#ad8c0664e4dc3748f043eaa31b69c11cc">device</a>(0);</div>
<div class="line">36:</div>
<div class="line">37: <span class="comment">// launch the first saxpy kernel on GPU 1</span></div>
<div class="line">38: cf.<a class="code" href="classtf_1_1cudaFlow.html#a4a839dbaa01237a440edfebe8faf4e5b">kernel_on</a>(1, (N+255)/256, 256, 0, saxpy, N, 2, dx, dy, z1);</div>
<div class="line">39:</div>
<div class="line">40: <span class="comment">// launch the second saxpy kernel on GPU 3</span></div>
<div class="line">41: cf.<a class="code" href="classtf_1_1cudaFlow.html#a4a839dbaa01237a440edfebe8faf4e5b">kernel_on</a>(3, (N+255)/256, 256, 0, saxpy, N, 2, dx, dy, z2);</div>
<div class="line">42: });</div>
<div class="line">43:</div>
<div class="line">44: executor.<a class="code" href="classtf_1_1Executor.html#a81f35d5b0a20ac0646447eb80d97c0aa">run</a>(taskflow).wait();</div>
<div class="line">45:</div>
<div class="line">46: cudaFree(dx);</div>
<div class="line">47: cudaFree(dy);</div>
<div class="line">48: </div>
<div class="line">49: <span class="comment">// verify the solution; max_error should be zero</span></div>
<div class="line">50: <span class="keywordtype">int</span> max_error = 0;</div>
<div class="line">51: <span class="keywordflow">for</span> (<span class="keywordtype">size_t</span> i = 0; i < N; i++) {</div>
<div class="line">52: max_error = <a class="codeRef" doxygen="/home/tsung-wei/Code/taskflow/doxygen/cppreference-doxygen-web.tag.xml:http://en.cppreference.com/w/" href="http://en.cppreference.com/w/cpp/algorithm/max.html">std::max</a>(max_error, abs(z1[i]-4));</div>
<div class="line">53: max_error = <a class="codeRef" doxygen="/home/tsung-wei/Code/taskflow/doxygen/cppreference-doxygen-web.tag.xml:http://en.cppreference.com/w/" href="http://en.cppreference.com/w/cpp/algorithm/max.html">std::max</a>(max_error, abs(z2[i]-4));</div>
<div class="line">54: }</div>
<div class="line">55: <a class="codeRef" doxygen="/home/tsung-wei/Code/taskflow/doxygen/cppreference-doxygen-web.tag.xml:http://en.cppreference.com/w/" href="http://en.cppreference.com/w/cpp/io/basic_ostream.html">std::cout</a> << <span class="stringliteral">"saxpy finished with max error: "</span> << max_error << <span class="charliteral">'\n'</span>;</div>
<div class="line">56:</div>
<div class="line">57: <span class="comment">// release the result blocks only after the verification loop has read them</span></div>
<div class="line">58: cudaFree(z1);</div>
<div class="line">59: cudaFree(z2);</div>
<div class="line">60: }</div>
</div><!-- fragment --><p>Debrief:</p>
0 commit comments