@@ -153,23 +153,36 @@ __device__ void checked_signal(
153153 const int v1, const int v2, const int v3, const int v4
154154 )
155155{
156- if (blockIdx .x == 0 ) {
157- register int r1, r2, r3, r4;
158- if (threadIdx .x == 0 ) {
159- // wait for top neighbor to clear bottom signal (indicating ready for new input)
160- do {
161- asm volatile (" ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : " =r" (r1), " =r" (r2), " =r" (r3), " =r" (r4) : " l" (signal1_flag) : " memory" );
162- } while (r1 == v1 && r2 == v2 && r3 == v3 && r4 == v4);
163- // signal to top neighbor my output is ready
164- asm volatile (" st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: " l" (signal1_flag), " r" (v1), " r" (v2), " r" (v3), " r" (v4) : " memory" );
165- } else if (threadIdx .x == 1 ) {
166- // wait for bottom neighbor to clear top signal (indicating ready for new input)
156+ cg::this_grid ().sync ();
157+ bool is_main_thread = (blockIdx .x == 0 && threadIdx .x == 0 ) ? true : false ;
158+ if (is_main_thread) {
159+ // flush all writes to global memory
160+ __threadfence_system ();
161+ // wait for top or bottom neighbor to clear signal
162+ register int r1, r2, r3, r4;
163+ bool top_zeroed=false , btm_zeroed=false , top_done=false , btm_done=false ;
164+ do {
167165 do {
168- asm volatile (" ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : " =r" (r1), " =r" (r2), " =r" (r3), " =r" (r4) : " l" (signal2_flag) : " memory" );
169- } while (r1 == v1 && r2 == v2 && r3 == v3 && r4 == v4);
170- // signal to bottom neighbor my output is ready
171- asm volatile (" st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: " l" (signal2_flag), " r" (v1), " r" (v2), " r" (v3), " r" (v4) : " memory" );
172- }
166+ if (!top_zeroed) {
167+ asm volatile (" ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : " =r" (r1), " =r" (r2), " =r" (r3), " =r" (r4) : " l" (signal1_flag) : " memory" );
168+ if (r1 != v1 || r2 != v2 || r3 != v3 || r4 != v4) top_zeroed = true ;
169+ }
170+ if (!btm_zeroed) {
171+ asm volatile (" ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : " =r" (r1), " =r" (r2), " =r" (r3), " =r" (r4) : " l" (signal2_flag) : " memory" );
172+ if (r1 != v1 || r2 != v2 || r3 != v3 || r4 != v4) btm_zeroed = true ;
173+ }
174+ } while ((top_zeroed == top_done) && (btm_zeroed == btm_done));
175+ if (!top_done && top_zeroed) {
176+ // signal to top neighbor my output is ready
177+ asm volatile (" st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: " l" (signal1_flag), " r" (v1), " r" (v2), " r" (v3), " r" (v4) : " memory" );
178+ top_done = true ;
179+ }
180+ if (!btm_done && btm_zeroed) {
181+ // signal to bottom neighbor my output is ready
182+ asm volatile (" st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: " l" (signal2_flag), " r" (v1), " r" (v2), " r" (v3), " r" (v4) : " memory" );
183+ btm_done = true ;
184+ }
185+ } while (!top_done || !btm_done);
173186 }
174187}
175188
0 commit comments