@@ -33,7 +33,7 @@ namespace impl {
3333///{
3434#ifdef __AMDGPU__
3535
36- [[clang::loader_uninitialized]] Local<uint32_t > namedBarrierTracker;
36+ [[clang::loader_uninitialized]] static Local<uint32_t > namedBarrierTracker;
3737
3838void namedBarrierInit () {
3939 // Don't have global ctors, and shared memory is not zero init
@@ -87,34 +87,15 @@ void namedBarrier() {
8787 fence::team (atomic::release);
8888}
8989
90- void fenceTeam (atomic::OrderingTy Ordering) {
91- return __scoped_atomic_thread_fence (Ordering, atomic::workgroup);
92- }
93-
94- void fenceKernel (atomic::OrderingTy Ordering) {
95- return __scoped_atomic_thread_fence (Ordering, atomic::device);
96- }
97-
98- void fenceSystem (atomic::OrderingTy Ordering) {
99- return __scoped_atomic_thread_fence (Ordering, atomic::system);
100- }
101-
10290void syncWarp (__kmpc_impl_lanemask_t ) {
10391 // This is a no-op on current AMDGPU hardware but it is used by the optimizer
10492 // to enforce convergent behaviour between control flow graphs.
10593 __builtin_amdgcn_wave_barrier ();
10694}
10795
108- void syncThreads (atomic::OrderingTy Ordering) {
109- if (Ordering != atomic::relaxed)
110- fenceTeam (Ordering == atomic::acq_rel ? atomic::release : atomic::seq_cst);
111-
112- __builtin_amdgcn_s_barrier ();
113-
114- if (Ordering != atomic::relaxed)
115- fenceTeam (Ordering == atomic::acq_rel ? atomic::acquire : atomic::seq_cst);
96+ void syncThreadsAligned (atomic::OrderingTy Ordering) {
97+ synchronize::threads (Ordering);
11698}
117- void syncThreadsAligned (atomic::OrderingTy Ordering) { syncThreads (Ordering); }
11899
119100// TODO: Don't have wavefront lane locks. Possibly can't have them.
120101void unsetLock (omp_lock_t *) { __builtin_trap (); }
@@ -127,18 +108,19 @@ constexpr uint32_t UNSET = 0;
127108constexpr uint32_t SET = 1 ;
128109
129110void unsetCriticalLock (omp_lock_t *Lock) {
130- (void )atomicExchange ((uint32_t *)Lock, UNSET, atomic::acq_rel);
111+ [[maybe_unused]] uint32_t before =
112+ atomicExchange ((uint32_t *)Lock, UNSET, atomic::acq_rel);
131113}
132114
133115void setCriticalLock (omp_lock_t *Lock) {
134116 uint64_t LowestActiveThread = utils::ffs (mapping::activemask ()) - 1 ;
135117 if (mapping::getThreadIdInWarp () == LowestActiveThread) {
136- fenceKernel (atomic::release);
118+ fence::kernel (atomic::release);
137119 while (
138120 !cas ((uint32_t *)Lock, UNSET, SET, atomic::relaxed, atomic::relaxed)) {
139121 __builtin_amdgcn_s_sleep (32 );
140122 }
141- fenceKernel (atomic::acquire);
123+ fence::kernel (atomic::acquire);
142124 }
143125}
144126
@@ -162,34 +144,19 @@ void namedBarrier() {
162144 __nvvm_barrier_sync_cnt (BarrierNo, NumThreads);
163145}
164146
165- void fenceTeam (atomic::OrderingTy) { __nvvm_membar_cta (); }
166-
167- void fenceKernel (atomic::OrderingTy) { __nvvm_membar_gl (); }
168-
169- void fenceSystem (atomic::OrderingTy) { __nvvm_membar_sys (); }
170-
171- void syncWarp (__kmpc_impl_lanemask_t Mask) { __nvvm_bar_warp_sync (Mask); }
172-
173- void syncThreads (atomic::OrderingTy Ordering) {
174- constexpr int BarrierNo = 8 ;
175- __nvvm_barrier_sync (BarrierNo);
176- }
177-
178147void syncThreadsAligned (atomic::OrderingTy Ordering) { __syncthreads (); }
179148
180149constexpr uint32_t OMP_SPIN = 1000 ;
181150constexpr uint32_t UNSET = 0 ;
182151constexpr uint32_t SET = 1 ;
183152
184- // TODO: This seems to hide a bug in the declare variant handling. If it is
185- // called before it is defined
186- // here the overload won't happen. Investigate lalter!
187153void unsetLock (omp_lock_t *Lock) {
188- (void )atomicExchange ((uint32_t *)Lock, UNSET, atomic::seq_cst);
154+ [[maybe_unused]] uint32_t before = atomicExchange (
155+ reinterpret_cast <uint32_t *>(Lock), UNSET, atomic::seq_cst);
189156}
190157
191158int testLock (omp_lock_t *Lock) {
192- return atomic::add (( uint32_t *) Lock, 0u , atomic::seq_cst);
159+ return atomic::add (reinterpret_cast < uint32_t *>( Lock) , 0u , atomic::seq_cst);
193160}
194161
195162void initLock (omp_lock_t *Lock) { unsetLock (Lock); }
@@ -198,8 +165,8 @@ void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); }
198165
199166void setLock (omp_lock_t *Lock) {
200167 // TODO: not sure spinning is a good idea here..
201- while (atomic::cas (( uint32_t *) Lock, UNSET, SET, atomic::seq_cst ,
202- atomic::seq_cst) != UNSET) {
168+ while (atomic::cas (reinterpret_cast < uint32_t *>( Lock) , UNSET, SET,
169+ atomic::seq_cst, atomic::seq_cst ) != UNSET) {
203170 int32_t start = __nvvm_read_ptx_sreg_clock ();
204171 int32_t now;
205172 for (;;) {
@@ -226,22 +193,10 @@ void synchronize::init(bool IsSPMD) {
226193 impl::namedBarrierInit ();
227194}
228195
229- void synchronize::warp (LaneMaskTy Mask) { impl::syncWarp (Mask); }
230-
231- void synchronize::threads (atomic::OrderingTy Ordering) {
232- impl::syncThreads (Ordering);
233- }
234-
235196void synchronize::threadsAligned (atomic::OrderingTy Ordering) {
236197 impl::syncThreadsAligned (Ordering);
237198}
238199
239- void fence::team (atomic::OrderingTy Ordering) { impl::fenceTeam (Ordering); }
240-
241- void fence::kernel (atomic::OrderingTy Ordering) { impl::fenceKernel (Ordering); }
242-
243- void fence::system (atomic::OrderingTy Ordering) { impl::fenceSystem (Ordering); }
244-
245200void unsetCriticalLock (omp_lock_t *Lock) { impl::unsetLock (Lock); }
246201
247202void setCriticalLock (omp_lock_t *Lock) { impl::setLock (Lock); }
@@ -328,6 +283,6 @@ void ompx_sync_block_acq_rel() {
328283 impl::syncThreadsAligned (atomic::OrderingTy::acq_rel);
329284}
330285void ompx_sync_block_divergent (int Ordering) {
331- impl::syncThreads (atomic::OrderingTy (Ordering));
286+ synchronize::threads (atomic::OrderingTy (Ordering));
332287}
333288} // extern "C"
0 commit comments