Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 5597903

Browse files
authored
[OpenMP] Cleanup synchronization primitives (#177710)
Summary: These shouldn't be so different after we moved away from variants. It's much simpler to define this in-line with a single preprocessor definition. This should be equivalent, minus a few unnecessary function definitions, with the advantage that SPIR-V now has less work to do.
1 parent df739ba commit 5597903

2 files changed

Lines changed: 43 additions & 63 deletions

File tree

openmp/device/include/Synchronization.h

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -185,12 +185,18 @@ namespace synchronize {
185185
void init(bool IsSPMD);
186186

187187
/// Synchronize all threads in a warp identified by \p Mask.
188-
void warp(LaneMaskTy Mask);
188+
static inline void warp(LaneMaskTy Mask) { __gpu_sync_lane(Mask); }
189189

190190
/// Synchronize all threads in a block and perform a fence before and after the
191191
/// barrier according to \p Ordering. Note that the fence might be part of the
192192
/// barrier.
193-
void threads(atomic::OrderingTy Ordering);
193+
static inline void threads(atomic::OrderingTy Ordering) {
194+
#ifdef __NVPTX__
195+
__nvvm_barrier_sync(8);
196+
#else
197+
__gpu_sync_threads();
198+
#endif
199+
}
194200

195201
/// Synchronizing threads is allowed even if they all hit different instances of
196202
/// `synchronize::threads()`. However, `synchronize::threadsAligned()` is more
@@ -210,16 +216,35 @@ threadsAligned(atomic::OrderingTy Ordering);
210216

211217
} // namespace synchronize
212218

219+
// FIXME: NVPTX does not respect the memory scope argument.
213220
namespace fence {
214221

215222
/// Memory fence with \p Ordering semantics for the team.
216-
void team(atomic::OrderingTy Ordering);
223+
static inline void team(atomic::OrderingTy Ordering) {
224+
#ifdef __NVPTX__
225+
__nvvm_membar_cta();
226+
#else
227+
__scoped_atomic_thread_fence(Ordering, atomic::workgroup);
228+
#endif
229+
}
217230

218231
/// Memory fence with \p Ordering semantics for the contention group.
219-
void kernel(atomic::OrderingTy Ordering);
232+
static inline void kernel(atomic::OrderingTy Ordering) {
233+
#ifdef __NVPTX__
234+
__nvvm_membar_gl();
235+
#else
236+
__scoped_atomic_thread_fence(Ordering, atomic::device);
237+
#endif
238+
}
220239

221240
/// Memory fence with \p Ordering semantics for the system.
222-
void system(atomic::OrderingTy Ordering);
241+
static inline void system(atomic::OrderingTy Ordering) {
242+
#ifdef __NVPTX__
243+
__nvvm_membar_sys();
244+
#else
245+
__scoped_atomic_thread_fence(Ordering, atomic::system);
246+
#endif
247+
}
223248

224249
} // namespace fence
225250

openmp/device/src/Synchronization.cpp

Lines changed: 13 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ namespace impl {
3333
///{
3434
#ifdef __AMDGPU__
3535

36-
[[clang::loader_uninitialized]] Local<uint32_t> namedBarrierTracker;
36+
[[clang::loader_uninitialized]] static Local<uint32_t> namedBarrierTracker;
3737

3838
void namedBarrierInit() {
3939
// Don't have global ctors, and shared memory is not zero init
@@ -87,34 +87,15 @@ void namedBarrier() {
8787
fence::team(atomic::release);
8888
}
8989

90-
void fenceTeam(atomic::OrderingTy Ordering) {
91-
return __scoped_atomic_thread_fence(Ordering, atomic::workgroup);
92-
}
93-
94-
void fenceKernel(atomic::OrderingTy Ordering) {
95-
return __scoped_atomic_thread_fence(Ordering, atomic::device);
96-
}
97-
98-
void fenceSystem(atomic::OrderingTy Ordering) {
99-
return __scoped_atomic_thread_fence(Ordering, atomic::system);
100-
}
101-
10290
void syncWarp(__kmpc_impl_lanemask_t) {
10391
// This is a no-op on current AMDGPU hardware but it is used by the optimizer
10492
// to enforce convergent behaviour between control flow graphs.
10593
__builtin_amdgcn_wave_barrier();
10694
}
10795

108-
void syncThreads(atomic::OrderingTy Ordering) {
109-
if (Ordering != atomic::relaxed)
110-
fenceTeam(Ordering == atomic::acq_rel ? atomic::release : atomic::seq_cst);
111-
112-
__builtin_amdgcn_s_barrier();
113-
114-
if (Ordering != atomic::relaxed)
115-
fenceTeam(Ordering == atomic::acq_rel ? atomic::acquire : atomic::seq_cst);
96+
void syncThreadsAligned(atomic::OrderingTy Ordering) {
97+
synchronize::threads(Ordering);
11698
}
117-
void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); }
11899

119100
// TODO: Don't have wavefront lane locks. Possibly can't have them.
120101
void unsetLock(omp_lock_t *) { __builtin_trap(); }
@@ -127,18 +108,19 @@ constexpr uint32_t UNSET = 0;
127108
constexpr uint32_t SET = 1;
128109

129110
void unsetCriticalLock(omp_lock_t *Lock) {
130-
(void)atomicExchange((uint32_t *)Lock, UNSET, atomic::acq_rel);
111+
[[maybe_unused]] uint32_t before =
112+
atomicExchange((uint32_t *)Lock, UNSET, atomic::acq_rel);
131113
}
132114

133115
void setCriticalLock(omp_lock_t *Lock) {
134116
uint64_t LowestActiveThread = utils::ffs(mapping::activemask()) - 1;
135117
if (mapping::getThreadIdInWarp() == LowestActiveThread) {
136-
fenceKernel(atomic::release);
118+
fence::kernel(atomic::release);
137119
while (
138120
!cas((uint32_t *)Lock, UNSET, SET, atomic::relaxed, atomic::relaxed)) {
139121
__builtin_amdgcn_s_sleep(32);
140122
}
141-
fenceKernel(atomic::acquire);
123+
fence::kernel(atomic::acquire);
142124
}
143125
}
144126

@@ -162,34 +144,19 @@ void namedBarrier() {
162144
__nvvm_barrier_sync_cnt(BarrierNo, NumThreads);
163145
}
164146

165-
void fenceTeam(atomic::OrderingTy) { __nvvm_membar_cta(); }
166-
167-
void fenceKernel(atomic::OrderingTy) { __nvvm_membar_gl(); }
168-
169-
void fenceSystem(atomic::OrderingTy) { __nvvm_membar_sys(); }
170-
171-
void syncWarp(__kmpc_impl_lanemask_t Mask) { __nvvm_bar_warp_sync(Mask); }
172-
173-
void syncThreads(atomic::OrderingTy Ordering) {
174-
constexpr int BarrierNo = 8;
175-
__nvvm_barrier_sync(BarrierNo);
176-
}
177-
178147
void syncThreadsAligned(atomic::OrderingTy Ordering) { __syncthreads(); }
179148

180149
constexpr uint32_t OMP_SPIN = 1000;
181150
constexpr uint32_t UNSET = 0;
182151
constexpr uint32_t SET = 1;
183152

184-
// TODO: This seems to hide a bug in the declare variant handling. If it is
185-
// called before it is defined
186-
// here the overload won't happen. Investigate lalter!
187153
void unsetLock(omp_lock_t *Lock) {
188-
(void)atomicExchange((uint32_t *)Lock, UNSET, atomic::seq_cst);
154+
[[maybe_unused]] uint32_t before = atomicExchange(
155+
reinterpret_cast<uint32_t *>(Lock), UNSET, atomic::seq_cst);
189156
}
190157

191158
int testLock(omp_lock_t *Lock) {
192-
return atomic::add((uint32_t *)Lock, 0u, atomic::seq_cst);
159+
return atomic::add(reinterpret_cast<uint32_t *>(Lock), 0u, atomic::seq_cst);
193160
}
194161

195162
void initLock(omp_lock_t *Lock) { unsetLock(Lock); }
@@ -198,8 +165,8 @@ void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); }
198165

199166
void setLock(omp_lock_t *Lock) {
200167
// TODO: not sure spinning is a good idea here..
201-
while (atomic::cas((uint32_t *)Lock, UNSET, SET, atomic::seq_cst,
202-
atomic::seq_cst) != UNSET) {
168+
while (atomic::cas(reinterpret_cast<uint32_t *>(Lock), UNSET, SET,
169+
atomic::seq_cst, atomic::seq_cst) != UNSET) {
203170
int32_t start = __nvvm_read_ptx_sreg_clock();
204171
int32_t now;
205172
for (;;) {
@@ -226,22 +193,10 @@ void synchronize::init(bool IsSPMD) {
226193
impl::namedBarrierInit();
227194
}
228195

229-
void synchronize::warp(LaneMaskTy Mask) { impl::syncWarp(Mask); }
230-
231-
void synchronize::threads(atomic::OrderingTy Ordering) {
232-
impl::syncThreads(Ordering);
233-
}
234-
235196
void synchronize::threadsAligned(atomic::OrderingTy Ordering) {
236197
impl::syncThreadsAligned(Ordering);
237198
}
238199

239-
void fence::team(atomic::OrderingTy Ordering) { impl::fenceTeam(Ordering); }
240-
241-
void fence::kernel(atomic::OrderingTy Ordering) { impl::fenceKernel(Ordering); }
242-
243-
void fence::system(atomic::OrderingTy Ordering) { impl::fenceSystem(Ordering); }
244-
245200
void unsetCriticalLock(omp_lock_t *Lock) { impl::unsetLock(Lock); }
246201

247202
void setCriticalLock(omp_lock_t *Lock) { impl::setLock(Lock); }
@@ -328,6 +283,6 @@ void ompx_sync_block_acq_rel() {
328283
impl::syncThreadsAligned(atomic::OrderingTy::acq_rel);
329284
}
330285
void ompx_sync_block_divergent(int Ordering) {
331-
impl::syncThreads(atomic::OrderingTy(Ordering));
286+
synchronize::threads(atomic::OrderingTy(Ordering));
332287
}
333288
} // extern "C"

0 commit comments

Comments (0)