Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit b79ba02

Browse files
authored
[AMDGPU][GFX12.5] Reimplement monitor load as an atomic operation (#177343)
Load monitor operations make more sense as atomic operations, because non-atomic operations cannot be used for inter-thread communication without additional synchronization. The previous built-in worked only because one could override the CPol bits directly, but that bypasses the memory model and forces the user to learn the ISA bit encoding. Making load monitor an atomic operation has two advantages. First, its memory model foundation is stronger: it simply relies on the existing rules for atomic operations. Second, the CPol bits are abstracted away from the user, which avoids leaking ISA details into the API. This patch also adds supporting memory model and intrinsics documentation to AMDGPUUsage. Solves SWDEV-516398.
1 parent 2ead49f commit b79ba02

24 files changed

Lines changed: 714 additions & 261 deletions

clang/include/clang/Basic/BuiltinsAMDGPU.td

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -712,12 +712,12 @@ def __builtin_amdgcn_s_cluster_barrier : AMDGPUBuiltin<"void()", [], "gfx1250-in
712712
def __builtin_amdgcn_flat_prefetch : AMDGPUBuiltin<"void(void const address_space<0> *, _Constant int)", [Const], "vmem-pref-insts">;
713713
def __builtin_amdgcn_global_prefetch : AMDGPUBuiltin<"void(void const address_space<1> *, _Constant int)", [Const], "vmem-pref-insts">;
714714

715-
def __builtin_amdgcn_global_load_monitor_b32 : AMDGPUBuiltin<"int(int address_space<1> *, _Constant int)", [Const], "gfx1250-insts">;
716-
def __builtin_amdgcn_global_load_monitor_b64 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int> address_space<1> *, _Constant int)", [Const], "gfx1250-insts">;
717-
def __builtin_amdgcn_global_load_monitor_b128 : AMDGPUBuiltin<"_ExtVector<4, int>(_ExtVector<4, int> address_space<1> *, _Constant int)", [Const], "gfx1250-insts">;
718-
def __builtin_amdgcn_flat_load_monitor_b32 : AMDGPUBuiltin<"int(int address_space<0> *, _Constant int)", [Const], "gfx1250-insts">;
719-
def __builtin_amdgcn_flat_load_monitor_b64 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int> address_space<0> *, _Constant int)", [Const], "gfx1250-insts">;
720-
def __builtin_amdgcn_flat_load_monitor_b128 : AMDGPUBuiltin<"_ExtVector<4, int>(_ExtVector<4, int> address_space<0> *, _Constant int)", [Const], "gfx1250-insts">;
715+
def __builtin_amdgcn_global_load_monitor_b32 : AMDGPUBuiltin<"int(int address_space<1> *, _Constant int, _Constant int)", [Const], "gfx1250-insts">;
716+
def __builtin_amdgcn_global_load_monitor_b64 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int> address_space<1> *, _Constant int, _Constant int)", [Const], "gfx1250-insts">;
717+
def __builtin_amdgcn_global_load_monitor_b128 : AMDGPUBuiltin<"_ExtVector<4, int>(_ExtVector<4, int> address_space<1> *, _Constant int, _Constant int)", [Const], "gfx1250-insts">;
718+
def __builtin_amdgcn_flat_load_monitor_b32 : AMDGPUBuiltin<"int(int address_space<0> *, _Constant int, _Constant int)", [Const], "gfx1250-insts">;
719+
def __builtin_amdgcn_flat_load_monitor_b64 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int> address_space<0> *, _Constant int, _Constant int)", [Const], "gfx1250-insts">;
720+
def __builtin_amdgcn_flat_load_monitor_b128 : AMDGPUBuiltin<"_ExtVector<4, int>(_ExtVector<4, int> address_space<0> *, _Constant int, _Constant int)", [Const], "gfx1250-insts">;
721721
def __builtin_amdgcn_cluster_load_b32 : AMDGPUBuiltin<"int(int address_space<1> *, _Constant int, int)", [Const], "mcast-load-insts,wavefrontsize32">;
722722
def __builtin_amdgcn_cluster_load_b64 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int> address_space<1> *, _Constant int, int)", [Const], "mcast-load-insts,wavefrontsize32">;
723723
def __builtin_amdgcn_cluster_load_b128 : AMDGPUBuiltin<"_ExtVector<4, int>(_ExtVector<4, int> address_space<1> *, _Constant int, int)", [Const], "mcast-load-insts,wavefrontsize32">;

clang/include/clang/Sema/SemaAMDGPU.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,14 @@ class SemaAMDGPU : public SemaBase {
2626

2727
bool CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
2828

29+
/// Emits a diagnostic if the \p E is not an atomic ordering encoded in the C
30+
/// ABI format, or if the atomic ordering is not valid for the operation type
31+
/// as defined by \p MayLoad and \p MayStore. \returns true if a diagnostic
32+
/// was emitted.
33+
bool checkAtomicOrderingCABIArg(Expr *E, bool MayLoad, bool MayStore);
34+
2935
bool checkCoopAtomicFunctionCall(CallExpr *TheCall, bool IsStore);
36+
bool checkAtomicMonitorLoad(CallExpr *TheCall);
3037

3138
bool checkMovDPPFunctionCall(CallExpr *TheCall, unsigned NumArgs,
3239
unsigned NumDataArgs);

clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp

Lines changed: 36 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
#include "CGBuiltin.h"
1414
#include "CodeGenFunction.h"
15+
#include "TargetInfo.h"
1516
#include "clang/Basic/DiagnosticFrontend.h"
1617
#include "clang/Basic/SyncScope.h"
1718
#include "clang/Basic/TargetBuiltins.h"
@@ -21,6 +22,7 @@
2122
#include "llvm/IR/IntrinsicsR600.h"
2223
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
2324
#include "llvm/Support/AMDGPUAddrSpace.h"
25+
#include "llvm/Support/AtomicOrdering.h"
2426

2527
using namespace clang;
2628
using namespace CodeGen;
@@ -272,6 +274,24 @@ static inline StringRef mapScopeToSPIRV(StringRef AMDGCNScope) {
272274
return AMDGCNScope;
273275
}
274276

277+
static llvm::AtomicOrdering mapCABIAtomicOrdering(unsigned AO) {
278+
// Map C11/C++11 memory ordering to LLVM memory ordering
279+
assert(llvm::isValidAtomicOrderingCABI(AO));
280+
switch (static_cast<llvm::AtomicOrderingCABI>(AO)) {
281+
case llvm::AtomicOrderingCABI::acquire:
282+
case llvm::AtomicOrderingCABI::consume:
283+
return llvm::AtomicOrdering::Acquire;
284+
case llvm::AtomicOrderingCABI::release:
285+
return llvm::AtomicOrdering::Release;
286+
case llvm::AtomicOrderingCABI::acq_rel:
287+
return llvm::AtomicOrdering::AcquireRelease;
288+
case llvm::AtomicOrderingCABI::seq_cst:
289+
return llvm::AtomicOrdering::SequentiallyConsistent;
290+
case llvm::AtomicOrderingCABI::relaxed:
291+
return llvm::AtomicOrdering::Monotonic;
292+
}
293+
}
294+
275295
// For processing memory ordering and memory scope arguments of various
276296
// amdgcn builtins.
277297
// \p Order takes a C++11 compatible memory-ordering specifier and converts
@@ -284,25 +304,7 @@ void CodeGenFunction::ProcessOrderScopeAMDGCN(Value *Order, Value *Scope,
284304
int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();
285305

286306
// Map C11/C++11 memory ordering to LLVM memory ordering
287-
assert(llvm::isValidAtomicOrderingCABI(ord));
288-
switch (static_cast<llvm::AtomicOrderingCABI>(ord)) {
289-
case llvm::AtomicOrderingCABI::acquire:
290-
case llvm::AtomicOrderingCABI::consume:
291-
AO = llvm::AtomicOrdering::Acquire;
292-
break;
293-
case llvm::AtomicOrderingCABI::release:
294-
AO = llvm::AtomicOrdering::Release;
295-
break;
296-
case llvm::AtomicOrderingCABI::acq_rel:
297-
AO = llvm::AtomicOrdering::AcquireRelease;
298-
break;
299-
case llvm::AtomicOrderingCABI::seq_cst:
300-
AO = llvm::AtomicOrdering::SequentiallyConsistent;
301-
break;
302-
case llvm::AtomicOrderingCABI::relaxed:
303-
AO = llvm::AtomicOrdering::Monotonic;
304-
break;
305-
}
307+
AO = mapCABIAtomicOrdering(ord);
306308

307309
// Some of the atomic builtins take the scope as a string name.
308310
StringRef scp;
@@ -818,11 +820,24 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
818820
break;
819821
}
820822

823+
LLVMContext &Ctx = CGM.getLLVMContext();
821824
llvm::Type *LoadTy = ConvertType(E->getType());
822825
llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
823-
llvm::Value *Val = EmitScalarExpr(E->getArg(1));
826+
827+
auto *AOExpr = cast<llvm::ConstantInt>(EmitScalarExpr(E->getArg(1)));
828+
auto *ScopeExpr = cast<llvm::ConstantInt>(EmitScalarExpr(E->getArg(2)));
829+
830+
auto Scope = static_cast<SyncScope>(ScopeExpr->getZExtValue());
831+
llvm::AtomicOrdering AO = mapCABIAtomicOrdering(AOExpr->getZExtValue());
832+
833+
StringRef ScopeStr = CGM.getTargetCodeGenInfo().getLLVMSyncScopeStr(
834+
CGM.getLangOpts(), Scope, AO);
835+
836+
llvm::MDNode *MD =
837+
llvm::MDNode::get(Ctx, {llvm::MDString::get(Ctx, ScopeStr)});
838+
llvm::Value *ScopeMD = llvm::MetadataAsValue::get(Ctx, MD);
824839
llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
825-
return Builder.CreateCall(F, {Addr, Val});
840+
return Builder.CreateCall(F, {Addr, AOExpr, ScopeMD});
826841
}
827842
case AMDGPU::BI__builtin_amdgcn_cluster_load_b32:
828843
case AMDGPU::BI__builtin_amdgcn_cluster_load_b64:

clang/lib/CodeGen/TargetInfo.cpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,12 +148,20 @@ LangAS TargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
148148
return D ? D->getType().getAddressSpace() : LangAS::Default;
149149
}
150150

151+
StringRef
152+
TargetCodeGenInfo::getLLVMSyncScopeStr(const LangOptions &LangOpts,
153+
SyncScope Scope,
154+
llvm::AtomicOrdering Ordering) const {
155+
return ""; /* default sync scope */
156+
}
157+
151158
llvm::SyncScope::ID
152159
TargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
153160
SyncScope Scope,
154161
llvm::AtomicOrdering Ordering,
155162
llvm::LLVMContext &Ctx) const {
156-
return Ctx.getOrInsertSyncScopeID(""); /* default sync scope */
163+
return Ctx.getOrInsertSyncScopeID(
164+
getLLVMSyncScopeStr(LangOpts, Scope, Ordering));
157165
}
158166

159167
void TargetCodeGenInfo::addStackProbeTargetAttributes(

clang/lib/CodeGen/TargetInfo.h

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -326,11 +326,16 @@ class TargetCodeGenInfo {
326326
return LangAS::Default;
327327
}
328328

329-
/// Get the syncscope used in LLVM IR.
330-
virtual llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
331-
SyncScope Scope,
332-
llvm::AtomicOrdering Ordering,
333-
llvm::LLVMContext &Ctx) const;
329+
/// Get the syncscope used in LLVM IR as a string
330+
virtual StringRef getLLVMSyncScopeStr(const LangOptions &LangOpts,
331+
SyncScope Scope,
332+
llvm::AtomicOrdering Ordering) const;
333+
334+
/// Get the syncscope used in LLVM IR as a SyncScope ID.
335+
llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
336+
SyncScope Scope,
337+
llvm::AtomicOrdering Ordering,
338+
llvm::LLVMContext &Ctx) const;
334339

335340
/// Allow the target to apply other metadata to an atomic instruction
336341
virtual void setTargetAtomicMetadata(CodeGenFunction &CGF,

clang/lib/CodeGen/Targets/AMDGPU.cpp

Lines changed: 18 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -310,10 +310,8 @@ class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
310310
}
311311
LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
312312
const VarDecl *D) const override;
313-
llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
314-
SyncScope Scope,
315-
llvm::AtomicOrdering Ordering,
316-
llvm::LLVMContext &Ctx) const override;
313+
StringRef getLLVMSyncScopeStr(const LangOptions &LangOpts, SyncScope Scope,
314+
llvm::AtomicOrdering Ordering) const override;
317315
void setTargetAtomicMetadata(CodeGenFunction &CGF,
318316
llvm::Instruction &AtomicInst,
319317
const AtomicExpr *Expr = nullptr) const override;
@@ -493,55 +491,40 @@ AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
493491
return DefaultGlobalAS;
494492
}
495493

496-
llvm::SyncScope::ID
497-
AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
498-
SyncScope Scope,
499-
llvm::AtomicOrdering Ordering,
500-
llvm::LLVMContext &Ctx) const {
501-
std::string Name;
494+
StringRef AMDGPUTargetCodeGenInfo::getLLVMSyncScopeStr(
495+
const LangOptions &LangOpts, SyncScope Scope,
496+
llvm::AtomicOrdering Ordering) const {
497+
498+
// OpenCL assumes by default that atomic scopes are per-address space for
499+
// non-sequentially consistent operations.
500+
bool IsOneAs = (Scope >= SyncScope::OpenCLWorkGroup &&
501+
Scope <= SyncScope::OpenCLSubGroup &&
502+
Ordering != llvm::AtomicOrdering::SequentiallyConsistent);
503+
502504
switch (Scope) {
503505
case SyncScope::HIPSingleThread:
504506
case SyncScope::SingleScope:
505-
Name = "singlethread";
506-
break;
507+
return IsOneAs ? "singlethread-one-as" : "singlethread";
507508
case SyncScope::HIPWavefront:
508509
case SyncScope::OpenCLSubGroup:
509510
case SyncScope::WavefrontScope:
510-
Name = "wavefront";
511-
break;
511+
return IsOneAs ? "wavefront-one-as" : "wavefront";
512512
case SyncScope::HIPCluster:
513513
case SyncScope::ClusterScope:
514-
Name = "cluster";
515-
break;
514+
return IsOneAs ? "cluster-one-as" : "cluster";
516515
case SyncScope::HIPWorkgroup:
517516
case SyncScope::OpenCLWorkGroup:
518517
case SyncScope::WorkgroupScope:
519-
Name = "workgroup";
520-
break;
518+
return IsOneAs ? "workgroup-one-as" : "workgroup";
521519
case SyncScope::HIPAgent:
522520
case SyncScope::OpenCLDevice:
523521
case SyncScope::DeviceScope:
524-
Name = "agent";
525-
break;
522+
return IsOneAs ? "agent-one-as" : "agent";
526523
case SyncScope::SystemScope:
527524
case SyncScope::HIPSystem:
528525
case SyncScope::OpenCLAllSVMDevices:
529-
Name = "";
530-
break;
531-
}
532-
533-
// OpenCL assumes by default that atomic scopes are per-address space for
534-
// non-sequentially consistent operations.
535-
if (Scope >= SyncScope::OpenCLWorkGroup &&
536-
Scope <= SyncScope::OpenCLSubGroup &&
537-
Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
538-
if (!Name.empty())
539-
Name = Twine(Twine(Name) + Twine("-")).str();
540-
541-
Name = Twine(Twine(Name) + Twine("one-as")).str();
526+
return IsOneAs ? "one-as" : "";
542527
}
543-
544-
return Ctx.getOrInsertSyncScopeID(Name);
545528
}
546529

547530
void AMDGPUTargetCodeGenInfo::setTargetAtomicMetadata(

clang/lib/CodeGen/Targets/SPIR.cpp

Lines changed: 28 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -131,42 +131,13 @@ class SPIRVTargetCodeGenInfo : public CommonSPIRTargetCodeGenInfo {
131131
const VarDecl *D) const override;
132132
void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
133133
CodeGen::CodeGenModule &M) const override;
134-
llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
135-
SyncScope Scope,
136-
llvm::AtomicOrdering Ordering,
137-
llvm::LLVMContext &Ctx) const override;
134+
StringRef getLLVMSyncScopeStr(const LangOptions &LangOpts, SyncScope Scope,
135+
llvm::AtomicOrdering Ordering) const override;
138136
bool supportsLibCall() const override {
139137
return getABIInfo().getTarget().getTriple().getVendor() !=
140138
llvm::Triple::AMD;
141139
}
142140
};
143-
144-
inline StringRef mapClangSyncScopeToLLVM(SyncScope Scope) {
145-
switch (Scope) {
146-
case SyncScope::HIPSingleThread:
147-
case SyncScope::SingleScope:
148-
return "singlethread";
149-
case SyncScope::HIPWavefront:
150-
case SyncScope::OpenCLSubGroup:
151-
case SyncScope::WavefrontScope:
152-
return "subgroup";
153-
case SyncScope::HIPCluster:
154-
case SyncScope::ClusterScope:
155-
case SyncScope::HIPWorkgroup:
156-
case SyncScope::OpenCLWorkGroup:
157-
case SyncScope::WorkgroupScope:
158-
return "workgroup";
159-
case SyncScope::HIPAgent:
160-
case SyncScope::OpenCLDevice:
161-
case SyncScope::DeviceScope:
162-
return "device";
163-
case SyncScope::SystemScope:
164-
case SyncScope::HIPSystem:
165-
case SyncScope::OpenCLAllSVMDevices:
166-
return "";
167-
}
168-
return "";
169-
}
170141
} // End anonymous namespace.
171142

172143
void CommonSPIRABIInfo::setCCs() {
@@ -563,11 +534,32 @@ void SPIRVTargetCodeGenInfo::setTargetAttributes(
563534
llvm::MDNode::get(M.getLLVMContext(), AttrMDArgs));
564535
}
565536

566-
llvm::SyncScope::ID
567-
SPIRVTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &, SyncScope Scope,
568-
llvm::AtomicOrdering,
569-
llvm::LLVMContext &Ctx) const {
570-
return Ctx.getOrInsertSyncScopeID(mapClangSyncScopeToLLVM(Scope));
537+
StringRef SPIRVTargetCodeGenInfo::getLLVMSyncScopeStr(
538+
const LangOptions &, SyncScope Scope, llvm::AtomicOrdering) const {
539+
switch (Scope) {
540+
case SyncScope::HIPSingleThread:
541+
case SyncScope::SingleScope:
542+
return "singlethread";
543+
case SyncScope::HIPWavefront:
544+
case SyncScope::OpenCLSubGroup:
545+
case SyncScope::WavefrontScope:
546+
return "subgroup";
547+
case SyncScope::HIPCluster:
548+
case SyncScope::ClusterScope:
549+
case SyncScope::HIPWorkgroup:
550+
case SyncScope::OpenCLWorkGroup:
551+
case SyncScope::WorkgroupScope:
552+
return "workgroup";
553+
case SyncScope::HIPAgent:
554+
case SyncScope::OpenCLDevice:
555+
case SyncScope::DeviceScope:
556+
return "device";
557+
case SyncScope::SystemScope:
558+
case SyncScope::HIPSystem:
559+
case SyncScope::OpenCLAllSVMDevices:
560+
return "";
561+
}
562+
return "";
571563
}
572564

573565
/// Construct a SPIR-V target extension type for the given OpenCL image type.

0 commit comments

Comments
 (0)