-
Notifications
You must be signed in to change notification settings - Fork 13.4k
[AutoUpgrade][AMDGPU] Adjust AS7 and AS9 address width to 48 bits #137418
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: users/arichardson/spr/main.autoupgradeamdgpu-adjust-as7-address-width-to-48-bits
Are you sure you want to change the base?
Conversation
Created using spr 1.3.6-beta.1
You can test this locally with the following command:git-clang-format --diff HEAD~1 HEAD --extensions cl,cpp,c -- clang/lib/Basic/Targets/AMDGPU.cpp clang/test/CodeGen/target-data.c clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl llvm/lib/IR/AutoUpgrade.cpp llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp View the diff from clang-format here.diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp
index e4fca1771..6229c8b1d 100644
--- a/clang/lib/Basic/Targets/AMDGPU.cpp
+++ b/clang/lib/Basic/Targets/AMDGPU.cpp
@@ -33,7 +33,8 @@ static const char *const DataLayoutStringR600 =
static const char *const DataLayoutStringAMDGCN =
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
- "-p7:160:256:256:32:48-p8:128:128:128:8:48-p9:192:256:256:32:48-i64:64-v16:16"
+ "-p7:160:256:256:32:48-p8:128:128:128:8:48-p9:192:256:256:32:48-i64:64-v16:"
+ "16"
"-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
"-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9";
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 8d8c0b8b5..be571117c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -686,7 +686,8 @@ static StringRef computeDataLayout(const Triple &TT) {
// space 8) which cannot be non-trivilally accessed by LLVM memory operations
// like getelementptr.
return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
- "-p7:160:256:256:32:48-p8:128:128:128:8:48-p9:192:256:256:32:48-i64:64-v16:16-"
+ "-p7:160:256:256:32:48-p8:128:128:128:8:48-p9:192:256:256:32:48-i64:"
+ "64-v16:16-"
"v24:"
"32-"
"v32:32-v48:64-v96:"
diff --git a/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp b/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp
index 76748bb22..c935454c0 100644
--- a/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp
+++ b/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp
@@ -42,10 +42,12 @@ TEST(DataLayoutUpgradeTest, ValidDataLayoutUpgrade) {
EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32", "r600"), "e-p:32:32-G1");
// and that ANDGCN adds p7 and p8 as well.
EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64", "amdgcn"),
- "e-p:64:64-G1-ni:7:8:9-p7:160:256:256:32:48-p8:128:128:128:8:48-p9:192:256:"
+ "e-p:64:64-G1-ni:7:8:9-p7:160:256:256:32:48-p8:128:128:128:8:48-p9:"
+ "192:256:"
"256:32");
EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-G1", "amdgcn"),
- "e-p:64:64-G1-ni:7:8:9-p7:160:256:256:32:48-p8:128:128:128:8:48-p9:192:256:"
+ "e-p:64:64-G1-ni:7:8:9-p7:160:256:256:32:48-p8:128:128:128:8:48-p9:"
+ "192:256:"
"256:32");
// but that r600 does not.
EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32-G1", "r600"), "e-p:32:32-G1");
@@ -145,19 +147,25 @@ TEST(DataLayoutUpgradeTest, NoDataLayoutUpgrade) {
EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32-G2", "r600"), "e-p:32:32-G2");
EXPECT_EQ(UpgradeDataLayoutString("G2", "r600"), "G2");
EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-G2", "amdgcn"),
- "e-p:64:64-G2-ni:7:8:9-p7:160:256:256:32:48-p8:128:128:128:8:48-p9:192:256:256:32:48");
+ "e-p:64:64-G2-ni:7:8:9-p7:160:256:256:32:48-p8:128:128:128:8:48-p9:"
+ "192:256:256:32:48");
EXPECT_EQ(UpgradeDataLayoutString("G2-e-p:64:64", "amdgcn"),
- "G2-e-p:64:64-ni:7:8:9-p7:160:256:256:32:48-p8:128:128:128:8:48-p9:192:256:256:32:48");
+ "G2-e-p:64:64-ni:7:8:9-p7:160:256:256:32:48-p8:128:128:128:8:48-p9:"
+ "192:256:256:32:48");
EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-G0", "amdgcn"),
- "e-p:64:64-G0-ni:7:8:9-p7:160:256:256:32:48-p8:128:128:128:8:48-p9:192:256:256:32:48");
+ "e-p:64:64-G0-ni:7:8:9-p7:160:256:256:32:48-p8:128:128:128:8:48-p9:"
+ "192:256:256:32:48");
// Check that AMDGCN targets don't add already declared address space 7.
EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-p7:64:64", "amdgcn"),
- "e-p:64:64-p7:64:64-G1-ni:7:8:9-p8:128:128:128:8:48-p9:192:256:256:32:48");
+ "e-p:64:64-p7:64:64-G1-ni:7:8:9-p8:128:128:128:8:48-p9:192:256:256:"
+ "32:48");
EXPECT_EQ(UpgradeDataLayoutString("p7:64:64-G2-e-p:64:64", "amdgcn"),
- "p7:64:64-G2-e-p:64:64-ni:7:8:9-p8:128:128:128:8:48-p9:192:256:256:32:48");
+ "p7:64:64-G2-e-p:64:64-ni:7:8:9-p8:128:128:128:8:48-p9:192:256:256:"
+ "32:48");
EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-p7:64:64-G1", "amdgcn"),
- "e-p:64:64-p7:64:64-G1-ni:7:8:9-p8:128:128:128:8:48-p9:192:256:256:32:48");
+ "e-p:64:64-p7:64:64-G1-ni:7:8:9-p8:128:128:128:8:48-p9:192:256:256:"
+ "32:48");
// Check that SPIR & SPIRV targets don't add -G1 if there is already a -G
// flag.
@@ -189,7 +197,8 @@ TEST(DataLayoutUpgradeTest, EmptyDataLayout) {
// Check that AMDGPU targets add G1 if it's not present.
EXPECT_EQ(UpgradeDataLayoutString("", "r600"), "G1");
EXPECT_EQ(UpgradeDataLayoutString("", "amdgcn"),
- "G1-ni:7:8:9-p7:160:256:256:32:48-p8:128:128:128:8:48-p9:192:256:256:32:48");
+ "G1-ni:7:8:9-p7:160:256:256:32:48-p8:128:128:128:8:48-p9:192:256:"
+ "256:32:48");
// Check that SPIR & SPIRV targets add G1 if it's not present.
EXPECT_EQ(UpgradeDataLayoutString("", "spir"), "G1");
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index 4f19a6ca2..87be1f950 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -94,7 +94,8 @@ static Value getLaneId(ConversionPatternRewriter &rewriter, Location loc,
}
static constexpr StringLiteral amdgcnDataLayout =
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
- "-p7:160:256:256:32:48-p8:128:128:128:8:48-p9:192:256:256:32:48-i64:64-v16:16"
+ "-p7:160:256:256:32:48-p8:128:128:128:8:48-p9:192:256:256:32:48-i64:64-v16:"
+ "16"
"-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
"-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9";
|
You'll want to get p9 as well, and maybe p8 - though p8 is a very weird kind of "pointer" that's just the resource and can't - for example - be GEP'd (Heck, part of me wants to make the index width of the buffer resource 0 - you shall not index me - but that probably breaks useful invariants somewhere) |
Just to confirm I understand this correctly, p9 also has address==48 and index==32? And p8 is the resource without an offset so it should ideally be 0. I can try setting p8 index width to zero but I imagine that will cause some interesting issues, maybe I can try using 1 instead? |
@krzysz00 I tried setting the p8 index size to 0 and this almost works but there are a handful of tests that fail. The first one is:
This fails with How would you expect ptrmask to work here? I would imagine it should use the address width and not the index width since you have valid codegen for p8. I don't know enough about amdgpu assembly so I can't tell what is actually being done here. Another test that fails is llvm/test/CodeGen/AMDGPU/GlobalISel/unsupported-ptr-add.ll which now crashes with an assertion failure inside llvm/test/CodeGen/AMDGPU/GlobalISel/unsupported-load.ll also asserts but should be easy to handle. |
So p9 is quite weird. It's used, IIRC, by, https://github.com/GPUOpen-Drivers/llpc internally to represent "structured" pointers, and so its interpretation is As to ptrmask on p8 ... if ptrmask is only supposed to affect the Index bits, then I probably shouldn't be emitting it during the p7 => p8 process and only applying a mask to the offset. But that hits the same weirdness that masking for alignment doesn't work as expected if the base pointer isn't quite aligned enough. (I can't think of many instances where that base pointer will, for example, be odd, but I'm not aware of anything saying it's not allowed. Though I'd personally be OK with documenting around that) I think most codegen - including down in LLPC - assumes the base is at least Actually, more general question on this address with stuff: x86 thread local storage. That's a form of pointer with an implicit base that can mess up your alignment checks - how's that work with ptrtoaddr? |
Thanks for all the background - how do you feel about pretending that p8 has a 48-bit index width until we figure out how to do "no indexing allowed" correctly?
Agreed, segmentation is awkward and I think we just have to assume that the segments are sufficiently aligned. I think this is somewhat outside of the LLVM abstract machine similar to page based memory. If we perform alignment checks on virtual addresses beyond 12/16/etc bits, there is no guarantee the final address will be sufficiently aligned but I view that as being outside of the LLVM semantics. I think we just have to pretend that virtual addresses or segment-relative addresses are the only thing that matters. |
This is why I've been advocating for the index-based definition of ptrtoaddr. So there's no need to do this 48-bit thing on the assumption that the base of a buffer fat pointer is "aligned enough" (in most contexts where I've seen this the start point is some sort of allocation that'll definitely be align(16) or better) |
I'm not sure how common something like this is on AMDGPU, but for CHERI we have definitely seen cases where people want to align to the next page boundary and then using an offset relative to the start of the allocation/object does not work (those are usually only 8 byte aligned). I view |
So, The common usage pattern I'd expect for ptr addrspace(7) is to make ( Also @piotrAMD @jayfoad and anyone else they want to call in for the LLPC perspective, since they're some of the folks who first created addrspace 7) I completely agree with you that having ptrtoaddr be relative to the relevant address space start (which may not be NULL - amdgpu p3 has null == 0xffffffff) is the way to go here: I'm not advocating for "relative to the start of the object" here. One thing I'm trying to understand is what one would actually do with the 48-bit underlying address that your Side note, one reason I want the "value of the index bits" version of ptrtoaddr is char memory[16] = {'a', 'b', 'c', 'd', ... 'o', 'p'};
char*(7) A = __amdgcn_make_buffer_rsrc(memory, /*stride=*/0, /*length=*/8, [flags]);
char*(7) B = __amdgcn_make_buffer_rsrc(memory + 8, 0, 8, [flags]);
printf("A[8]: %c, B[0]: %c\n", A[8], B[0]); // Prints "A[8]: ^@, B[0]: i" - that is, that oob read gives 0
assert(A[8] != B[0]); // should be true, to my knowledge of C semantics - one-past-A isn't B This out-of-bounds behavior is one of the reasons I think that just using the bits affected by GEP as the "ptrtoaddr" result is more useful. It does mean that |
Perfect, this sounds good to me! One thing I still need to figure out is how to deal with KnownBits: right now it assumes null is all zeroes and the result is all pointer bits, but really we can only infer the value of the address based on known alignment.
If your assert is supposed to be
If you mean to dereference in the assert then anything goes since accessing out-of-bounds of buffer is UB. Another thing to note is that the C standard is quite vague on subobjects, but the way I would interpret it is that your char memory[16] = {'a', 'b', 'c', 'd', ... 'o', 'p'};
char* A = __builtin_cheri_bounds_set(memory, /*length=*/8);
char* B = __builtin_cheri_bounds_set(memory + 8, /*length=*/8);
printf("A[8]: %c, B[0]: %c\n", A[8], B[0]); // traps, the OOB read raises an exception
assert((void*)&A[8] != (void*)&B[0]); // C standard suggests this should return equal and fail the assert One reason I like the linear offset from start of the address space interpretation is that it means we can use it for the C semantics of comparing just the address |
So the main difference between the CHERI example where you're setting bounds and the buffer fat pointers is that said buffer pointers use (what I know as) SPIR-V semantics, where accesses past the bounds of the buffer have defined behavior (usually returning 0, though if you fiddle with the flags you can get 1 or sometimes an error). So, even if A and B are allocated next to each other (as they were for the example), A[8] will, by definition, return 0, while B[0], which has the same linear address, will return the value at that address. So to my eyes, the semantics that make the most sense for us are I figure some sort of meeting might be in order just in case we're violently agreeing with each other |
Created using spr 1.3.6-beta.1
Also, re the C quote (having now re-read your comment more thoroughly), I'm comfortable saying that, for the purposes of C's pointer operations, |
The buffer fat pointers contain a 48-bit address with a 32-bit offset,
so the future ptrtoaddr instruction should return a 48-bit value.
The AS9 pointers are similar to the AS7 pointers but with an additional
index field and also have a 48-bit underlying address.
Address space 8 is special in that it is just a resource reference
without a modifiable offset. For now 0-bit index width is not permitted
in LLVM and triggers assertions, so we set it to 1 byte until we can
resolve those constraints.
See https://discourse.llvm.org/t/clarifiying-the-semantics-of-ptrtoint/83987/38