From 4e9d25f308ecd8a7a45353645121d2db85317fd3 Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" Date: Wed, 20 Aug 2025 20:24:17 +0000 Subject: [PATCH 01/11] Move create_tdesc addr shape restriction to .td, match pointer type for scatter ops with other xegpu pointer usage. --- mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 5 +++-- mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 6 ++++-- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 4 ---- mlir/test/Dialect/XeGPU/invalid.mlir | 6 +++--- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index eb54d6887681d..ac11210c6d0c2 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -500,7 +500,8 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> { (scattered) subviews, allowing each work-item in a subgroup specifying their own offset. It accepts the following parameters: - * source: a 1D memref or pointer (uint64_t) represents the flattened memory object. + * source: a 1D memref or pointer (i64, i32, ui64, ui32) represents the flattened + memory object. * offsets: a vector containing offsets of each access point. Its size is fixed to the hardware supportted subgroup size, e.g., 16 on PVC, implying each element in the vector corresponds to a work-item (SIMT lane) @@ -536,7 +537,7 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> { ``` }]; - let arguments = (ins XeGPU_BaseAddrType: $source, + let arguments = (ins XeGPU_GatherScatterBaseAddrType: $source, XeGPU_OffsetType: $offsets); let results = (outs XeGPU_TensorDesc:$TensorDesc); diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td index f8b371db498e8..53ecedab5406d 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td @@ -16,13 +16,15 @@ include "mlir/IR/BuiltinTypes.td" def XeGPU_IntType: AnyTypeOf<[I1, I8, I16, I32, I64, SI1, SI8, SI16, SI32, SI64, UI1, UI8, UI16, UI32, UI64]>; def XeGPU_FloatType: AnyTypeOf<[F16, F32, F64, BF16, TF32]>; def XeGPU_ScalarType: AnyTypeOf<[XeGPU_IntType, XeGPU_FloatType]>; -def XeGPU_BaseAddrType: AnyTypeOf<[Non0RankedMemRefOf<[XeGPU_ScalarType]>, UI64, UI32, I64, I32]>; +def XeGPU_PointerType: AnyTypeOf<[UI64, UI32, I64, I32]>; +def XeGPU_BaseAddrType: AnyTypeOf<[Non0RankedMemRefOf<[XeGPU_ScalarType]>, XeGPU_PointerType]>; def XeGPU_DpasOprType: FixedVectorOfRankAndType<[1, 2, 3], [XeGPU_ScalarType]>; def XeGPU_DpasResType: FixedVectorOfRankAndType<[1, 2], [XeGPU_ScalarType]>; def XeGPU_OffsetType: FixedVectorOfNonZeroRankOf<[Index]>; def XeGPU_MaskType: FixedVectorOfNonZeroRankOf<[I1]>; def XeGPU_ValueType: FixedVectorOfNonZeroRankOf<[XeGPU_ScalarType]>; def XeGPU_VectorType: VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType]>; +def XeGPU_GatherScatterBaseAddrType: AnyTypeOf<[MemRefRankOf<[XeGPU_ScalarType], [1]>, XeGPU_PointerType]>; // common base class for types in XeGPU dialect class XeGPUTypeDef traits = [], @@ -189,7 +191,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", let genVerifyDecl = 1; } -def XeGPU_GatherScatterSourceType : AnyTypeOf<[XeGPU_TensorDesc,Non0RankedMemRefOf<[XeGPU_ScalarType]>, UI64]>; +def XeGPU_GatherScatterSourceType : AnyTypeOf<[XeGPU_TensorDesc,XeGPU_GatherScatterBaseAddrType]>; def XeGPU_Nbarrier: XeGPUTypeDef<"Nbarrier", "nbarrier", [], "mlir::Type"> { let summary 
= "!xegpu.nbarrier a custom XeGPU type representing a barrier."; diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 906c71d8b8dad..4e6be230e1e87 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -685,10 +685,6 @@ void CreateDescOp::build(OpBuilder &builder, OperationState &state, LogicalResult CreateDescOp::verify() { auto tdescTy = getTensorDescType(); - if (getRankOf(getSource()) > 1) - return emitOpError( - "Expecting the source is a 1D memref or pointer (uint64_t)."); - if (!tdescTy.isScattered()) return emitOpError("Expects a scattered TensorDesc.\n"); diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir index 93a5a055b08c6..5d86dbf81e48f 100644 --- a/mlir/test/Dialect/XeGPU/invalid.mlir +++ b/mlir/test/Dialect/XeGPU/invalid.mlir @@ -387,7 +387,7 @@ func.func @load_gather_vc_3(%src: ui64) { // ----- func.func @prefetch_offset_wi_1(%src: memref<4x4xf32>) { %offsets = arith.constant dense<[0]> : vector<1xindex> - // expected-error@+1 {{Expecting the source is a 1D memref or pointer}} + // expected-error@+1 {{op operand #0 must be TensorDesc describing regions of interested data}} xegpu.prefetch %src[%offsets]: memref<4x4xf32>, vector<1xindex> return } @@ -428,7 +428,7 @@ func.func @store_scatter_offset_wi_2(%src: memref<4x4xf16>) { %val = arith.constant dense<2.9>: vector<4xf16> %offsets = arith.constant dense<[0]> : vector<1xindex> %mask = arith.constant dense<1>: vector<1xi1> - // expected-error@+1 {{Expecting the dest is a 1D memref or pointer}} + // expected-error@+1 {{op operand #1 must be TensorDesc describing regions of interested data}} xegpu.store %val, %src[%offsets], %mask : vector<4xf16>, memref<4x4xf16>, vector<1xindex>, vector<1xi1> return @@ -447,7 +447,7 @@ func.func @load_gather_offset_wi_2(%src: ui64) { func.func @load_gather_offset_wi_1(%src: memref<4x4xf32>) { %mask = arith.constant dense<1>: vector<1xi1> %offsets = arith.constant dense<[0]> : vector<1xindex> - // expected-error@+1 {{Expecting the source is a 1D memref or pointer}} + // expected-error@+1 {{op operand #0 must be TensorDesc describing regions of interested data}} %2 = xegpu.load %src[%offsets], %mask <{chunk_size = 2}> : memref<4x4xf32>, vector<1xindex>, vector<1xi1> -> vector<2xf32> return } From 2e985ce836322927ae7c63a2db5f503636c8c645 Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" Date: Wed, 20 Aug 2025 21:22:58 +0000 Subject: [PATCH 02/11] Update xegpu op descriptions. --- .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 83 ++++++++++++++----- 1 file changed, 60 insertions(+), 23 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index ac11210c6d0c2..0f2a13e1ae16c 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -511,6 +511,8 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> { match the dimension of offsets. It may also has a second dimension corresponding to the chunk_size if the chunk size is larger than 1. + This op is not available in SIMT mode. + Example 1: It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64] ```mlir %a = memref.alloc() : memref<1024xf32> @@ -618,6 +620,15 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> { : memref<1024xf32>, vector<4xindex> ``` + Example 3 (SIMT mode): + SIMT mode only accepts the offsets variant. 
+ ```mlir + xegpu.prefetch %0[%1] {l1_hint = #xegpu.cache_hint, + l2_hint = #xegpu.cache_hint, + l3_hint = #xegpu.cache_hint} + : memref<256xf32>, vector<1xindex> + ``` + }]; let arguments = (ins XeGPU_GatherScatterSourceType: $source, @@ -671,8 +682,18 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { The mask operand masks out memory access so that it is safe to pass out-of-boundary addresses/offsets as long as they are masked. It applies to slots of SIMD lanes. - In SIMT mode, the result vector represents the data to be loaded by each work-item. - Each work-item recieves a `chunk_size` number of elements. + In SIMT mode, the result is a 1D vector that represents the data to be loaded by + each work-item. + + `source` represents the memory region to be loaded from, which can be either a + tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32). + In case of tensor_desc, offsets come from the producer create_tdesc op. + tensor_desc cannot be used in SIMT mode. + `offsets` represents offsets from source. required if `source` in not a TensorDescType. + offsets is a vector of `index` type and vector length is either the subgroup size + or 1 in SIMT mode. + `mask` is a vector of `i1` type, which is used to mask out the memory access. + mask is a vector of size equal to the subgroup size, or 1 in SIMT mode. Example 1: ```mlir @@ -692,16 +713,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { vector<16xi1> -> vector<16x8xf32> ``` - Example 3 (SIMT mode): - ```mlir - %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint, - l2_hint = #xegpu.cache_hint, - l3_hint = #xegpu.cache_hint}> - : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr> - vector<16xi1> -> vector<8xf32> - ``` - - Example 4: + Example 3: A variant accepts memref as base pointer and an offset instead of scattered TensorTdesc. It combines "create scattered TensorTdesc" and "load with scattered TensorTdesc". The source operand could be a raw pointer (uint64_t). Please refer to create_tdesc @@ -716,6 +728,16 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { : memref<1024xf32>, vector<16xi1>, vector<16xindex> -> vector<16xf32> ``` + Example 4 (SIMT mode): + SIMT mode only accepts the offsets variant. chunk_size can be inferred from result + type. In this example, chunk_size is 8. + ```mlir + %2 = xegpu.load %1[%2], %0 <{l1_hint = #xegpu.cache_hint, + l2_hint = #xegpu.cache_hint, + l3_hint = #xegpu.cache_hint}> + : memref<128xf32>, vector<1xindex>, vector<1xi1> -> vector<8xf32> + ``` + }]; let arguments = (ins XeGPU_GatherScatterSourceType: $source, @@ -785,8 +807,19 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { has transpose effect, which is similar to `load_gather`. Therefore, a transpose attribute is introduced on purpose, making sure users are aware of this implicit transformation. - In SIMT mode, the input vector represents the data to be stored by each work-item. - Each work-item stores a `chunk_size` number of elements. + In SIMT mode, the result is a 1D vector that represents the data to be stored by + each work-item. + + `value` represents the data to be stored. + `dest` represents the memory region to be stored to, which can be either a + tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32). + In case of tensor_desc, offsets come from the producer create_tdesc op. + tensor_desc cannot be used in SIMT mode. + `offsets` represents offsets from dest. required if `source` in not a TensorDescType. 
+ offsets is a vector of `index` type and vector length is either the subgroup size + or 1 in SIMT mode. + `mask` is a vector of `i1` type, which is used to mask out the memory access. + mask is a vector of size equal to the subgroup size, or 1 in SIMT mode. Example 1: ```mlir @@ -804,15 +837,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { : vector<16x8xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr>, vector<16xi1> ``` - Example 3 (SIMT mode): - ```mlir - xegpu.store %0, %1, %2 <{l1_hint = #xegpu.cache_hint, - l2_hint = #xegpu.cache_hint, - l3_hint = #xegpu.cache_hint}> - : vector<8xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr> vector<16xi1> - ``` - - Example 4: + Example 3: A variant accepts memref as base pointer and an offset instead of scattered TensorTdesc. It combines "create scattered TensorTdesc" and "store with scattered TensorTdesc". The dest operand could be a raw pointer (uint64_t). @@ -828,6 +853,16 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { : memref<1024xf32>, vector<16xi1>, vector<16xindex> -> vector<16xf32> ``` + Example 4 (SIMT mode): + SIMT mode only accepts the offsets variant. chunk_size can be inferred from value + type. In this example, chunk_size is 8. + ```mlir + xegpu.store %0, %1[%2], %3 <{l1_hint = #xegpu.cache_hint, + l2_hint = #xegpu.cache_hint, + l3_hint = #xegpu.cache_hint}> + : vector<8xf32>, memref<256xf32>, vector<1xindex>, vector<1xi1> + ``` + }]; let arguments = (ins @@ -896,6 +931,8 @@ def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset", update the offset per work-item, so its offsets contains values representing shifts for each work-item. + This op is not available in SIMT mode. + Example: ```mlir %off = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex> From 46857e36af6725c8eaff25e584f4e71bc7361949 Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" Date: Wed, 20 Aug 2025 21:33:49 +0000 Subject: [PATCH 03/11] Update op validation. 
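The scatter-form prefetch/load/store verifiers now key off whether an
`offsets` operand is present: the plain memref/pointer form must supply
offsets, while the tensor_desc form must not (its offsets were already given
to create_tdesc). The old getRankOf()-based "1D memref or pointer" check is
dropped; that restriction is now enforced by the operand type constraints
introduced earlier in this series.

A rough sketch of the two accepted load forms (illustrative only; SSA names
and shapes are placeholders, cache hints omitted):

```mlir
// Memref/pointer source: offsets are required.
%v = xegpu.load %src[%offsets], %mask
     : memref<256xf32>, vector<16xindex>, vector<16xi1> -> vector<16xf32>
// Scattered tensor_desc source: offsets are not allowed here.
%w = xegpu.load %tdesc, %mask
     : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
```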
--- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 37 +++++++++++++------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 4e6be230e1e87..cf5da7a416846 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -58,13 +58,6 @@ static SmallVector getShapeOf(Type type) { return shape; } -static int64_t getRankOf(Value val) { - auto type = val.getType(); - if (auto ty = llvm::dyn_cast(type)) - return ty.getRank(); - return 0; -} - static bool isReadHintOrNone(const CachePolicyAttr &attr) { if (!attr) return true; @@ -719,13 +712,15 @@ LogicalResult CreateDescOp::verify() { LogicalResult PrefetchOp::verify() { auto tdescTy = getTensorDescType(); + if (!tdescTy && !getOffsets()) + return emitOpError("Expects offsets."); + + if (tdescTy && getOffsets()) + return emitOpError("offsets not allowed."); + if (tdescTy && !tdescTy.isScattered()) return emitOpError("Expects a scattered TensorDesc."); - if (!tdescTy && getRankOf(getSource()) > 1) - return emitOpError( - "Expecting the source is a 1D memref or pointer (uint64_t)."); - if (!isReadHintOrNone(getL1HintAttr())) return emitOpError("invalid l1_hint: ") << getL1HintAttr(); @@ -753,13 +748,15 @@ LogicalResult LoadGatherOp::verify() { auto maskTy = getMaskType(); auto valueTy = getValueType(); + if (!tdescTy && !getOffsets()) + return emitOpError("Expects offsets."); + + if (tdescTy && getOffsets()) + return emitOpError("offsets not allowed."); + if (tdescTy && !tdescTy.isScattered()) return emitOpError("Expects a scattered TensorDesc."); - if (!tdescTy && getRankOf(getSource()) > 1) - return emitOpError( - "Expecting the source is a 1D memref or pointer (uint64_t)."); - if (!isReadHintOrNone(getL1HintAttr())) return emitOpError("invalid l1_hint: ") << getL1HintAttr(); @@ -800,13 +797,15 @@ LogicalResult StoreScatterOp::verify() { auto maskTy = getMaskType(); auto valueTy = getValueType(); + if (!tdescTy && !getOffsets()) + return emitOpError("Expects offsets."); + + if (tdescTy && getOffsets()) + return emitOpError("offsets not allowed."); + if (tdescTy && !tdescTy.isScattered()) return emitOpError("Expects a scattered TensorDesc."); - if (!tdescTy && getRankOf(getDest()) > 1) - return emitOpError( - "Expecting the dest is a 1D memref or pointer (uint64_t)."); - if (!isWriteHintOrNone(getL1HintAttr())) return emitOpError("invalid l1_hint: ") << getL1HintAttr(); From 1077b3abebc3fdc0cd97a6ddee3be845b449d71c Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" Date: Wed, 20 Aug 2025 22:57:58 +0000 Subject: [PATCH 04/11] Update op descriptions --- mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index 0f2a13e1ae16c..8fd04a5d4cdcf 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -683,7 +683,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { addresses/offsets as long as they are masked. It applies to slots of SIMD lanes. In SIMT mode, the result is a 1D vector that represents the data to be loaded by - each work-item. + each work-item. If size is not 1, size should be equal to the chunk size, `source` represents the memory region to be loaded from, which can be either a tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32). 
@@ -694,6 +694,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { or 1 in SIMT mode. `mask` is a vector of `i1` type, which is used to mask out the memory access. mask is a vector of size equal to the subgroup size, or 1 in SIMT mode. + `chunk_size` (optional) represents contiguous number of elements to load from per work item. Example 1: ```mlir @@ -808,7 +809,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { introduced on purpose, making sure users are aware of this implicit transformation. In SIMT mode, the result is a 1D vector that represents the data to be stored by - each work-item. + each work-item. If size is not 1, size should be equal to the chunk size. `value` represents the data to be stored. `dest` represents the memory region to be stored to, which can be either a @@ -820,6 +821,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { or 1 in SIMT mode. `mask` is a vector of `i1` type, which is used to mask out the memory access. mask is a vector of size equal to the subgroup size, or 1 in SIMT mode. + `chunk_size` (optional) represents contiguous number of elements to store to per work item. Example 1: ```mlir From af57f45e3536a695bd1ef19fdad213b211332e36 Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" Date: Wed, 20 Aug 2025 23:30:31 +0000 Subject: [PATCH 05/11] Add invalid op checks for new op restriction. --- mlir/test/Dialect/XeGPU/invalid.mlir | 55 ++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir index 5d86dbf81e48f..c076ac78b9edd 100644 --- a/mlir/test/Dialect/XeGPU/invalid.mlir +++ b/mlir/test/Dialect/XeGPU/invalid.mlir @@ -392,6 +392,23 @@ func.func @prefetch_offset_wi_1(%src: memref<4x4xf32>) { return } +// ----- +func.func @prefetch_offset_wi_2(%src: memref<16xf32>) { + %offsets = arith.constant dense<[0]> : vector<1xindex> + %1 = xegpu.create_tdesc %src, %offsets : memref<16xf32>, vector<1xindex> + -> !xegpu.tensor_desc<1x3xf32, #xegpu.scatter_tdesc_attr> + // expected-error@+1 {{offsets not allowed}} + xegpu.prefetch %1[%offsets]: !xegpu.tensor_desc<1x3xf32, #xegpu.scatter_tdesc_attr>, vector<1xindex> + return +} + +// ----- +func.func @prefetch_offset_wi_3(%src: memref<16xf32>) { + // expected-error@+1 {{Expects offsets}} + xegpu.prefetch %src: memref<16xf32> + return +} + // ----- func.func @load_gather_offset_sg(%src: memref) { %offsets = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> @@ -434,6 +451,44 @@ func.func @store_scatter_offset_wi_2(%src: memref<4x4xf16>) { return } +// ----- +func.func @store_scatter_offset_wi_3(%src: memref<16xf16>) { + %val = arith.constant dense<2.9>: vector<1xf16> + %mask = arith.constant dense<1>: vector<1xi1> + // expected-error@+1 {{Expects offsets}} + xegpu.store %val, %src, %mask + : vector<1xf16>, memref<16xf16>, vector<1xi1> + return +} + +// ----- +func.func @store_scatter_offset_wi_4(%src: !xegpu.tensor_desc<1x1xf32, #xegpu.scatter_tdesc_attr<>>) { + %val = arith.constant dense<2.9>: vector<1xf16> + %offsets = arith.constant dense<[0]> : vector<1xindex> + %mask = arith.constant dense<1>: vector<1xi1> + // expected-error@+1 {{offsets not allowed}} + xegpu.store %val, %src[%offsets], %mask + : vector<1xf16>, !xegpu.tensor_desc<1x1xf32, #xegpu.scatter_tdesc_attr<>>, vector<1xindex>, vector<1xi1> + return +} + +// ----- +func.func @load_gather_offset_wi_4(%src: !xegpu.tensor_desc<1x2xf16, #xegpu.scatter_tdesc_attr<>>) { + %mask = 
arith.constant dense<1>: vector<1xi1> + %offsets = arith.constant dense<[0]> : vector<1xindex> + // expected-error@+1 {{offsets not allowed}} + %2 = xegpu.load %src[%offsets], %mask <{chunk_size = 2}> : !xegpu.tensor_desc<1x2xf16, #xegpu.scatter_tdesc_attr<>>, vector<1xindex>, vector<1xi1> -> vector<2xf16> + return +} + +// ----- +func.func @load_gather_offset_wi_3(%src: ui64) { + %mask = arith.constant dense<1>: vector<1xi1> + // expected-error@+1 {{Expects offsets}} + %2 = xegpu.load %src, %mask <{chunk_size = 2}> : ui64, vector<1xi1> -> vector<2xf16> + return +} + // ----- func.func @load_gather_offset_wi_2(%src: ui64) { %mask = arith.constant dense<1>: vector<1xi1> From daf18393e466df4d2a2d42d3ac6955d0a9cd812c Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" Date: Thu, 21 Aug 2025 16:39:22 +0000 Subject: [PATCH 06/11] Allow scalar offset for SIMT mode gather / scatter / prefetch ops. --- mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index 8fd04a5d4cdcf..1d4f89b108e52 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -609,7 +609,7 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> { Example 2: A variant accepts memref as base pointer and an offset instead of scattered TensorTdesc. It combines "create scattered TensorTdesc" and "prefetch with scattered TensorTdesc". - The source operand could be a raw pointer (uint64_t). + The source operand could be a raw pointer (ui64, ui32, i64, i32). Please refer to create_tdesc for the restriction of memref. ```mlir %a = memref.alloc() : memref<1024xf32> @@ -632,7 +632,7 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> { }]; let arguments = (ins XeGPU_GatherScatterSourceType: $source, - Optional: $offsets, + Optional>: $offsets, OptionalAttr: $l1_hint, OptionalAttr: $l2_hint, OptionalAttr: $l3_hint); @@ -742,7 +742,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { }]; let arguments = (ins XeGPU_GatherScatterSourceType: $source, - Optional: $offsets, + Optional>: $offsets, XeGPU_MaskType: $mask, OptionalAttr: $chunk_size, OptionalAttr: $l1_hint, @@ -870,7 +870,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { let arguments = (ins XeGPU_ValueType: $value, XeGPU_GatherScatterSourceType: $dest, - Optional: $offsets, + Optional>: $offsets, XeGPU_MaskType: $mask, OptionalAttr: $chunk_size, OptionalAttr: $l1_hint, From 98f2caaf767efa37ae5b67945836f01037c065f7 Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" Date: Thu, 21 Aug 2025 17:09:07 +0000 Subject: [PATCH 07/11] Update op description. --- .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 108 +++++++++++------- .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 11 +- 2 files changed, 73 insertions(+), 46 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index 1d4f89b108e52..bf27bbc85a1f9 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -70,28 +70,32 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface future). Elements in the subview continuous in each dimension. It encodes the following important information for supporting Intel hardware features: - * source: an object representing (starting address/pointer of) a memory region. 
+ Arguments: + - `source`: an object representing (starting address/pointer of) a memory region. It can be either a memref object, or simply a pointer represented by uint64_t type. For the case of dynamic memrefs or pointer, the shape and layout information of the memory region should be explicitly passed via `shape` and `strides` parameters. - * offsets: index values represents offsets from the "source" at the each dimension + - `offsets`: index values represents offsets from the "source" at the each dimension at which the subview of the target memory will be created. It is encoded via "offsets" and "const_offsets", such that it can accept various forms, such as, operands (e.g., [%c0, %c]) and attributes (e.g., [2, 4]). - * shape: the shape information of the memory region pointed by the "source". It is + - `shape`: the shape information of the memory region pointed by the "source". It is typically encoded via the MemRefType of the source, e.g., memref<4096x4096xf16>. But if "source" is simply a pointer represented as uint64_t type, or a memref type without shape information e.g., memref, the shape information has to be explicitly passed via the "shape" and "const_shape" arguments. - * strides: the strides of the memory region pointed by the "source". Similar to shape, + - `strides`: the strides of the memory region pointed by the "source". Similar to shape, it is typically encoded via the MemRefType of the source too. But if "source" is simply a pointer represented as uint64_t type, or a memref type without shape information e.g., memref, the strides information has to be explicitly passed via the "strides" and "const_strides" argument. + Results: + - `res`: nd tensor descriptor + Example 1 (suppose the tensor shape inferred by the compiler is 8x16): ```mlir %0 = memref.alloc() : memref<1024x1024xf32> @@ -500,13 +504,17 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> { (scattered) subviews, allowing each work-item in a subgroup specifying their own offset. It accepts the following parameters: - * source: a 1D memref or pointer (i64, i32, ui64, ui32) represents the flattened + Arguments: + - `source`: a 1D memref or pointer (i64, i32, ui64, ui32) represents the flattened memory object. - * offsets: a vector containing offsets of each access point. Its size + - `offsets`: a vector containing offsets of each access point. Its size is fixed to the hardware supportted subgroup size, e.g., 16 on PVC, implying each element in the vector corresponds to a work-item (SIMT lane) in the subgroup. + Results: + - `res`: scattered tensor descriptor + The first dimension of the result TensorDesc corresponds to work-items, so it should match the dimension of offsets. It may also has a second dimension corresponding to the chunk_size if the chunk size is larger than 1. @@ -539,8 +547,8 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> { ``` }]; - let arguments = (ins XeGPU_GatherScatterBaseAddrType: $source, - XeGPU_OffsetType: $offsets); + let arguments = (ins XeGPU_GatherScatterBaseAddrType:$source, + XeGPU_OffsetType:$offsets); let results = (outs XeGPU_TensorDesc:$TensorDesc); let builders = [ @@ -598,6 +606,16 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> { As compared to prefetch_nd, which works on non-scattered TensorDesc, it works on scattered TensorDesc instead. + Arguments: + - `source`: represents the memory region to be loaded from, which can be either a + tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32). 
+ In case of tensor_desc, offsets come from the producer create_tdesc op. + tensor_desc cannot be used in SIMT mode. + - `offsets`: represents offsets from source. required if `source` in not a TensorDescType. + offsets is a vector of `index` type and vector length is either the subgroup size + or 1 in SIMT mode. scalar offset is also valid for SIMT mode. + - `l1_hint`, `l2_hint`, `l3_hint`: are optional cache hints for each level of cache. + Example 1: ```mlir xegpu.prefetch %tdesc {l1_hint = #xegpu.cache_hint, @@ -631,11 +649,11 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> { }]; - let arguments = (ins XeGPU_GatherScatterSourceType: $source, - Optional>: $offsets, - OptionalAttr: $l1_hint, - OptionalAttr: $l2_hint, - OptionalAttr: $l3_hint); + let arguments = (ins XeGPU_GatherScatterSourceType:$source, + Optional>:$offsets, + OptionalAttr:$l1_hint, + OptionalAttr:$l2_hint, + OptionalAttr:$l3_hint); let extraClassDeclaration = extraBaseClassDeclaration # [{ Type getSourceType() { @@ -685,16 +703,22 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { In SIMT mode, the result is a 1D vector that represents the data to be loaded by each work-item. If size is not 1, size should be equal to the chunk size, - `source` represents the memory region to be loaded from, which can be either a + Arguments: + - `source`: represents the memory region to be loaded from, which can be either a tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32). In case of tensor_desc, offsets come from the producer create_tdesc op. tensor_desc cannot be used in SIMT mode. - `offsets` represents offsets from source. required if `source` in not a TensorDescType. + - `offsets`: represents offsets from source. required if `source` in not a TensorDescType. offsets is a vector of `index` type and vector length is either the subgroup size - or 1 in SIMT mode. - `mask` is a vector of `i1` type, which is used to mask out the memory access. + or 1 in SIMT mode. scalar offset is also valid for SIMT mode. + - `mask`: is a vector of `i1` type, which is used to mask out the memory access. mask is a vector of size equal to the subgroup size, or 1 in SIMT mode. - `chunk_size` (optional) represents contiguous number of elements to load from per work item. + - `chunk_size`: (optional) represents contiguous number of elements to load from per work item. + - `l1_hint`, `l2_hint`, `l3_hint`: are optional cache hints for each level of cache. + + Results: + - `res`: represents loaded data + Example 1: ```mlir @@ -717,7 +741,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { Example 3: A variant accepts memref as base pointer and an offset instead of scattered TensorTdesc. It combines "create scattered TensorTdesc" and "load with scattered TensorTdesc". - The source operand could be a raw pointer (uint64_t). Please refer to create_tdesc + The source operand could be a raw pointer (ui64, ui32, i64, i32). Please refer to create_tdesc for the restriction of memref. 
```mlir %a = memref.alloc() : memref<1024xf32> @@ -741,13 +765,12 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { }]; - let arguments = (ins XeGPU_GatherScatterSourceType: $source, - Optional>: $offsets, - XeGPU_MaskType: $mask, - OptionalAttr: $chunk_size, - OptionalAttr: $l1_hint, - OptionalAttr: $l2_hint, - OptionalAttr: $l3_hint); + let arguments = (ins XeGPU_GatherScatterSourceType:$source, + Optional>:$offsets, + XeGPU_MaskType:$mask, OptionalAttr:$chunk_size, + OptionalAttr:$l1_hint, + OptionalAttr:$l2_hint, + OptionalAttr:$l3_hint); let results = (outs XeGPU_ValueType: $value); let extraClassDeclaration = extraBaseClassDeclaration # [{ @@ -801,7 +824,8 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { let summary = "store data to scattered memory locations."; - let description = [{ It (aka. store) stores data to scattered memory locations. The value is + let description = + [{ It (aka. store) stores data to scattered memory locations. The value is typically a 1D vector. But when the chunk size of the TensorDesc is larger than 1, it will be a 2D vector instead. For the later case, dim-1 of the value correspods to the simd lanes and the dim-0 of the value corresponds to the chunk size stored per lane. So `store_scatter` @@ -811,17 +835,19 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { In SIMT mode, the result is a 1D vector that represents the data to be stored by each work-item. If size is not 1, size should be equal to the chunk size. - `value` represents the data to be stored. - `dest` represents the memory region to be stored to, which can be either a + Arguments: + - `value`: represents the data to be stored. + - `dest`: represents the memory region to be stored to, which can be either a tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32). In case of tensor_desc, offsets come from the producer create_tdesc op. tensor_desc cannot be used in SIMT mode. - `offsets` represents offsets from dest. required if `source` in not a TensorDescType. + - `offsets`: represents offsets from dest. required if `source` in not a TensorDescType. offsets is a vector of `index` type and vector length is either the subgroup size - or 1 in SIMT mode. - `mask` is a vector of `i1` type, which is used to mask out the memory access. + or 1 in SIMT mode. scalar offset is also valid for SIMT mode. + - `mask`: is a vector of `i1` type, which is used to mask out the memory access. mask is a vector of size equal to the subgroup size, or 1 in SIMT mode. - `chunk_size` (optional) represents contiguous number of elements to store to per work item. + - `chunk_size`: (optional) represents contiguous number of elements to store to per work item. + - `l1_hint`, `l2_hint`, `l3_hint`: are optional cache hints for each level of cache. 
Example 1: ```mlir @@ -867,15 +893,13 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { }]; - let arguments = (ins - XeGPU_ValueType: $value, - XeGPU_GatherScatterSourceType: $dest, - Optional>: $offsets, - XeGPU_MaskType: $mask, - OptionalAttr: $chunk_size, - OptionalAttr: $l1_hint, - OptionalAttr: $l2_hint, - OptionalAttr: $l3_hint); + let arguments = (ins XeGPU_ValueType:$value, + XeGPU_GatherScatterSourceType:$dest, + Optional>:$offsets, + XeGPU_MaskType:$mask, OptionalAttr:$chunk_size, + OptionalAttr:$l1_hint, + OptionalAttr:$l2_hint, + OptionalAttr:$l3_hint); let extraClassDeclaration = extraBaseClassDeclaration # [{ Type getDestType() { diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td index 53ecedab5406d..84902b2039643 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td @@ -16,15 +16,17 @@ include "mlir/IR/BuiltinTypes.td" def XeGPU_IntType: AnyTypeOf<[I1, I8, I16, I32, I64, SI1, SI8, SI16, SI32, SI64, UI1, UI8, UI16, UI32, UI64]>; def XeGPU_FloatType: AnyTypeOf<[F16, F32, F64, BF16, TF32]>; def XeGPU_ScalarType: AnyTypeOf<[XeGPU_IntType, XeGPU_FloatType]>; -def XeGPU_PointerType: AnyTypeOf<[UI64, UI32, I64, I32]>; -def XeGPU_BaseAddrType: AnyTypeOf<[Non0RankedMemRefOf<[XeGPU_ScalarType]>, XeGPU_PointerType]>; +def XeGPU_PointerType : AnyTypeOf<[UI64, UI32, I64, I32]>; +def XeGPU_BaseAddrType + : AnyTypeOf<[Non0RankedMemRefOf<[XeGPU_ScalarType]>, XeGPU_PointerType]>; def XeGPU_DpasOprType: FixedVectorOfRankAndType<[1, 2, 3], [XeGPU_ScalarType]>; def XeGPU_DpasResType: FixedVectorOfRankAndType<[1, 2], [XeGPU_ScalarType]>; def XeGPU_OffsetType: FixedVectorOfNonZeroRankOf<[Index]>; def XeGPU_MaskType: FixedVectorOfNonZeroRankOf<[I1]>; def XeGPU_ValueType: FixedVectorOfNonZeroRankOf<[XeGPU_ScalarType]>; def XeGPU_VectorType: VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType]>; -def XeGPU_GatherScatterBaseAddrType: AnyTypeOf<[MemRefRankOf<[XeGPU_ScalarType], [1]>, XeGPU_PointerType]>; +def XeGPU_GatherScatterBaseAddrType + : AnyTypeOf<[MemRefRankOf<[XeGPU_ScalarType], [1]>, XeGPU_PointerType]>; // common base class for types in XeGPU dialect class XeGPUTypeDef traits = [], @@ -191,7 +193,8 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", let genVerifyDecl = 1; } -def XeGPU_GatherScatterSourceType : AnyTypeOf<[XeGPU_TensorDesc,XeGPU_GatherScatterBaseAddrType]>; +def XeGPU_GatherScatterSourceType + : AnyTypeOf<[XeGPU_TensorDesc, XeGPU_GatherScatterBaseAddrType]>; def XeGPU_Nbarrier: XeGPUTypeDef<"Nbarrier", "nbarrier", [], "mlir::Type"> { let summary = "!xegpu.nbarrier a custom XeGPU type representing a barrier."; From 68492e2091b200c121c874ef9ae7acde7b6cf4b1 Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" Date: Thu, 21 Aug 2025 17:52:04 +0000 Subject: [PATCH 08/11] Allow create_tdesc and update_offset in SIMT mode. Allow scalar mask for load/store/prefetch op in SIMT mode. --- mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index bf27bbc85a1f9..b8d706bd9e6cb 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -519,8 +519,6 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> { match the dimension of offsets. 
It may also has a second dimension corresponding to the chunk_size if the chunk size is larger than 1. - This op is not available in SIMT mode. - Example 1: It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64] ```mlir %a = memref.alloc() : memref<1024xf32> @@ -713,6 +711,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { or 1 in SIMT mode. scalar offset is also valid for SIMT mode. - `mask`: is a vector of `i1` type, which is used to mask out the memory access. mask is a vector of size equal to the subgroup size, or 1 in SIMT mode. + scalar mask is also valid for SIMT mode. - `chunk_size`: (optional) represents contiguous number of elements to load from per work item. - `l1_hint`, `l2_hint`, `l3_hint`: are optional cache hints for each level of cache. @@ -767,7 +766,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { let arguments = (ins XeGPU_GatherScatterSourceType:$source, Optional>:$offsets, - XeGPU_MaskType:$mask, OptionalAttr:$chunk_size, + AnyTypeOf<[XeGPU_MaskType, I1]>:$mask, OptionalAttr:$chunk_size, OptionalAttr:$l1_hint, OptionalAttr:$l2_hint, OptionalAttr:$l3_hint); @@ -846,6 +845,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { or 1 in SIMT mode. scalar offset is also valid for SIMT mode. - `mask`: is a vector of `i1` type, which is used to mask out the memory access. mask is a vector of size equal to the subgroup size, or 1 in SIMT mode. + scalar mask is also valid for SIMT mode. - `chunk_size`: (optional) represents contiguous number of elements to store to per work item. - `l1_hint`, `l2_hint`, `l3_hint`: are optional cache hints for each level of cache. @@ -896,7 +896,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { let arguments = (ins XeGPU_ValueType:$value, XeGPU_GatherScatterSourceType:$dest, Optional>:$offsets, - XeGPU_MaskType:$mask, OptionalAttr:$chunk_size, + AnyTypeOf<[XeGPU_MaskType, I1]>:$mask, OptionalAttr:$chunk_size, OptionalAttr:$l1_hint, OptionalAttr:$l2_hint, OptionalAttr:$l3_hint); @@ -957,8 +957,6 @@ def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset", update the offset per work-item, so its offsets contains values representing shifts for each work-item. - This op is not available in SIMT mode. - Example: ```mlir %off = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex> From 9555b06c5192a6eff4f41038917ee64776a40759 Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" Date: Thu, 21 Aug 2025 23:44:13 +0000 Subject: [PATCH 09/11] Allow scalar mask/offsets/value variant for load/store and update op validation. 
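LoadGatherOp's result and StoreScatterOp's value may now be a plain scalar in
addition to a 1D vector, and the mask and offsets operands may likewise be
scalar i1 / index values. The buffer-form verifier helper now also receives
the offsets type: a scalar result/value requires chunk_size == 1 together
with scalar mask and offsets, and when mask and offsets are scalar but the
value is still a vector, only the value length is checked against the chunk
size.

For reference, the per-lane SIMT forms this enables look roughly like the
following (mirroring the ops.mlir tests added later in the series; the SSA
names are placeholders):

```mlir
// One element per lane: scalar offset, scalar mask, scalar value.
%x = xegpu.load %src[%off], %m : memref<256xf16>, index, i1 -> f16
xegpu.store %x, %src[%off], %m : f16, memref<256xf16>, index, i1
```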
--- .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 4 +- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 38 +++++++++++++------ 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index b8d706bd9e6cb..e6661925d560e 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -770,7 +770,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { OptionalAttr:$l1_hint, OptionalAttr:$l2_hint, OptionalAttr:$l3_hint); - let results = (outs XeGPU_ValueType: $value); + let results = (outs AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$value); let extraClassDeclaration = extraBaseClassDeclaration # [{ @@ -893,7 +893,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { }]; - let arguments = (ins XeGPU_ValueType:$value, + let arguments = (ins AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$value, XeGPU_GatherScatterSourceType:$dest, Optional>:$offsets, AnyTypeOf<[XeGPU_MaskType, I1]>:$mask, OptionalAttr:$chunk_size, diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index cf5da7a416846..497ae5a8b589f 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -82,13 +82,18 @@ isValidGatherScatterParams(Type maskTy, VectorType valueTy, if (!tdescTy.isScattered()) return emitError() << "Expects a scattered TensorDesc."; - if (!valueTy) - return emitError() << "Expecting a vector type result."; + auto chunkSize = tdescTy.getChunkSizeAsInt(); + if (!valueTy) { + if (chunkSize > 1) + return emitError() << "Expecting chunk size == 1 for scalar result"; + if (dyn_cast(maskTy)) + return emitError() << "Expecting a vector type result."; + return success(); + } auto maskShape = getShapeOf(maskTy); auto valueShape = getShapeOf(valueTy); auto tdescShape = getShapeOf(tdescTy); - auto chunkSize = tdescTy.getChunkSizeAsInt(); if (valueTy.getElementType() != tdescTy.getElementType()) return emitError() @@ -117,13 +122,21 @@ isValidGatherScatterParams(Type maskTy, VectorType valueTy, } static LogicalResult -isValidGatherScatterBufferParams(Type maskTy, VectorType valueTy, - int64_t chunkSize, +isValidGatherScatterBufferParams(Type offsetsTy, Type maskTy, + VectorType valueTy, int64_t chunkSize, function_ref emitError) { - if (!valueTy) - return emitError() << "Expecting a vector type result."; - + if (!valueTy) { + if (chunkSize > 1) + return emitError() << "Expecting chunk size == 1 for scalar result"; + auto maskVecTy = dyn_cast(maskTy); + auto offsetsVecTy = dyn_cast(offsetsTy); + if (maskVecTy || offsetsVecTy) + return emitError() << "Expecting scalar mask and offsets."; + else if (maskVecTy && offsetsVecTy) + return emitError() << "Expecting a vector type result."; + return success(); + } auto maskShape = getShapeOf(maskTy); auto valueShape = getShapeOf(valueTy); @@ -142,9 +155,8 @@ isValidGatherScatterBufferParams(Type maskTy, VectorType valueTy, return emitError() << "Mask should match value except the chunk size dim."; } - llvm::SmallVector expectedMaskShape(valueShape); - if (chunkSize > 1) + if (maskSize > 1 && chunkSize > 1) expectedMaskShape.pop_back(); if (expectedMaskShape != maskShape) return emitError() << "Mask should match value except the chunk size dim."; @@ -776,7 +788,8 @@ LogicalResult LoadGatherOp::verify() { if (memTy && (valueTy.getElementType() != memTy.getElementType())) return emitError() << "Value should 
have the same element type as MemRef."; - return isValidGatherScatterBufferParams(maskTy, valueTy, chunkSize, + auto offsetsTy = getOffsets().getType(); + return isValidGatherScatterBufferParams(offsetsTy, maskTy, valueTy, chunkSize, [&]() { return emitOpError(); }); } @@ -826,7 +839,8 @@ LogicalResult StoreScatterOp::verify() { if (memTy && (valueTy.getElementType() != memTy.getElementType())) return emitError() << "Value should have the same element type as MemRef."; - return isValidGatherScatterBufferParams(maskTy, valueTy, chunkSize, + auto offsetsTy = getOffsets().getType(); + return isValidGatherScatterBufferParams(offsetsTy, maskTy, valueTy, chunkSize, [&]() { return emitOpError(); }); } From 95dadb9c6db3d4747bbe139bbd909eefa0386253 Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" Date: Fri, 22 Aug 2025 00:34:28 +0000 Subject: [PATCH 10/11] Update op validation logic for SIMT mode and add additional valid op tests. --- .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 7 ++- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 23 +++++--- mlir/test/Dialect/XeGPU/ops.mlir | 56 +++++++++++++++++++ 3 files changed, 78 insertions(+), 8 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index e6661925d560e..2439b0387d09b 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -901,7 +901,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { OptionalAttr:$l2_hint, OptionalAttr:$l3_hint); - let extraClassDeclaration = extraBaseClassDeclaration # [{ + let extraClassDeclaration = extraBaseClassDeclaration#[{ Type getDestType() { return getDest().getType(); } @@ -917,6 +917,11 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { return dyn_cast(getDestType()); } + mlir::Type getElementType() { + auto type = getValue().getType(); + return getElementTypeOrSelf(type); + } + VectorType getValueType() { return llvm::dyn_cast(getValue().getType()); } diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 497ae5a8b589f..25abc48aab7b8 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -126,26 +126,33 @@ isValidGatherScatterBufferParams(Type offsetsTy, Type maskTy, VectorType valueTy, int64_t chunkSize, function_ref emitError) { + auto maskVecTy = dyn_cast(maskTy); + auto offsetsVecTy = dyn_cast(offsetsTy); if (!valueTy) { if (chunkSize > 1) return emitError() << "Expecting chunk size == 1 for scalar result"; - auto maskVecTy = dyn_cast(maskTy); - auto offsetsVecTy = dyn_cast(offsetsTy); if (maskVecTy || offsetsVecTy) return emitError() << "Expecting scalar mask and offsets."; else if (maskVecTy && offsetsVecTy) return emitError() << "Expecting a vector type result."; return success(); } + + auto valueSize = valueTy.getNumElements(); + // SIMT mode with scalar mask and offsets. 
+ if (!maskVecTy && !offsetsVecTy) { + if (valueSize != chunkSize) + return emitError() << "value elements must match chunk size " + << chunkSize; + return success(); + } auto maskShape = getShapeOf(maskTy); auto valueShape = getShapeOf(valueTy); - auto maskVecTy = dyn_cast(maskTy); if (!maskVecTy) return emitError() << "Expecting a vector type mask."; int64_t maskSize = maskVecTy.getNumElements(); - auto valueSize = valueTy.getNumElements(); if (chunkSize > 1) { if ((valueTy.getRank() == 1) && (valueSize != chunkSize)) return emitError() << "value elements must match chunk size " @@ -156,7 +163,9 @@ isValidGatherScatterBufferParams(Type offsetsTy, Type maskTy, << "Mask should match value except the chunk size dim."; } llvm::SmallVector expectedMaskShape(valueShape); - if (maskSize > 1 && chunkSize > 1) + if (maskSize == 1) + return success(); + if (chunkSize > 1) expectedMaskShape.pop_back(); if (expectedMaskShape != maskShape) return emitError() << "Mask should match value except the chunk size dim."; @@ -785,7 +794,7 @@ LogicalResult LoadGatherOp::verify() { uint64_t chunkSize = static_cast(getChunkSize().value_or(1)); auto memTy = dyn_cast(srcTy); - if (memTy && (valueTy.getElementType() != memTy.getElementType())) + if (memTy && (getElementType() != memTy.getElementType())) return emitError() << "Value should have the same element type as MemRef."; auto offsetsTy = getOffsets().getType(); @@ -836,7 +845,7 @@ LogicalResult StoreScatterOp::verify() { uint64_t chunkSize = static_cast(getChunkSize().value_or(1)); auto memTy = dyn_cast(destTy); - if (memTy && (valueTy.getElementType() != memTy.getElementType())) + if (memTy && (getElementType() != memTy.getElementType())) return emitError() << "Value should have the same element type as MemRef."; auto offsetsTy = getOffsets().getType(); diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir index 35342eca1354c..f524ccfd7d52a 100644 --- a/mlir/test/Dialect/XeGPU/ops.mlir +++ b/mlir/test/Dialect/XeGPU/ops.mlir @@ -508,6 +508,34 @@ gpu.func @simt_load_3(%src: ui64) { gpu.return } +// CHECK: gpu.func @simt_load_4(%[[arg0:.*]]: memref<256xf16>, %[[arg1:.*]]: vector<1xindex>, %[[arg2:.*]]: vector<1xi1>) { +gpu.func @simt_load_4(%arg0: memref<256xf16>, %arg1: vector<1xindex>, %arg2: vector<1xi1>) { + // CHECK: %0 = xegpu.load %[[arg0]][%[[arg1]]], %[[arg2]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> + %0 = xegpu.load %arg0[%arg1], %arg2 <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> + gpu.return +} + +// CHECK: gpu.func @simt_load_5(%[[arg0:.*]]: memref<256xf16>, %[[arg1:.*]]: vector<1xindex>, %[[arg2:.*]]: vector<1xi1>) { +gpu.func @simt_load_5(%arg0: memref<256xf16>, %arg1: vector<1xindex>, %arg2: vector<1xi1>) { + // CHECK: %0 = xegpu.load %[[arg0]][%[[arg1]]], %[[arg2]] : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16> + %0 = xegpu.load %arg0[%arg1], %arg2 : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16> + gpu.return +} + +// CHECK: gpu.func @simt_load_6(%[[arg0:.*]]: memref<256xf16>, %[[arg1:.*]]: index, %[[arg2:.*]]: i1) { +gpu.func @simt_load_6(%arg0: memref<256xf16>, %arg1: index, %arg2: i1) { + // CHECK: %0 = xegpu.load %[[arg0]][%[[arg1]]], %[[arg2]] <{chunk_size = 8 : i64}> : memref<256xf16>, index, i1 -> vector<8xf16> + %0 = xegpu.load %arg0[%arg1], %arg2 <{chunk_size = 8 : i64}> : memref<256xf16>, index, i1 -> vector<8xf16> + gpu.return +} + +// CHECK: gpu.func 
@simt_load_7(%[[arg0:.*]]: memref<256xf16>, %[[arg1:.*]]: index, %[[arg2:.*]]: i1) { +gpu.func @simt_load_7(%arg0: memref<256xf16>, %arg1: index, %arg2: i1) { + // CHECK: %0 = xegpu.load %[[arg0]][%[[arg1]]], %[[arg2]] : memref<256xf16>, index, i1 -> f16 + %0 = xegpu.load %arg0[%arg1], %arg2 : memref<256xf16>, index, i1 -> f16 + gpu.return +} + // CHECK: gpu.func @subgroup_load_4(%[[arg0:.*]]: ui64) { gpu.func @subgroup_load_4(%src: ui64) { //CHECK: %[[cst:.*]] = arith.constant dense<{{.*}}> : vector<2x4xindex> @@ -621,6 +649,34 @@ gpu.func @simt_store_3(%src: ui64) { gpu.return } +// CHECK: gpu.func @simt_store_4(%[[arg0:.*]]: vector<8xf16>, %[[arg1:.*]]: memref<256xf16>, %[[arg2:.*]]: vector<1xindex>, %[[arg3:.*]]: vector<1xi1>) { +gpu.func @simt_store_4(%arg0: vector<8xf16>, %arg1: memref<256xf16>, %arg2: vector<1xindex>, %arg3: vector<1xi1>) { + // CHECK: xegpu.store %[[arg0]], %[[arg1]][%[[arg2]]], %[[arg3]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> + xegpu.store %arg0, %arg1[%arg2], %arg3 <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> + gpu.return +} + +// CHECK: gpu.func @simt_store_5(%[[arg0:.*]]: vector<8xf16>, %[[arg1:.*]]: memref<256xf16>, %[[arg2:.*]]: index, %[[arg3:.*]]: i1) { +gpu.func @simt_store_5(%arg0: vector<8xf16>, %arg1: memref<256xf16>, %arg2: index, %arg3: i1) { + // CHECK: xegpu.store %[[arg0]], %[[arg1]][%[[arg2]]], %[[arg3]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, index, i1 + xegpu.store %arg0, %arg1[%arg2], %arg3 <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, index, i1 + gpu.return +} + +// CHECK: gpu.func @simt_store_6(%[[arg0:.*]]: vector<1xf16>, %[[arg1:.*]]: memref<256xf16>, %[[arg2:.*]]: vector<1xindex>, %[[arg3:.*]]: vector<1xi1>) { +gpu.func @simt_store_6(%arg0: vector<1xf16>, %arg1: memref<256xf16>, %arg2: vector<1xindex>, %arg3: vector<1xi1>) { + // CHECK: xegpu.store %[[arg0]], %[[arg1]][%[[arg2]]], %[[arg3]] : vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> + xegpu.store %arg0, %arg1[%arg2], %arg3 : vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> + gpu.return +} + +// CHECK: gpu.func @simt_store_7(%[[arg0:.*]]: f16, %[[arg1:.*]]: memref<256xf16>, %[[arg2:.*]]: index, %[[arg3:.*]]: i1) { +gpu.func @simt_store_7(%arg0: f16, %arg1: memref<256xf16>, %arg2: index, %arg3: i1) { + // CHECK: xegpu.store %[[arg0]], %[[arg1]][%[[arg2]]], %[[arg3]] : f16, memref<256xf16>, index, i1 + xegpu.store %arg0, %arg1[%arg2], %arg3 : f16, memref<256xf16>, index, i1 + gpu.return +} + // CHECK: gpu.func @subgroup_store_4(%[[arg0:.*]]: ui64) { gpu.func @subgroup_store_4(%src: ui64) { //CHECK: %[[cst:.*]] = arith.constant dense<{{.*}}> : vector<2x4xindex> From 269ff58d645131fac8b3cf0140b5cbbe8a96bf4f Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" Date: Fri, 22 Aug 2025 18:22:38 +0000 Subject: [PATCH 11/11] xegpu.prefetch : Add attribute to use in case is a pointer (ui64, ui32, i64, i32) --- mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 15 ++++++++++++++- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 10 +++++++++- mlir/test/Dialect/XeGPU/invalid.mlir | 16 ++++++++++++++++ mlir/test/Dialect/XeGPU/ops.mlir | 4 ++-- 4 files changed, 41 insertions(+), 4 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index 2439b0387d09b..afef9f1ef8138 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -613,6 +613,8 @@ 
def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> { offsets is a vector of `index` type and vector length is either the subgroup size or 1 in SIMT mode. scalar offset is also valid for SIMT mode. - `l1_hint`, `l2_hint`, `l3_hint`: are optional cache hints for each level of cache. + - `offset_align_byte`: required if `source` is a pointer. If `source` is not a pointer, + it is not allowed. Represents the alignment in bytes of each offset in offsets. Example 1: ```mlir @@ -645,13 +647,24 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> { : memref<256xf32>, vector<1xindex> ``` + Example 4 (SIMT mode): + SIMT mode only accepts the offsets variant. + ```mlir + xegpu.prefetch %0[%1] {l1_hint = #xegpu.cache_hint, + l2_hint = #xegpu.cache_hint, + l3_hint = #xegpu.cache_hint, + offset_align_byte = 2} + : i64, vector<1xindex> + ``` + }]; let arguments = (ins XeGPU_GatherScatterSourceType:$source, Optional>:$offsets, OptionalAttr:$l1_hint, OptionalAttr:$l2_hint, - OptionalAttr:$l3_hint); + OptionalAttr:$l3_hint, + OptionalAttr:$offset_align_byte); let extraClassDeclaration = extraBaseClassDeclaration # [{ Type getSourceType() { diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 25abc48aab7b8..d6e8d52ba5085 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -751,6 +751,13 @@ LogicalResult PrefetchOp::verify() { if (!isReadHintOrNone(getL3HintAttr())) return emitOpError("invalid l3_hint: ") << getL3HintAttr(); + auto srcTy = getSourceType(); + if (srcTy.isInteger() && !getOffsetAlignByteAttr()) + return emitOpError("offset_align_byte is required with integer source."); + + if (getOffsetAlignByteAttr() && !srcTy.isInteger()) + return emitOpError("offset_align_byte only allowed with integer source."); + return success(); } @@ -758,7 +765,8 @@ void PrefetchOp::build(OpBuilder &builder, OperationState &state, Value source, xegpu::CachePolicyAttr l1_hint, xegpu::CachePolicyAttr l2_hint, xegpu::CachePolicyAttr l3_hint) { - build(builder, state, source, Value(), l1_hint, l2_hint, l3_hint); + build(builder, state, source, Value(), l1_hint, l2_hint, l3_hint, + IntegerAttr{}); } //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir index c076ac78b9edd..228ef69d9a478 100644 --- a/mlir/test/Dialect/XeGPU/invalid.mlir +++ b/mlir/test/Dialect/XeGPU/invalid.mlir @@ -409,6 +409,22 @@ func.func @prefetch_offset_wi_3(%src: memref<16xf32>) { return } +// ----- +func.func @prefetch_offset_wi_4(%src: memref<16xf32>) { + %offsets = arith.constant dense<[0]> : vector<1xindex> + // expected-error@+1 {{offset_align_byte only allowed with integer source.}} + xegpu.prefetch %src[%offsets] <{offset_align_byte = 4}>: memref<16xf32>, vector<1xindex> + return +} + +// ----- +func.func @prefetch_offset_wi_5(%src: i64) { + %offsets = arith.constant dense<[0]> : vector<1xindex> + // expected-error@+1 {{offset_align_byte is required with integer source.}} + xegpu.prefetch %src[%offsets] : i64, vector<1xindex> + return +} + // ----- func.func @load_gather_offset_sg(%src: memref) { %offsets = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir index f524ccfd7d52a..bb379024a34d7 100644 --- a/mlir/test/Dialect/XeGPU/ops.mlir +++ b/mlir/test/Dialect/XeGPU/ops.mlir @@ -718,8 +718,8 @@ gpu.func @prefetch(%src: ui64) { gpu.func @prefetch_offset(%src: 
ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  // CHECK: xegpu.prefetch %[[arg0]][%cst] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : ui64, vector<4xindex>
-  xegpu.prefetch %src[%0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: ui64, vector<4xindex>
+  // CHECK: xegpu.prefetch %[[arg0]][%cst] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, offset_align_byte = 2 : i64}> : ui64, vector<4xindex>
+  xegpu.prefetch %src[%0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, offset_align_byte = 2}>: ui64, vector<4xindex>
   gpu.return
 }