[mlir][xegpu] Add SIMT distribution patterns for UpdateNdOffset and PrefetchNd ops. #138033
Conversation
@llvm/pr-subscribers-mlir @llvm/pr-subscribers-mlir-gpu

Author: Charitha Saumya (charithaintc)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/138033.diff

4 Files Affected:
- mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
- mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
- mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
- mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 5fa18754305ca..a892f701f724e 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -409,7 +409,7 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
}
def XeGPU_UpdateNdOffsetOp : XeGPU_Op<"update_nd_offset",
- [AllTypesMatch<["TensorDesc", "result"]>]> {
+ [Pure, AllTypesMatch<["TensorDesc", "result"]>]> {
let summary = "It updates the offsets for the TensorDesc.";
let description = [{The op updates the offset of the given TensorDesc.
The offsets are relative offset to the current position in the number
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 019032f7743bf..c7128666da7e8 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -301,6 +301,10 @@ class LayoutInfoPropagation
ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results);
+ void visitPrefetchNdOp(xegpu::PrefetchNdOp prefetch,
+ ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results);
+
void visitVectorMultiReductionOp(vector::MultiDimReductionOp reduction,
ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results);
@@ -352,6 +356,9 @@ LogicalResult LayoutInfoPropagation::visitOperation(
.Case<xegpu::UpdateNdOffsetOp>([&](auto updateNdOffsetOp) {
visitUpdateNdOffsetOp(updateNdOffsetOp, operands, results);
})
+ .Case<xegpu::PrefetchNdOp>([&](auto prefetchNdOp) {
+ visitPrefetchNdOp(prefetchNdOp, operands, results);
+ })
// No need to propagate the layout to operands in CreateNdDescOp because
// they are scalars (offsets, sizes, etc.).
.Case<xegpu::CreateNdDescOp>([&](auto createNdDescOp) {})
@@ -381,6 +388,18 @@ LogicalResult LayoutInfoPropagation::visitOperation(
return success();
}
+void LayoutInfoPropagation::visitPrefetchNdOp(
+ xegpu::PrefetchNdOp prefetch, ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results) {
+ // Here we assign the default layout to the tensor descriptor operand of
+ // prefetch.
+ auto tdescTy = prefetch.getTensorDescType();
+ auto prefetchLayout = getDefaultLayoutInfo(
+ VectorType::get(tdescTy.getShape(), tdescTy.getElementType()));
+ // Propagate the layout to the source tensor descriptor.
+ propagateIfChanged(operands[0], operands[0]->meet(prefetchLayout));
+}
+
void LayoutInfoPropagation::visitVectorMultiReductionOp(
vector::MultiDimReductionOp reduction,
ArrayRef<LayoutInfoLattice *> operands,
@@ -1173,7 +1192,7 @@ struct StoreNdDistribution final : public gpu::WarpDistributionPattern {
newStoreOperands.push_back(resolveDistributedTy(
newWarpOp.getResult(newRetIndices[0]),
storeNdDistributedValueTyOrFailure.value(), rewriter));
- // For the tensor descriptor operand, the layout attibute is dropped after
+ // For the tensor descriptor operand, the layout attribute is dropped after
// distribution. Types needs to be resolved in this case also.
xegpu::TensorDescType distributedTensorDescTy =
dropLayouts(storeOp.getTensorDescType());
@@ -1412,6 +1431,150 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern {
}
};
+/// Sink an update_nd_offset op feeding into yield op of an enclosing
+/// `gpu.warp_execute_on_lane_0` region. The warp op will still contain the
+/// original op that will not be used by the yield op (and should be cleaned
+/// up later). The yield op will bypass the updateOp's arguments. The tensor
+/// descriptor type is not distributed. Appropriate cast ops are inserted if
+/// the distributed types does not match expected xegpu SIMT types.
+/// Example:
+/// ```
+/// #lo0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
+/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
+/// (!xegpu.tensor_desc<4x8xf32, #lo0>) {
+/// ...
+/// %update = xegpu.update_nd_offset %arg0, [%c32, %c16]:
+/// !xegpu.tensor_desc<4x8xf32, #lo0>
+/// gpu.yield %update
+/// }
+/// ...
+/// ```
+/// To
+/// ```
+/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>,
+/// !xegpu.tensor_desc<4x8xf32, #lo0>) {
+/// ...
+/// %dead = xegpu.update_nd_offset %arg0, [%c32, %c16]:
+/// !xegpu.tensor_desc<4x8xf32, #lo0> gpu.yield %dead, %arg0
+/// gup.yield %dead, %arg0, %c32, %c16
+/// }
+/// %0 = xegpu.unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32,
+/// #lo0> -> !xegpu.tensor_desc<4x8xf32>
+/// %1 = xegpu.update_nd_offset %0, [%c32, %c16]:
+/// !xegpu.tensor_desc<4x8xf32>
+/// ...
+/// ```
+struct UpdateNdOffsetDistribution final : public gpu::WarpDistributionPattern {
+ using gpu::WarpDistributionPattern::WarpDistributionPattern;
+ LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+ PatternRewriter &rewriter) const override {
+ OpOperand *operand =
+ getWarpResult(subgroupOp, llvm::IsaPred<xegpu::UpdateNdOffsetOp>);
+ if (!operand)
+ return rewriter.notifyMatchFailure(
+ subgroupOp, "warp result is not a xegpu::UpdateNdOffset op");
+ auto updateOp = operand->get().getDefiningOp<xegpu::UpdateNdOffsetOp>();
+ unsigned operandIdx = operand->getOperandNumber();
+ // new update op does not have layout attribute.
+ xegpu::TensorDescType newTensorDescTy =
+ dropLayouts(updateOp.getTensorDescType());
+
+ SmallVector<Value, 3> newYieldValues;
+ SmallVector<Type, 3> newYieldTypes;
+ for (Value operand : updateOp->getOperands()) {
+ newYieldValues.push_back(operand);
+ if (isa<xegpu::TensorDescType>(operand.getType())) {
+ newYieldTypes.push_back(newTensorDescTy);
+ } else {
+ newYieldTypes.push_back(operand.getType());
+ }
+ }
+ SmallVector<size_t> newRetIndices;
+ gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+ rewriter, subgroupOp, newYieldValues, newYieldTypes, newRetIndices);
+ rewriter.setInsertionPointAfter(newWarpOp);
+ SmallVector<Value> newUpdateOperands;
+ for (size_t i : newRetIndices) {
+ // For the tensor descriptor operand, the layout attribute is dropped
+ // after distribution. Types needs to be resolved in this case.
+ if (isa<xegpu::TensorDescType>(newWarpOp.getResult(i).getType())) {
+ newUpdateOperands.push_back(resolveDistributedTy(
+ newWarpOp.getResult(i), newTensorDescTy, rewriter));
+ } else {
+ newUpdateOperands.push_back(newWarpOp.getResult(i));
+ }
+ }
+ // Create a new update op outside the warp op.
+ auto newUpdateOp = rewriter.create<xegpu::UpdateNdOffsetOp>(
+ newWarpOp.getLoc(), newTensorDescTy, newUpdateOperands,
+ removeTemporaryLayoutAttributes(updateOp->getAttrs()));
+ Value distributedVal = newWarpOp.getResult(operandIdx);
+ rewriter.replaceAllUsesWith(distributedVal, newUpdateOp);
+ return success();
+ }
+};
+
+/// Distribute a prefetch_nd op at the end of enclosing
+/// `gpu.warp_execute_on_lane_0`. In case arguments for the prefetch are passed
+/// through the warp op interface they would be propagated as returned values.
+/// Appropriate cast ops are inserted if the distributed types does not match
+/// expected xegpu SIMT types.
+///
+/// Example:
+///
+/// ```
+/// #lo0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
+/// gpu.warp_execute_on_lane_0(%laneid) -> () {
+/// ...
+/// xegpu.prefetch_nd %arg0 : !xegpu.tensor_desc<4x8xf32, #lo0>
+/// }
+/// ```
+/// To
+/// ```
+/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (
+// !xegpu.tensor_desc<4x8xf32, #lo0>) {
+/// gpu.yield %arg0: !xegpu.tensor_desc<4x8xf32, #lo0>
+/// }
+/// %1 = unrealized_conversion_cast %r#0: !xegpu.tensor_desc<4x8xf32,
+/// #lo0> -> !xegpu.tensor_desc<4x8xf32>
+/// xegpu.prefetch_nd %0 : !xegpu.tensor_desc<4x8xf32>
+///
+/// ```
+struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern {
+ using gpu::WarpDistributionPattern::WarpDistributionPattern;
+ LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+ PatternRewriter &rewriter) const override {
+ auto yield = cast<gpu::YieldOp>(
+ subgroupOp.getBodyRegion().getBlocks().begin()->getTerminator());
+ Operation *lastNode = yield->getPrevNode();
+ auto prefetchOp = dyn_cast_or_null<xegpu::PrefetchNdOp>(lastNode);
+ if (!prefetchOp)
+ return failure();
+ xegpu::LayoutAttr layout = prefetchOp.getTensorDescType().getLayoutAttr();
+ if (!layout)
+ return rewriter.notifyMatchFailure(
+ prefetchOp, "the source tensor descriptor lacks layout attribute");
+
+ SmallVector<Value, 1> newYieldValues = {prefetchOp.getTensorDesc()};
+ SmallVector<Type, 1> newYieldTypes = {prefetchOp.getTensorDescType()};
+ SmallVector<size_t> newRetIndices;
+ gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+ rewriter, subgroupOp, newYieldValues, newYieldTypes, newRetIndices);
+ // Create a new prefetch op outside the warp op with updated tensor
+ // descriptor type. Source tensor descriptor require type resolution.
+ xegpu::TensorDescType newTensorDescTy =
+ dropLayouts(prefetchOp.getTensorDescType());
+ rewriter.setInsertionPointAfter(newWarpOp);
+ SmallVector<Value> newPrefetchOperands = {resolveDistributedTy(
+ newWarpOp.getResult(newRetIndices[0]), newTensorDescTy, rewriter)};
+ rewriter.create<xegpu::PrefetchNdOp>(
+ newWarpOp.getLoc(), TypeRange{}, newPrefetchOperands,
+ removeTemporaryLayoutAttributes(prefetchOp->getAttrs()));
+ rewriter.eraseOp(prefetchOp);
+ return success();
+ }
+};
+
} // namespace
namespace {
@@ -1430,7 +1593,8 @@ struct XeGPUSubgroupDistributePass final
void xegpu::populateXeGPUSubgroupDistributePatterns(
RewritePatternSet &patterns) {
patterns.add<CreateNdDescDistribution, StoreNdDistribution,
- LoadNdDistribution, DpasDistribution>(patterns.getContext());
+ LoadNdDistribution, DpasDistribution, PrefetchNdDistribution,
+ UpdateNdOffsetDistribution>(patterns.getContext());
}
void XeGPUSubgroupDistributePass::runOnOperation() {
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
index f8f2cd55c28d0..1df0520980766 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
@@ -160,3 +160,69 @@ gpu.func @create_nd_tdesc_non_memref(%arg0: ui64, %arg1: ui64,
gpu.return
}
}
+
+// -----
+// CHECK-LABEL: gpu.func @update_nd_offset_1d(
+// CHECK: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>) {
+// CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<1xf32>
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf32> -> !xegpu.tensor_desc<16xf32>
+// CHECK: %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%c32] : !xegpu.tensor_desc<16xf32>
+// CHECK: xegpu.store_nd %[[CST]], %[[T1]] : vector<1xf32>, !xegpu.tensor_desc<16xf32>
+gpu.module @test {
+gpu.func @update_nd_offset_1d(%arg0: memref<256xf32>){
+ %c0 = arith.constant 0 : index
+ %c32 = arith.constant 32 : index
+ %1 = arith.constant dense<1.000000e+00> : vector<16xf32>
+ %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf32> -> !xegpu.tensor_desc<16xf32>
+ %2 = xegpu.update_nd_offset %0, [%c32] : !xegpu.tensor_desc<16xf32>
+ xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
+ gpu.return
+}
+}
+
+// -----
+// CHECK-LABEL: gpu.func @update_nd_offset_2d
+// CHECK: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf32>) {
+// CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16xf32>
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32>
+// CHECK: %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%c32, %c32] : !xegpu.tensor_desc<16x16xf32>
+// CHECK: xegpu.store_nd %[[CST]], %[[T1]] : vector<16xf32>, !xegpu.tensor_desc<16x16xf32>
+gpu.module @test {
+gpu.func @update_nd_offset_2d(%arg0: memref<256x256xf32>){
+ %c0 = arith.constant 0 : index
+ %c32 = arith.constant 32 : index
+ %1 = arith.constant dense<1.000000e+00> : vector<16x16xf32>
+ %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32>
+ %2 = xegpu.update_nd_offset %0, [%c32, %c32] : !xegpu.tensor_desc<16x16xf32>
+ xegpu.store_nd %1, %2 : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32>
+ gpu.return
+}
+}
+
+// -----
+// CHECK-LABEL: gpu.func @prefetch_2d
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf16>) {
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16>
+// CHECK: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16>
+gpu.module @test {
+gpu.func @prefetch_2d(%arg0: memref<256x256xf16>){
+ %c0 = arith.constant 0 : index
+ %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16>
+ xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<16x16xf16>
+ gpu.return
+}
+}
+
+// -----
+// CHECK-LABEL: gpu.func @prefetch_1d
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf16> -> !xegpu.tensor_desc<16xf16>
+// CHECK: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16>
+gpu.module @test {
+gpu.func @prefetch_1d(%arg0: memref<256xf16>){
+ %c0 = arith.constant 0 : index
+ %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf16> -> !xegpu.tensor_desc<16xf16>
+ xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<16xf16>
+ gpu.return
+}
+}
diff --git a/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir b/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir
index a5468681e68dc..c7c82fc8dbb3c 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir
@@ -561,3 +561,62 @@ func.func @test_vector_inner_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.t
xegpu.store_nd %0, %arg1 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
return
}
+
+// -----
+// CHECK: function: update_nd_offset_1d:
+// CHECK: op : %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16xf32>
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
+// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}}[%{{.*}}] : memref<256xf32> -> !xegpu.tensor_desc<16xf32>
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
+// CHECK-NEXT: op : %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%{{.*}}] : !xegpu.tensor_desc<16xf32>
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
+func.func @update_nd_offset_1d(%arg0: memref<256xf32>){
+ %c0 = arith.constant 0 : index
+ %c32 = arith.constant 32 : index
+ %1 = arith.constant dense<1.000000e+00> : vector<16xf32>
+ %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf32> -> !xegpu.tensor_desc<16xf32>
+ %2 = xegpu.update_nd_offset %0, [%c32] : !xegpu.tensor_desc<16xf32>
+ xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
+ return
+}
+
+// -----
+// CHECK: function: update_nd_offset_2d:
+// CHECK: op : %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16x16xf32>
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
+// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}}[%{{.*}}] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32>
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
+// CHECK-NEXT: op : %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%{{.*}}] : !xegpu.tensor_desc<16x16xf32>
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
+func.func @update_nd_offset_2d(%arg0: memref<256x256xf32>){
+ %c0 = arith.constant 0 : index
+ %c32 = arith.constant 32 : index
+ %1 = arith.constant dense<1.000000e+00> : vector<16x16xf32>
+ %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32>
+ %2 = xegpu.update_nd_offset %0, [%c32, %c32] : !xegpu.tensor_desc<16x16xf32>
+ xegpu.store_nd %1, %2 : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32>
+ return
+}
+
+// -----
+// CHECK: function: prefetch_2d:
+// CHECK: layout for result #0: Not assigned.
+// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}}[%{{.*}}] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16>
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
+func.func @prefetch_2d(%arg0: memref<256x256xf16>){
+ %c0 = arith.constant 0 : index
+ %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16>
+ xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<16x16xf16>
+ return
+}
+
+// -----
+// CHECK: function: prefetch_1d:
+// CHECK: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}}[%{{.*}}] : memref<256xf16> -> !xegpu.tensor_desc<16xf16>
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
+func.func @prefetch_1d(%arg0: memref<256xf16>){
+ %c0 = arith.constant 0 : index
+ %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf16> -> !xegpu.tensor_desc<16xf16>
+ xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<16xf16>
+ return
+}
@Garra1980 Please take a look.

@charithaintc could you help to add a description for the PR?

Added.
/// ...
/// %dead = xegpu.update_nd_offset %arg0, [%c32, %c16]:
/// !xegpu.tensor_desc<4x8xf32, #lo0> gpu.yield %dead, %arg0
/// gup.yield %dead, %arg0, %c32, %c16
Is this a typo? It has two `gpu.yield`s here.
Why is it dead while still being used by `gpu.yield`?
> Is this a typo? It has two `gpu.yield`s here.

Fixed.
> Why is it dead while still being used by `gpu.yield`?

`%dead` is yielded, but it no longer has any users outside the warp op, so if you run DCE it gets removed. Upstream has a separate pattern called `warpOpDeadResult` to get rid of this.
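To illustrate, a minimal sketch of the two stages, assuming the same shapes and `#lo0` layout as the doc comment above (the cleanup itself is done by the upstream warp-op dead-result folding plus DCE, not by this pattern):

```mlir
// Right after UpdateNdOffsetDistribution: %r#0 still carries %dead but has no
// users outside the warp op anymore.
%r:2 = gpu.warp_execute_on_lane_0(%laneid)
    -> (!xegpu.tensor_desc<4x8xf32, #lo0>, !xegpu.tensor_desc<4x8xf32, #lo0>) {
  %dead = xegpu.update_nd_offset %arg0, [%c32, %c16]
      : !xegpu.tensor_desc<4x8xf32, #lo0>
  gpu.yield %dead, %arg0
}

// After the upstream dead-result folding and DCE: the unused result and the
// update op inside the region are gone.
%r = gpu.warp_execute_on_lane_0(%laneid) -> (!xegpu.tensor_desc<4x8xf32, #lo0>) {
  gpu.yield %arg0 : !xegpu.tensor_desc<4x8xf32, #lo0>
}
```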
    SmallVector<Value> newUpdateOperands;
    for (size_t i : newRetIndices) {
      // For the tensor descriptor operand, the layout attribute is dropped
      // after distribution. Types needs to be resolved in this case.
What does `resolve` mean here?
In XeGPU SIMT code, the layout is dropped from the TensorDesc. But the upstream warp-op distribution infra does not understand the TensorDesc type, so it will still return the desc with the layout. `resolve` will add an `unrealized_cast` to go from the desc with the layout to the one without.
Example:
/// ```
/// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
/// gpu.warp_execute_on_lane_0(%laneid) -> () {
/// ...
/// xegpu.prefetch_nd %arg0 : !xegpu.tensor_desc<4x8xf32, #layout0>
/// }
/// ```
/// To
/// ```
/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (
// !xegpu.tensor_desc<4x8xf32, #layout0>) {
/// gpu.yield %arg0: !xegpu.tensor_desc<4x8xf32, #layout0>
/// }
/// %1 = unrealized_conversion_cast %r#0: !xegpu.tensor_desc<4x8xf32,
/// #layout0> -> !xegpu.tensor_desc<4x8xf32>
/// xegpu.prefetch_nd %0 : !xegpu.tensor_desc<4x8xf32>
I see. Thanks.
If `updateOp` is not used anymore, is it safe to drop all of its uses and remove it from the warp op?
This is done by the upstream pattern `warpDeadResult`, so we don't need to do anything here. It's a clean separation of concerns.
LGTM, thanks!
LGTM, left some nit comments
This PR adds support for SIMT distribution of UpdateNdOffset and PrefetchNd ops.
For both of these ops, distribution removes the layout attribute from the tensor descriptor type. Everything else remains unchanged.
Example 1:
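Roughly, as a sketch mirroring the `UpdateNdOffsetDistribution` doc comment in this patch (the 4x8 shape, `#lo0` layout, and constants are illustrative):

```mlir
#lo0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%r = gpu.warp_execute_on_lane_0(%laneid) -> (!xegpu.tensor_desc<4x8xf32, #lo0>) {
  ...
  %update = xegpu.update_nd_offset %arg0, [%c32, %c16]
      : !xegpu.tensor_desc<4x8xf32, #lo0>
  gpu.yield %update : !xegpu.tensor_desc<4x8xf32, #lo0>
}
```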
To
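roughly the following (sketch): the tensor descriptor is forwarded out of the warp op, an `unrealized_conversion_cast` drops the layout from its type, and the update op is recreated outside; the in-region update becomes dead and is removed by the upstream cleanup.

```mlir
%r:2 = gpu.warp_execute_on_lane_0(%laneid)
    -> (!xegpu.tensor_desc<4x8xf32, #lo0>, !xegpu.tensor_desc<4x8xf32, #lo0>) {
  ...
  %dead = xegpu.update_nd_offset %arg0, [%c32, %c16]
      : !xegpu.tensor_desc<4x8xf32, #lo0>
  gpu.yield %dead, %arg0
}
%0 = builtin.unrealized_conversion_cast %r#1
    : !xegpu.tensor_desc<4x8xf32, #lo0> to !xegpu.tensor_desc<4x8xf32>
%1 = xegpu.update_nd_offset %0, [%c32, %c16] : !xegpu.tensor_desc<4x8xf32>
```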
Example 2:
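Again as a sketch, following the `PrefetchNdDistribution` doc comment (illustrative shape and layout):

```mlir
#lo0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
gpu.warp_execute_on_lane_0(%laneid) -> () {
  ...
  xegpu.prefetch_nd %arg0 : !xegpu.tensor_desc<4x8xf32, #lo0>
}
```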
To
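roughly the following (sketch): the tensor descriptor is yielded out of the warp op, its layout is dropped through a cast, and the prefetch is recreated outside.

```mlir
%r = gpu.warp_execute_on_lane_0(%laneid) -> (!xegpu.tensor_desc<4x8xf32, #lo0>) {
  ...
  gpu.yield %arg0 : !xegpu.tensor_desc<4x8xf32, #lo0>
}
%0 = builtin.unrealized_conversion_cast %r
    : !xegpu.tensor_desc<4x8xf32, #lo0> to !xegpu.tensor_desc<4x8xf32>
xegpu.prefetch_nd %0 : !xegpu.tensor_desc<4x8xf32>
```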