[Codegen] Use safer hoisting in OptimizeTensorInsertExtractSlices

Max191 · Max191 · commit f503a981db50 · 2026-01-27T11:54:46.000-06:00
Signed-off-by: Max Dawkins &lt;max.dawkins@gmail.com&gt;
diff --git a/compiler/src/iree/compiler/Codegen/Common/OptimizeTensorInsertExtractSlices.cpp b/compiler/src/iree/compiler/Codegen/Common/OptimizeTensorInsertExtractSlices.cpp
@@ -225,23 +225,6 @@ void hoistSubsetWithLoopInvariantTensor(RewriterBase &rewriter,
   }
 }
 
-void moveLoopInvariantCodeFromGenericOps(Operation *op) {
-  // linalg.generic operations are also loop-like, but they don't have
-  // LoopLikeOpInterface implemented for them.
-  op->walk([&](linalg::GenericOp genericOp) {
-    moveLoopInvariantCode(
-        &genericOp.getBodyRegion(),
-        [&](Value value, Region *) {
-          return !genericOp->isAncestor(value.getParentRegion()->getParentOp());
-        },
-        [&](Operation *op, Region *) {
-          return !isa<linalg::IndexOp>(op) && isMemoryEffectFree(op) &&
-                 isSpeculatable(op);
-        },
-        [&](Operation *op, Region *) { op->moveBefore(genericOp); });
-  });
-}
-
 namespace {
 struct CastLikeExtractSliceOpFolder final
     : OpRewritePattern<tensor::ExtractSliceOp> {
@@ -382,11 +365,8 @@ void OptimizeTensorInsertExtractSlicesPass::runOnOperation() {
     extractSliceOp->moveAfter(latestInsertionPoint);
   });
 
-  funcOp.walk([&](scf::ForOp forOp) { moveLoopInvariantCode(forOp); });
-  LDBG() << "after hoisting loop invariant code\n" << funcOp;
-
-  moveLoopInvariantCodeFromGenericOps(funcOp);
-  LDBG() << "after hoisting loop invariant code out of generic ops\n" << funcOp;
+  moveLoopInvariantCodeFromGuaranteedLoops(funcOp);
+  LDBG() << "after hoisting loop invariant code\n" << funcOp << "\n";
 
   // TODO: walking in some reverse / inside-out order would be more efficient
   // and would capture more cases.
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/optimize_tensor_insert_extract_slices.mlir b/compiler/src/iree/compiler/Codegen/Common/test/optimize_tensor_insert_extract_slices.mlir
@@ -229,42 +229,6 @@ func.func @batch_matmul_with_padding_strategy(%arg0: tensor<1x?x1280xf16>, %arg1
 
 // -----
 
-#pipeline_layout = #hal.pipeline.layout<bindings = [
-  #hal.pipeline.binding<storage_buffer>,
-  #hal.pipeline.binding<storage_buffer>
-]>
-func.func @_batch_matmul_narrow_n_2_dispatch_4_unpack_i32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDataTiling>} {
-  %c0_i32 = arith.constant 0 : i32
-  %c2 = arith.constant 2 : index
-  %c128 = arith.constant 128 : index
-  %c0 = arith.constant 0 : index
-  %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c128) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2x1x1x2x8xi32>>
-  %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2x3x2xi32>>
-  %workgroup_id_x = hal.interface.workgroup.id[0] : index
-  %workgroup_count_x = hal.interface.workgroup.count[0] : index
-  scf.for %arg0 = %workgroup_id_x to %c2 step %workgroup_count_x {
-    %2 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [%arg0, 0, 0], sizes = [1, 3, 2], strides = [1, 1, 1] : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2x3x2xi32>> -> tensor<1x3x2xi32>
-    %3 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [%arg0, 0, 0, 0, 0], sizes = [1, 1, 1, 2, 8], strides = [1, 1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2x1x1x2x8xi32>> -> tensor<1x1x1x2x8xi32>
-    %4 = vector.transfer_read %3[%c0, %c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x1x1x2x8xi32>, vector<2x8xi32>
-    %5 = vector.transpose %4, [1, 0] : vector<2x8xi32> to vector<8x2xi32>
-    %6 = tensor.empty() : tensor<3x2xi32>
-    %7 = vector.transfer_write %5, %6[%c0, %c0] {in_bounds = [false, true]} : vector<8x2xi32>, tensor<3x2xi32>
-    %inserted_slice = tensor.insert_slice %7 into %2[0, 0, 0] [1, 3, 2] [1, 1, 1] : tensor<3x2xi32> into tensor<1x3x2xi32>
-    iree_tensor_ext.dispatch.tensor.store %inserted_slice, %1, offsets = [%arg0, 0, 0], sizes = [1, 3, 2], strides = [1, 1, 1] : tensor<1x3x2xi32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2x3x2xi32>>
-  }
-  return
-}
-
-// CHECK-LABEL: func.func @_batch_matmul_narrow_n_2_dispatch_4_unpack_i32
-// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
-// CHECK:       %[[EMPTY:.+]] = tensor.empty() : tensor<3x2xi32>
-// CHECK:       scf.for
-// CHECK:         %[[READ:.+]] = vector.transfer_read
-// CHECK:         %[[TRANS:.+]] = vector.transpose %[[READ]], [1, 0] : vector<2x8xi32> to vector<8x2xi32>
-// CHECK:         vector.transfer_write %[[TRANS]], %[[EMPTY]][%[[C0]], %[[C0]]] {in_bounds = [false, true]} : vector<8x2xi32>, tensor<3x2xi32>
-
-// -----
-
 func.func @subset_hoisting_invariant_tensor(%init: tensor<64x64xf32>, %t: tensor<64x64xf32>) -> tensor<64x64xf32> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
@@ -373,3 +337,30 @@ func.func @licm_generic(%source: tensor<32x32xf16>, %idx : index) -> tensor<32x3
 // CHECK: linalg.generic
 // CHECK-NOT: tensor.extract
 // CHECK: return
+
+// -----
+
+// Verify that loop invariant ops are not hoisted from regions that may not be
+// executed.
+func.func @no_hoist_from_possibly_unexecuted_region(%arg0: tensor<4x8xi32>) -> tensor<8x4xi32> {
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c100 = arith.constant 100 : index
+  %workgroup_id_x = hal.interface.workgroup.id[0] : index
+  %0 = tensor.empty() : tensor<8x4xi32>
+  %1 = scf.for %arg1 = %workgroup_id_x to %c1 step %c100 iter_args(%arg2 = %0) -> tensor<8x4xi32> {
+    %2 = vector.transfer_read %arg0[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x8xi32>, vector<2x8xi32>
+    %3 = vector.transpose %2, [1, 0] : vector<2x8xi32> to vector<8x2xi32>
+    %4 = vector.transfer_write %3, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<8x2xi32>, tensor<8x4xi32>
+    scf.yield %4 : tensor<8x4xi32>
+  }
+  return %1 : tensor<8x4xi32>
+}
+
+// CHECK-LABEL: func.func @no_hoist_from_possibly_unexecuted_region
+// CHECK:       scf.for {{.*}} {
+// CHECK:         vector.transfer_read
+// CHECK:         vector.transpose
+// CHECK:         vector.transfer_write
+// CHECK:       }
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/winograd_pipeline_test.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/winograd_pipeline_test.mlir
@@ -43,10 +43,10 @@ func.func @winograd_input_transform() {
 }
 //   CHECK-LABEL:  func.func @winograd_input_transform
 //     CHECK-NOT:      memref.alloc
-//         CHECK:      vector.transfer_read
-//         CHECK:      vector.transfer_read
 //         CHECK:      scf.for
 //         CHECK:        scf.for
+//         CHECK:          vector.transfer_read
+//         CHECK:          vector.transfer_read
 //         CHECK:          scf.for
 //         CHECK:          vector.transfer_read
 //         CHECK:          vector.contract
@@ -71,10 +71,10 @@ func.func @winograd_output_transform() {
 }
 //   CHECK-LABEL:  func.func @winograd_output_transform
 //     CHECK-NOT:      memref.alloc
-//         CHECK:      vector.transfer_read
-//         CHECK:      vector.transfer_read
 //         CHECK:      scf.for
 //         CHECK:        scf.for
+//         CHECK:          vector.transfer_read
+//         CHECK:          vector.transfer_read
 //         CHECK:          scf.for
 //         CHECK:          vector.transfer_read
 //         CHECK:          vector.contract