adding more test examples
misc fixes

adding quick note for me to pick up later

break consumer of layout op into separate fusion

err

quick fix on logic

refactor to use resize for transform replay

clear allocation domain

revert resize change for allocation domain; wipe out allocation for layout op on fusion segment boundaries

trying to patch allocation domain handling during allocation

fix cases where allocation domain isn't available

fixing allocation transform and fixing tests
jjsjann123 committed Sep 17, 2025
commit 303f27dd4fb6e004f9982b1c8a427f77205acde3
5 changes: 5 additions & 0 deletions csrc/fusion_segmenter.cpp
@@ -1893,6 +1893,11 @@ std::pair<IrCloner, std::unique_ptr<Fusion>> SegmentedFusion::makeFusion(
if (inp->isDefinitionType<ReshapeOp>()) {
NVF_ERROR(clone_tv != nullptr && clone_tv->isA<TensorView>());
view_tvs.push_back(clone_tv->as<TensorView>());
} else if (inp->isDefinitionType<PreprocessGroupedMatmulInputSf>()) {
// There's no point in replaying the allocation domain if we cannot index
// into the TV anyway.
// TODO: check that all uses are safe
auto* tv_ptr = clone_tv->as<TensorView>();
tv_ptr->setAllocationDomain(tv_ptr->getLogicalDomain(), true);
}
}

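The intent here, as I read it: once a `PreprocessGroupedMatmulInputSf` output crosses a segment boundary, its custom allocation layout cannot be replayed, so the cloned segment input falls back to a contiguous logical layout. A minimal standalone sketch of that fallback, using toy types rather than nvFuser's real `TensorView` API:

```cpp
#include <cassert>
#include <vector>

// Toy stand-ins for nvFuser's logical/allocation domains: just ordered
// dimension ids. A custom allocation domain means "stored in a different
// order than the logical shape".
struct ToyTensorView {
  std::vector<int> logical_domain;
  std::vector<int> allocation_domain;
};

// Mirrors tv->setAllocationDomain(tv->getLogicalDomain(), true): drop the
// custom layout and treat the tensor as contiguous in its logical shape.
void wipeAllocationDomain(ToyTensorView& tv) {
  tv.allocation_domain = tv.logical_domain;
}

int main() {
  ToyTensorView tv{{0, 1}, {1, 0}};  // layout-swapped allocation
  wipeAllocationDomain(tv);
  assert(tv.allocation_domain == tv.logical_domain);
  return 0;
}
```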
6 changes: 6 additions & 0 deletions csrc/ops/indexing.cpp
@@ -342,6 +342,12 @@ TensorView* preprocessGroupedMatmulInputSf(
auto pad_to_max_extent = [&](IterDomain* id, int multiple) -> IterDomain* {
auto* maximum_pad_value_per_group =
IrBuilder::create<Val>(multiple - 1, DataType::Index);

// NOTE: a resize op sounds good in theory. In reality, we cannot index into
// this operation anyway, so I question how much a resize op is buying us.
// More importantly, resize still hits asserts in vectorization analysis
// (validateDeviceSplit ATM). I think this requires the expanded dimension to
// be constant in order for transform replay to work properly; we'll rely on
// concretization to turn the number of groups into a constant value.
// return IterDomain::resize(id, input->fusion()->zeroVal(DataType::Index), SimplifyingIrBuilder::mulExpr(num_groups, maximum_pad_value_per_group));

Val* padded_ext = SimplifyingIrBuilder::addExpr(
id->extent(),
SimplifyingIrBuilder::mulExpr(num_groups, maximum_pad_value_per_group));
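For reference, the padded extent computed above is just `extent + num_groups * (multiple - 1)`: every group may need up to `multiple - 1` rows of padding. A standalone arithmetic check (my sketch; the numbers match the tests below):

```cpp
#include <cstdio>

// Worst-case padded extent: each of the num_groups groups can contribute at
// most (multiple - 1) padding rows on top of the original extent.
long paddedExtent(long extent, long num_groups, long multiple) {
  return extent + num_groups * (multiple - 1);
}

int main() {
  // 512 rows in 3 groups, each rounded up to a multiple of 128: worst case
  // is 512 + 3 * 127 = 893 rows, an upper bound on the actual 768 used in
  // the tests.
  std::printf("%ld\n", paddedExtent(512, 3, 128));
  return 0;
}
```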
6 changes: 6 additions & 0 deletions csrc/scheduler/registry.cpp
@@ -91,6 +91,12 @@ bool checkCanSchedule(Fusion* fusion, SchedulerType scheduler_type) {
return false;
}

if (registry_utils::SchedulerTopologyChecker::hasConsumerOfNonIndexableOps(fusion)) {
scheduler_debug_utils::canScheduleRejectReason(
scheduler_type, "Fusion has a consumer of non-indexable ops.");
return false;
}

return true;
}

11 changes: 11 additions & 0 deletions csrc/scheduler/registry_utils.cpp
@@ -1030,6 +1030,17 @@ bool SchedulerTopologyChecker::hasResizeAndIndexOps(Fusion* fusion) {
return false;
}

bool SchedulerTopologyChecker::hasConsumerOfNonIndexableOps(Fusion* fusion) {
for (auto expr : fusion->exprs()) {
if (expr->isOneOf<PreprocessGroupedMatmulInputSf>()) {
if (!ir_utils::getTvOutput(expr)->uses().empty()) {
return true;
}
}
}
return false;
}

namespace {

// Return true when there's a producer-consumer relationship among a
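The new check is a straight scan: reject scheduling as soon as any non-indexable op's output has a consumer. A standalone analog with toy types (hypothetical names, not the real `Fusion`/`Expr` API), matching the relu-consumer test below:

```cpp
#include <cassert>
#include <vector>

// Toy stand-in for an expression in the fusion graph.
struct ToyExpr {
  bool non_indexable = false;  // e.g. PreprocessGroupedMatmulInputSf
  int num_output_uses = 0;     // number of consumers of the op's output
};

// Analog of SchedulerTopologyChecker::hasConsumerOfNonIndexableOps: true iff
// some non-indexable op's output feeds another expression.
bool hasConsumerOfNonIndexableOps(const std::vector<ToyExpr>& exprs) {
  for (const auto& e : exprs) {
    if (e.non_indexable && e.num_output_uses > 0) {
      return true;  // a consumer would need lowerSrcIndex, which can't work
    }
  }
  return false;
}

int main() {
  assert(!hasConsumerOfNonIndexableOps({{true, 0}}));  // output is terminal
  assert(hasConsumerOfNonIndexableOps({{true, 1}}));   // e.g. relu(out_tv)
  return 0;
}
```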
2 changes: 2 additions & 0 deletions csrc/scheduler/registry_utils.h
@@ -107,6 +107,8 @@ class SchedulerTopologyChecker {

static bool hasResizeAndIndexOps(Fusion* fusion);

static bool hasConsumerOfNonIndexableOps(Fusion* fusion);

// Checks if a series of reshape ops creates a cycle in the ID
// graph. It is not currently supported. For example,
// propagateReshapeTransforms won't work as it won't find any
91 changes: 87 additions & 4 deletions tests/cpp/test_layout_op.cpp
@@ -184,11 +184,8 @@ TEST_F(LayoutOpTest, SchedulerKernel) {
fusion.addInput(offsets);
fusion.addInput(rounded_offsets);

- auto inp_tv = set(inp);
auto out_tv = preprocessGroupedMatmulInputSf(
-     inp_tv, offsets, rounded_offsets, BlockScalingFactorLayout::Block128x4);
- // NOTE: output of preprocessGroupedMatmulInputSf needs to be on global
- // memory, because we do indexing on output inside the runtime function.
+     inp, offsets, rounded_offsets, BlockScalingFactorLayout::Block128x4);
fusion.addOutput(out_tv);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
@@ -212,4 +209,90 @@ TEST_F(LayoutOpTest, SchedulerKernel) {
t1,
t2));
}

TEST_F(LayoutOpTest, SchedulerKernelWithConsumer) {
auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();
FusionGuard fg(&fusion);

auto inp = makeSymbolicTensor(2);
auto offsets = makeSymbolicTensor(1, DataType::Int32);
auto rounded_offsets = makeSymbolicTensor(1, DataType::Int32);
fusion.addInput(inp);
fusion.addInput(offsets);
fusion.addInput(rounded_offsets);

auto out_tv = preprocessGroupedMatmulInputSf(
inp, offsets, rounded_offsets, BlockScalingFactorLayout::Block128x4);
fusion.addOutput(out_tv);

// FIXME: this is undefined and we should error out.
// FIXME: add validation for relu_tv.
// TODO: a consumer of the output of PreprocessGroupedMatmulInputSf needs to
// be segmented out, because indexing won't work in lowerSrcIndex. So this
// needs to be changed into some other operation that goes through expr_eval
// instead, maybe a matmul or something like that.
auto relu_tv = relu(out_tv);
fusion.addOutput(relu_tv);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
int m = 512;
int k = 9; // note: padded column size would be 12
auto t0 = at::randn({m, k}, options);
// tokens per group are [100, 150, 262] respectively, so each group is
// padded to a multiple of 128. Hence the total output rows span a
// length of 128 + 256 + 384 = 768.
auto t1 = at::tensor({0, 100, 250, 512}, options.dtype(at::kInt));
auto t2 = at::tensor({0, 128, 384, 768}, options.dtype(at::kInt));

// naive scheduling.
FusionExecutorCache executor_cache(std::move(fusion_ptr));
auto outputs = executor_cache.runFusionWithInputs({t0, t1, t2});

ASSERT_TRUE(validateGroupedLayout(
BlockScalingFactorLayout::Block128x4,
outputs[0].as<at::Tensor>(),
t0,
t1,
t2));
}
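The `t2` values in these tests can be derived from `t1` by rounding every group's row count up to a multiple of 128 and accumulating. A small helper sketch (mine, not part of the PR) that reproduces them:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Given cumulative group offsets, round each group's size up to `multiple`
// and return the cumulative padded offsets.
std::vector<int> roundedOffsets(const std::vector<int>& offsets, int multiple) {
  std::vector<int> out{0};
  for (std::size_t g = 1; g < offsets.size(); ++g) {
    int group_size = offsets[g] - offsets[g - 1];
    int padded = (group_size + multiple - 1) / multiple * multiple;
    out.push_back(out.back() + padded);
  }
  return out;
}

int main() {
  // Groups of 100, 150, and 262 tokens pad to 128, 256, and 384 rows.
  std::vector<int> t2 = roundedOffsets({0, 100, 250, 512}, 128);
  assert((t2 == std::vector<int>{0, 128, 384, 768}));
  return 0;
}
```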

TEST_F(LayoutOpTest, SchedulerKernelWithExplicitQuantization) {
auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();
FusionGuard fg(&fusion);

auto inp = makeSymbolicTensor(2);
auto offsets = makeSymbolicTensor(1, DataType::Int32);
auto rounded_offsets = makeSymbolicTensor(1, DataType::Int32);
fusion.addInput(inp);
fusion.addInput(offsets);
fusion.addInput(rounded_offsets);

auto block_size = IrBuilder::create<Val>(16, DataType::Int);
auto remainder = ceilDiv(inp->axis(1)->extent(), block_size);

auto reshaped_inp = reshape(inp, {inp->axis(0)->extent(), remainder, block_size});
auto blocked_sf = max(reshaped_inp, {2});
auto scaled_output = div(reshaped_inp, broadcast(blocked_sf, {false, false, true}));
// scaled_output = castOp(DataType::Float4_e2m1fn, scaled_output);
fusion.addOutput(scaled_output);

auto out_blocked_sf_fp8 = preprocessGroupedMatmulInputSf(
blocked_sf, offsets, rounded_offsets, BlockScalingFactorLayout::Block128x4);
// out_blocked_sf_fp8 = castOp(DataType::Float8_e4m3fn, out_blocked_sf_fp8);
fusion.addOutput(out_blocked_sf_fp8);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
int m = 512;
int k = 9; // note: padded column size would be 12
auto t0 = at::randn({m, k}, options);
// tokens per group are [100, 150, 262] respectively, so each group is
// padded to a multiple of 128. Hence the total output rows span a
// length of 128 + 256 + 384 = 768.
auto t1 = at::tensor({0, 100, 250, 512}, options.dtype(at::kInt));
auto t2 = at::tensor({0, 128, 384, 768}, options.dtype(at::kInt));

// naive scheduling.
FusionExecutorCache executor_cache(std::move(fusion_ptr));
auto outputs = executor_cache.runFusionWithInputs({t0, t1, t2});
}
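The fusion above follows the usual block-scaling recipe: reshape each row into blocks of 16, reduce with `max` to get a per-block scaling factor, then divide the block by it. A standalone sketch of the per-row computation (plain max, mirroring the test; real recipes typically use abs-max plus the FP8/FP4 casts the test leaves commented out):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

struct BlockQuantResult {
  std::vector<float> scaled;  // input divided by its block's scaling factor
  std::vector<float> sf;      // one scaling factor per block
};

BlockQuantResult quantizeRow(const std::vector<float>& row, size_t block = 16) {
  BlockQuantResult r;
  r.scaled.resize(row.size());
  for (size_t start = 0; start < row.size(); start += block) {
    size_t end = std::min(row.size(), start + block);
    // Per-block max, as in `max(reshaped_inp, {2})` in the fusion above.
    float m = *std::max_element(row.begin() + start, row.begin() + end);
    r.sf.push_back(m);
    for (size_t i = start; i < end; ++i) {
      // As in `div(reshaped_inp, broadcast(blocked_sf, ...))`.
      r.scaled[i] = row[i] / m;
    }
  }
  return r;
}

int main() {
  auto r = quantizeRow({1.f, 2.f, 4.f, 8.f}, 2);  // two blocks of 2
  std::printf("sf: %.0f %.0f\n", r.sf[0], r.sf[1]);  // prints: sf: 2 8
  return 0;
}
```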

} // namespace nvfuser