adding more test examples
misc fixes

adding quick note for me to pick up later

break consumer of layout op into separate fusion

err

quick fix on logic

refactor to use resize for transform replay

clear allocation domain

revert resize change for allocation domain; wipe out allocation for layout op on fusion segment boundaries

trying to patch allocation domain handling during allocation

fix cases where allocation domain isn't available

fixing allocation transform and fixing tests
jjsjann123 committed Sep 17, 2025
commit 303f27dd4fb6e004f9982b1c8a427f77205acde3
5 changes: 5 additions & 0 deletions csrc/fusion_segmenter.cpp
@@ -1893,6 +1893,11 @@ std::pair<IrCloner, std::unique_ptr<Fusion>> SegmentedFusion::makeFusion(
if (inp->isDefinitionType<ReshapeOp>()) {
NVF_ERROR(clone_tv != nullptr && clone_tv->isA<TensorView>());
view_tvs.push_back(clone_tv->as<TensorView>());
} else if (inp->isDefinitionType<PreprocessGroupedMatmulInputSf>()) {
// There's no point in replaying the allocation domain if we cannot index
// into the TV anyway.
// TODO: check that all uses are safe
auto* tv_ptr = clone_tv->as<TensorView>();
tv_ptr->setAllocationDomain(tv_ptr->getLogicalDomain(), true);
}
}

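The intent here, as I read it: once a `PreprocessGroupedMatmulInputSf` output crosses a segment boundary, its custom allocation layout cannot be replayed, so the cloned segment input falls back to a contiguous logical layout. A minimal standalone sketch of that fallback, using toy types rather than nvFuser's real `TensorView` API:

```cpp
#include <cassert>
#include <vector>

// Toy stand-ins for nvFuser's logical/allocation domains: just ordered
// dimension ids. A custom allocation domain means "stored in a different
// order than the logical shape".
struct ToyTensorView {
  std::vector<int> logical_domain;
  std::vector<int> allocation_domain;
};

// Mirrors tv->setAllocationDomain(tv->getLogicalDomain(), true): drop the
// custom layout and treat the tensor as contiguous in its logical shape.
void wipeAllocationDomain(ToyTensorView& tv) {
  tv.allocation_domain = tv.logical_domain;
}

int main() {
  ToyTensorView tv{{0, 1}, {1, 0}};  // layout-swapped allocation
  wipeAllocationDomain(tv);
  assert(tv.allocation_domain == tv.logical_domain);
  return 0;
}
```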
6 changes: 6 additions & 0 deletions csrc/ops/indexing.cpp
@@ -342,6 +342,12 @@ TensorView* preprocessGroupedMatmulInputSf(
auto pad_to_max_extent = [&](IterDomain* id, int multiple) -> IterDomain* {
auto* maximum_pad_value_per_group =
IrBuilder::create<Val>(multiple - 1, DataType::Index);

// NOTE: a resize op sounds good in theory. In reality, we cannot index into
// this operation anyway, so I question how much a resize op is buying us.
// More importantly, resize still hits asserts in vectorization analysis
// (validateDeviceSplit ATM). I think this requires the expanded dimension to
// be constant in order for transform replay to work properly; we'll rely on
// concretization to turn the number of groups into a constant value.
// return IterDomain::resize(id, input->fusion()->zeroVal(DataType::Index), SimplifyingIrBuilder::mulExpr(num_groups, maximum_pad_value_per_group));

Val* padded_ext = SimplifyingIrBuilder::addExpr(
id->extent(),
SimplifyingIrBuilder::mulExpr(num_groups, maximum_pad_value_per_group));
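For reference, the padded extent computed above is just `extent + num_groups * (multiple - 1)`: every group may need up to `multiple - 1` rows of padding. A standalone arithmetic check (my sketch; the numbers match the tests below):

```cpp
#include <cstdio>

// Worst-case padded extent: each of the num_groups groups can contribute at
// most (multiple - 1) padding rows on top of the original extent.
long paddedExtent(long extent, long num_groups, long multiple) {
  return extent + num_groups * (multiple - 1);
}

int main() {
  // 512 rows in 3 groups, each rounded up to a multiple of 128: worst case
  // is 512 + 3 * 127 = 893 rows, an upper bound on the actual 768 used in
  // the tests.
  std::printf("%ld\n", paddedExtent(512, 3, 128));
  return 0;
}
```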
6 changes: 6 additions & 0 deletions csrc/scheduler/registry.cpp
@@ -91,6 +91,12 @@ bool checkCanSchedule(Fusion* fusion, SchedulerType scheduler_type) {
return false;
}

if (registry_utils::SchedulerTopologyChecker::hasConsumerOfNonIndexableOps(fusion)) {
scheduler_debug_utils::canScheduleRejectReason(
scheduler_type, "Fusion has a consumer of non-indexable ops.");
return false;
}

return true;
}

11 changes: 11 additions & 0 deletions csrc/scheduler/registry_utils.cpp
@@ -1030,6 +1030,17 @@ bool SchedulerTopologyChecker::hasResizeAndIndexOps(Fusion* fusion) {
return false;
}

bool SchedulerTopologyChecker::hasConsumerOfNonIndexableOps(Fusion* fusion) {
for (auto expr : fusion->exprs()) {
if (expr->isOneOf<PreprocessGroupedMatmulInputSf>()) {
if (!ir_utils::getTvOutput(expr)->uses().empty()) {
return true;
}
}
}
return false;
}

namespace {

// Return true when there's a producer-consumer relationship among a
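The new check is a straight scan: reject scheduling as soon as any non-indexable op's output has a consumer. A standalone analog with toy types (hypothetical names, not the real `Fusion`/`Expr` API), matching the relu-consumer test below:

```cpp
#include <cassert>
#include <vector>

// Toy stand-in for an expression in the fusion graph.
struct ToyExpr {
  bool non_indexable = false;  // e.g. PreprocessGroupedMatmulInputSf
  int num_output_uses = 0;     // number of consumers of the op's output
};

// Analog of SchedulerTopologyChecker::hasConsumerOfNonIndexableOps: true iff
// some non-indexable op's output feeds another expression.
bool hasConsumerOfNonIndexableOps(const std::vector<ToyExpr>& exprs) {
  for (const auto& e : exprs) {
    if (e.non_indexable && e.num_output_uses > 0) {
      return true;  // a consumer would need lowerSrcIndex, which can't work
    }
  }
  return false;
}

int main() {
  assert(!hasConsumerOfNonIndexableOps({{true, 0}}));  // output is terminal
  assert(hasConsumerOfNonIndexableOps({{true, 1}}));   // e.g. relu(out_tv)
  return 0;
}
```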
2 changes: 2 additions & 0 deletions csrc/scheduler/registry_utils.h
@@ -107,6 +107,8 @@ class SchedulerTopologyChecker {

static bool hasResizeAndIndexOps(Fusion* fusion);

static bool hasConsumerOfNonIndexableOps(Fusion* fusion);

// Checks if a series of reshape ops creates a cycle in the ID
// graph. It is not currently supported. For example,
// propagateReshapeTransforms won't work as it won't find any
91 changes: 87 additions & 4 deletions tests/cpp/test_layout_op.cpp
@@ -184,11 +184,8 @@ TEST_F(LayoutOpTest, SchedulerKernel) {
fusion.addInput(offsets);
fusion.addInput(rounded_offsets);

- auto inp_tv = set(inp);
auto out_tv = preprocessGroupedMatmulInputSf(
-     inp_tv, offsets, rounded_offsets, BlockScalingFactorLayout::Block128x4);
- // NOTE: output of preprocessGroupedMatmulInputSf needs to be on global
- // memory, because we do indexing on output inside the runtime function.
+     inp, offsets, rounded_offsets, BlockScalingFactorLayout::Block128x4);
fusion.addOutput(out_tv);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
@@ -212,4 +209,90 @@ TEST_F(LayoutOpTest, SchedulerKernel) {
t1,
t2));
}

TEST_F(LayoutOpTest, SchedulerKernelWithConsumer) {
auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();
FusionGuard fg(&fusion);

auto inp = makeSymbolicTensor(2);
auto offsets = makeSymbolicTensor(1, DataType::Int32);
auto rounded_offsets = makeSymbolicTensor(1, DataType::Int32);
fusion.addInput(inp);
fusion.addInput(offsets);
fusion.addInput(rounded_offsets);

auto out_tv = preprocessGroupedMatmulInputSf(
inp, offsets, rounded_offsets, BlockScalingFactorLayout::Block128x4);
fusion.addOutput(out_tv);

// FIXME: this is undefined and we should error out.
// FIXME: add validation for relu_tv.
// TODO: a consumer of the output of PreprocessGroupedMatmulInputSf needs to
// be segmented out, because indexing won't work in lowerSrcIndex. So this
// needs to be changed into some other operation that goes through expr_eval
// instead, maybe a matmul or something like that.
auto relu_tv = relu(out_tv);
fusion.addOutput(relu_tv);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
int m = 512;
int k = 9; // note: padded column size would be 12
auto t0 = at::randn({m, k}, options);
// tokens per group are [100, 150, 262] respectively, so each group is
// padded to a multiple of 128. Hence the total output rows span a
// length of 128 + 256 + 384 = 768.
auto t1 = at::tensor({0, 100, 250, 512}, options.dtype(at::kInt));
auto t2 = at::tensor({0, 128, 384, 768}, options.dtype(at::kInt));

// naive scheduling.
FusionExecutorCache executor_cache(std::move(fusion_ptr));
auto outputs = executor_cache.runFusionWithInputs({t0, t1, t2});

ASSERT_TRUE(validateGroupedLayout(
BlockScalingFactorLayout::Block128x4,
outputs[0].as<at::Tensor>(),
t0,
t1,
t2));
}
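The `t2` values in these tests can be derived from `t1` by rounding every group's row count up to a multiple of 128 and accumulating. A small helper sketch (mine, not part of the PR) that reproduces them:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Given cumulative group offsets, round each group's size up to `multiple`
// and return the cumulative padded offsets.
std::vector<int> roundedOffsets(const std::vector<int>& offsets, int multiple) {
  std::vector<int> out{0};
  for (std::size_t g = 1; g < offsets.size(); ++g) {
    int group_size = offsets[g] - offsets[g - 1];
    int padded = (group_size + multiple - 1) / multiple * multiple;
    out.push_back(out.back() + padded);
  }
  return out;
}

int main() {
  // Groups of 100, 150, and 262 tokens pad to 128, 256, and 384 rows.
  std::vector<int> t2 = roundedOffsets({0, 100, 250, 512}, 128);
  assert((t2 == std::vector<int>{0, 128, 384, 768}));
  return 0;
}
```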

TEST_F(LayoutOpTest, SchedulerKernelWithExplicitQuantization) {
auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();
FusionGuard fg(&fusion);

auto inp = makeSymbolicTensor(2);
auto offsets = makeSymbolicTensor(1, DataType::Int32);
auto rounded_offsets = makeSymbolicTensor(1, DataType::Int32);
fusion.addInput(inp);
fusion.addInput(offsets);
fusion.addInput(rounded_offsets);

auto block_size = IrBuilder::create<Val>(16, DataType::Int);
auto remainder = ceilDiv(inp->axis(1)->extent(), block_size);

auto reshaped_inp = reshape(inp, {inp->axis(0)->extent(), remainder, block_size});
auto blocked_sf = max(reshaped_inp, {2});
auto scaled_output = div(reshaped_inp, broadcast(blocked_sf, {false, false, true}));
// scaled_output = castOp(DataType::Float4_e2m1fn, scaled_output);
fusion.addOutput(scaled_output);

auto out_blocked_sf_fp8 = preprocessGroupedMatmulInputSf(
blocked_sf, offsets, rounded_offsets, BlockScalingFactorLayout::Block128x4);
// out_blocked_sf_fp8 = castOp(DataType::Float8_e4m3fn, out_blocked_sf_fp8);
fusion.addOutput(out_blocked_sf_fp8);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
int m = 512;
int k = 9; // note: padded column size would be 12
auto t0 = at::randn({m, k}, options);
// tokens per group are [100, 150, 262] respectively, so each group is
// padded to a multiple of 128. Hence the total output rows span a
// length of 128 + 256 + 384 = 768.
auto t1 = at::tensor({0, 100, 250, 512}, options.dtype(at::kInt));
auto t2 = at::tensor({0, 128, 384, 768}, options.dtype(at::kInt));

// naive scheduling.
FusionExecutorCache executor_cache(std::move(fusion_ptr));
auto outputs = executor_cache.runFusionWithInputs({t0, t1, t2});
}
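The fusion above follows the usual block-scaling recipe: reshape each row into blocks of 16, reduce with `max` to get a per-block scaling factor, then divide the block by it. A standalone sketch of the per-row computation (plain max, mirroring the test; real recipes typically use abs-max plus the FP8/FP4 casts the test leaves commented out):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

struct BlockQuantResult {
  std::vector<float> scaled;  // input divided by its block's scaling factor
  std::vector<float> sf;      // one scaling factor per block
};

BlockQuantResult quantizeRow(const std::vector<float>& row, size_t block = 16) {
  BlockQuantResult r;
  r.scaled.resize(row.size());
  for (size_t start = 0; start < row.size(); start += block) {
    size_t end = std::min(row.size(), start + block);
    // Per-block max, as in `max(reshaped_inp, {2})` in the fusion above.
    float m = *std::max_element(row.begin() + start, row.begin() + end);
    r.sf.push_back(m);
    for (size_t i = start; i < end; ++i) {
      // As in `div(reshaped_inp, broadcast(blocked_sf, ...))`.
      r.scaled[i] = row[i] / m;
    }
  }
  return r;
}

int main() {
  auto r = quantizeRow({1.f, 2.f, 4.f, 8.f}, 2);  // two blocks of 2
  std::printf("sf: %.0f %.0f\n", r.sf[0], r.sf[1]);  // prints: sf: 2 8
  return 0;
}
```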

} // namespace nvfuser