 
 
 ConstInt = ct.Constant[int]
+ConstBool = ct.Constant[bool]
 
 
 @ct.kernel
@@ -22,7 +23,7 @@ def fused_moe_kernel(
     sorted_token_ids,
     sorted_expert_ids,
     num_token_replicas: int,
-    mul_routed_weight: bool,
+    mul_routed_weight: ConstBool,
     TILE_M: ConstInt,
     TILE_N: ConstInt,
     TILE_K: ConstInt,
@@ -84,9 +85,30 @@ def fused_moe_kernel(
     ct.scatter(C, (token_ids[:, None], c_col_indices[None, :]), accumulator)
 
 
-def silu_and_mul_torch(input: torch.Tensor, out: torch.Tensor):
-    gate_result, up_result = input.chunk(2, dim=-1)
-    torch.mul(F.silu(gate_result), up_result, out=out)
+@ct.kernel
+def silu_and_mul_kernel(A, B, C, TILE_N: ConstInt):
+    """
+    Element-wise kernel that computes SiLU(A) * B.
+
+    Args:
+        A: Input tensor A.
+        B: Input tensor B.
+        C: Output tensor.
+    """
+
+    bid_m = ct.bid(0)
+    ta = ct.load(A, (bid_m, 0), (1, TILE_N)).astype(ct.float32)
+    tb = ct.load(B, (bid_m, 0), (1, TILE_N)).astype(ct.float32)
+
+    # Sigmoid(ta)
+    denom = ct.add(1, ct.exp(-ta), flush_to_zero=True)
+    sigmoid_ta = ct.truediv(1.0, denom, flush_to_zero=True, rounding_mode=ct.RoundingMode.APPROX)
+
+    # SiLU(ta) * tb
+    silu_ta = ct.mul(ta, sigmoid_ta, flush_to_zero=True)
+    tc = ct.mul(silu_ta, tb, flush_to_zero=True)
+
+    ct.store(C, (bid_m, 0), tc.astype(C.dtype))
 
 
 def moe_align_tile_size_torch(
@@ -244,7 +266,7 @@ def cutile_moe(
         tile_k=tile_k,
     )
 
-    silu_and_mul_torch(
+    invoke_silu_and_mul_kernel(
         intermediate_cache1.view(-1, intermediate_cache1.shape[-1]),
         intermediate_cache2,
     )
@@ -353,6 +375,37 @@ def invoke_fused_moe_kernel(
     )
 
 
+def invoke_silu_and_mul_kernel(
+    AB: torch.Tensor,
+    C: torch.Tensor
+):
+    A, B = AB.chunk(2, dim=-1)
+    ct.launch(
+        torch.cuda.current_stream(),
+        (AB.shape[0],),
+        silu_and_mul_kernel,
+        (
+            A,
+            B,
+            C,
+            next_power_of_2(C.shape[-1])
+        )
+    )
+
+
+def next_power_of_2(n: int):
+    """Return the smallest power of 2 greater than or equal to n."""
+    n -= 1
+    n |= n >> 1
+    n |= n >> 2
+    n |= n >> 4
+    n |= n >> 8
+    n |= n >> 16
+    n |= n >> 32
+    n += 1
+    return n
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument(