

Commit b254132

Merge branch 'qiqix/argmax_to_int32' into 'main'
Change result type for argreduce ops from int64 to int32.

See merge request dl/tileir/cutile-python!45
2 parents 09be170 + 444f67c commit b254132


2 files changed, +6 −6 lines changed


src/cuda/tile/_ir/ops.py

Lines changed: 1 addition & 1 deletion
@@ -2849,7 +2849,7 @@ def argreduce(fn: str, x: Var, axis: Optional[int], keepdims: bool) -> Var:
 
     x_dtype = datatype.default_int_type if datatype.is_boolean(x_type.dtype) else x_type.dtype
     x = _promote_and_broadcast_to(x, TileTy(x_dtype, x_shape))
-    output_dtype = datatype.int64
+    output_dtype = datatype.default_int_type
     output_shape = TupleTy([]) if axis is None else TupleTy(x_shape[:axis] + x_shape[axis + 1:])
     x = add_operation(
         TileArgReduce, TileTy(output_dtype, output_shape),
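
For context, switching the hard-coded datatype.int64 to datatype.default_int_type makes argreduce results 32-bit indices (per the merge request title, the default integer type resolves to int32). A minimal, illustrative PyTorch-only sketch, not taken from this repository, of why int32 is wide enough: argmax/argmin results are positions along the reduced axis, so they are bounded by that axis extent.

```python
import torch

# Illustrative only (not cutile code): argreduce results are indices along the
# reduced axis, so their magnitude is bounded by the axis extent and fits
# comfortably in int32 for any realistic tile or tensor dimension.
x = torch.randn(4, 8)
idx = torch.argmax(x, dim=1)                       # torch returns int64 indices by default
assert int(idx.max()) < torch.iinfo(torch.int32).max
idx32 = idx.to(torch.int32)                        # safe downcast to the new result type
assert torch.equal(idx32.long(), idx)
```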

test/test_reduction.py

Lines changed: 5 additions & 5 deletions
@@ -344,7 +344,7 @@ def test_reduce_flush_to_zero(shape, tile, dtype, reduce_op, tile_op, flush_to_z
 @pytest.mark.parametrize("reduce_op, torch_op", argmaxmin_cases)
 def test_reduce_argmaxmin(shape, tile, dtype, keepdims, reduce_op, torch_op):
     x = make_tensor(shape, dtype=dtype, device='cuda')
-    y = _squeezed_zeros_like(x, axis=1, keepdims=keepdims).to(torch.int64)
+    y = _squeezed_zeros_like(x, axis=1, keepdims=keepdims).to(torch.int32)
     grid = (ceil(shape[0] / tile), 1, 1)
     if len(shape) == 2:
         kernel = make_reduce_axis1_2d(reduce_op)
@@ -353,7 +353,7 @@ def test_reduce_argmaxmin(shape, tile, dtype, keepdims, reduce_op, torch_op):
         kernel = make_reduce_axis1_3d(reduce_op)
         args = (x, y, tile, shape[1], shape[2], keepdims)
     ct.launch(torch.cuda.current_stream(), grid, kernel, args)
-    ref_result = torch_op(x, dim=1, keepdim=keepdims).to(torch.int64)
+    ref_result = torch_op(x, dim=1, keepdim=keepdims).to(torch.int32)
     assert_equal(y, ref_result)
 
 
@@ -366,11 +366,11 @@ def test_reduce_argmaxmin_all_axes(shape, dtype, reduce_op, torch_op, keepdims):
     grid = (1, 1, 1)
     kernel = make_reduce_axisNone(reduce_op)
     if keepdims:
-        y = _squeezed_zeros_like(x, axis=None, keepdims=keepdims).to(torch.int64)
+        y = _squeezed_zeros_like(x, axis=None, keepdims=keepdims).to(torch.int32)
         ct.launch(torch.cuda.current_stream(), grid, kernel, (x, y, shape[0], shape[1], keepdims))
     else:
-        y = torch.zeros((1,) * len(shape), dtype=dtype, device="cuda").to(torch.int64)
+        y = torch.zeros((1,) * len(shape), dtype=dtype, device="cuda").to(torch.int32)
         ct.launch(torch.cuda.current_stream(), grid, kernel, (x, y, shape[0], shape[1], keepdims))
         y = y.squeeze()
-    ref_result = torch_op(x, dim=None, keepdim=keepdims).to(torch.int64)
+    ref_result = torch_op(x, dim=None, keepdim=keepdims).to(torch.int32)
     assert_equal(y, ref_result)
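
A short, hedged sketch of the dtype handshake the updated tests rely on (kernel launch elided; buffer allocation and cast mirror the changes above): torch's argmax/argmin return int64 by default, so the reference is cast to int32 to match the kernel's new output type before the comparison.

```python
import torch

# Hedged sketch mirroring the updated tests (launch details elided): the output
# buffer the kernel fills is allocated as int32, and the torch reference, which
# is int64 by default for argmax/argmin, is cast to int32 before comparison.
x = torch.randn(16, 32, device="cuda")
y = torch.zeros(16, dtype=torch.int32, device="cuda")   # kernel output buffer (now int32)
ref = torch.argmax(x, dim=1).to(torch.int32)            # cast reference to match
# ct.launch(...) would populate y here; afterwards: assert_equal(y, ref)
```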

0 commit comments
