Fix divide-by-zero in GroupNorm two-pass kernel for large batch sizes (#1984)

yuantailing · web-flow · commit dbe421ed0969 · 2026-03-05T14:51:25.000+09:00
When batch size N is large enough (e.g., N=512 with C=640), the heuristic
`blocks_per_act_slice = 256 / params.n` truncates to 0 via integer division,
causing a subsequent `div_up(params.hw, blocks_per_act_slice)` to divide by
zero. Fix by clamping blocks_per_act_slice to at least 1 in both forward and
backward two-pass setup functions.

Add regression test covering the exact repro case and all three heuristic
branches.

Signed-off-by: Tailing Yuan <yuantailing@gmail.com>
diff --git a/apex/contrib/csrc/group_norm/group_norm_nhwc_bwd_two_pass.cu b/apex/contrib/csrc/group_norm/group_norm_nhwc_bwd_two_pass.cu
@@ -204,6 +204,9 @@ void group_norm_nhwc_bwd_two_passes_setup(Group_norm_nhwc_bwd_params& params, si
     blocks_per_act_slice = 512 / params.n;
   }
 
+  // Clamp to at least 1 to avoid divide-by-zero when batch size is large.
+  blocks_per_act_slice = max(blocks_per_act_slice, 1);
+
   // Make sure we launch blocks per activation is no less than activations
   blocks_per_act_slice = min(blocks_per_act_slice, div_up(params.hw, params.n));
 
diff --git a/apex/contrib/csrc/group_norm/group_norm_nhwc_fwd_two_pass.cu b/apex/contrib/csrc/group_norm/group_norm_nhwc_fwd_two_pass.cu
@@ -126,6 +126,9 @@ void group_norm_nhwc_fwd_two_passes_setup(Group_norm_nhwc_fwd_params& params, si
     blocks_per_act_slice = 512 / params.n;
   }
 
+  // Clamp to at least 1 to avoid divide-by-zero when batch size is large.
+  blocks_per_act_slice = max(blocks_per_act_slice, 1);
+
   // Make sure we launch blocks per activation is no less than activations
   blocks_per_act_slice = min(blocks_per_act_slice, div_up(params.hw, params.n));
 
diff --git a/apex/contrib/test/group_norm/test_group_norm.py b/apex/contrib/test/group_norm/test_group_norm.py
@@ -280,6 +280,41 @@ def test_16_groups(self):
                     )
                 self.verify_group_norm(GroupNorm, N=n, C=c, H=h, W=w, G=16, act="swish")
 
+    def test_large_batch_two_pass(self):
+        """Regression test for divide-by-zero when batch size is large.
+
+        When batch_size > 256 and c >= 640, blocks_per_act_slice = 256 / n
+        truncates to 0, causing div_up(hw, 0). Test all three heuristic branches.
+        """
+        sizes = [
+            [256, 1280, 8, 8],
+            [512, 640, 16, 16],
+            [1024, 512, 8, 8],
+        ]
+        for sz in sizes:
+            with self.subTest(size=sz):
+                n, c, h, w = sz
+                required = _estimate_group_norm_test_bytes(
+                    N=n,
+                    C=c,
+                    H=h,
+                    W=w,
+                    xdtype=torch.float16,
+                    wdtype=torch.float32,
+                    ref_func=torch_group_norm_high_precision_fp64,
+                )
+                if not _has_sufficient_cuda_memory(required):
+                    free_bytes, total_bytes = torch.cuda.mem_get_info()
+                    raise unittest.SkipTest(
+                        f"Skipping large-batch GroupNorm case {sz}: estimated "
+                        f"{required / 1e9:.1f} GB requires more than available "
+                        f"free VRAM ({free_bytes / 1e9:.1f} GB free, "
+                        f"{total_bytes / 1e9:.1f} GB total)."
+                    )
+                self.verify_group_norm(
+                    cuda_group_norm_nhwc_two_pass, N=n, C=c, H=h, W=w, G=32, act="silu"
+                )
+
     def test_fp16_parameters(self):
         n, c, h, w = 8, 2560, 16, 16
         self.verify_group_norm(

Original file line number	Diff line number	Diff line change
`@@ -204,6 +204,9 @@ void group_norm_nhwc_bwd_two_passes_setup(Group_norm_nhwc_bwd_params& params, si`
`204`	`204`	`blocks_per_act_slice = 512 / params.n;`
`205`	`205`	`}`
`206`	`206`
	`207`	`+ // Clamp to at least 1 to avoid divide-by-zero when batch size is large.`
	`208`	`+ blocks_per_act_slice = max(blocks_per_act_slice, 1);`
	`209`	`+`
`207`	`210`	`// Make sure we launch blocks per activation is no less than activations`
`208`	`211`	`blocks_per_act_slice = min(blocks_per_act_slice, div_up(params.hw, params.n));`
`209`	`212`
Original file line number	Diff line number	Diff line change
`@@ -126,6 +126,9 @@ void group_norm_nhwc_fwd_two_passes_setup(Group_norm_nhwc_fwd_params& params, si`
`126`	`126`	`blocks_per_act_slice = 512 / params.n;`
`127`	`127`	`}`
`128`	`128`
	`129`	`+ // Clamp to at least 1 to avoid divide-by-zero when batch size is large.`
	`130`	`+ blocks_per_act_slice = max(blocks_per_act_slice, 1);`
	`131`	`+`
`129`	`132`	`// Make sure we launch blocks per activation is no less than activations`
`130`	`133`	`blocks_per_act_slice = min(blocks_per_act_slice, div_up(params.hw, params.n));`
`131`	`134`