Add the warning of distributed_fused_adam low bucket usage (#1714)

shjwudp · timmoon10 · web-flow · commit 1d01e5c7eb4d · 2023-08-29T05:49:04.000+09:00
* feat: Add the warning of distributed_fused_adam low bucket usage.

* correct unittest

* Update apex/contrib/optimizers/distributed_fused_adam.py

Co-authored-by: Tim Moon <4406448+timmoon10@users.noreply.github.com>

---------

Co-authored-by: Tim Moon <4406448+timmoon10@users.noreply.github.com>
diff --git a/apex/contrib/optimizers/distributed_fused_adam.py b/apex/contrib/optimizers/distributed_fused_adam.py
@@ -393,6 +393,8 @@ def __init__(
             self.shard_size: int = shard_size
             # Size of the filled region in the bucket
             self.filled_size: int = 0
+            # Whether more parameter fragments may still be placed in this bucket
+            self.able_to_fill: bool = True
             # Offset to bucket in contiguous buffers
             self.contiguous_buffer_offset: int = contiguous_buffer_offset
             # Buffer ranges corresponding to parameter fragments
@@ -1037,6 +1039,21 @@ def init_params(
                 param_group_id, param_id = id_map[param]
                 self._init_param_state(param, param_group_id, param_id)
 
+        num_params = sum(1 for param in self.parameters())
+        num_initialized_params = sum(
+            1 for param in self.parameters()
+            if "fragments" in self.state[param]
+        )
+        if num_initialized_params == num_params:
+            bucket_size = sum(bucket.bucket_size for bucket in self.state["buckets"])
+            filled_size = sum(bucket.filled_size for bucket in self.state["buckets"])
+            buckets_utilization = filled_size / bucket_size
+            if buckets_utilization < 0.7:
+                warnings.warn(
+                    f"Only {buckets_utilization:.1%} of buckets are used. "
+                    "Consider decreasing the bucket_cap_mb argument."
+                )
+
     def init_params_bucket(self, params: Iterable[torch.nn.Parameter]) -> None:
         """Initialize optimizer state for parameters in one effective bucket
 
@@ -1065,7 +1082,7 @@ def init_params_bucket(self, params: Iterable[torch.nn.Parameter]) -> None:
 
         # Mark existings bucket as fully filled
         for bucket in self.state["buckets"]:
-            bucket.filled_size = bucket.bucket_size
+            bucket.able_to_fill = False
 
         # Initialize optimizer state for parameters
         start_bucket_id = len(self.state["buckets"])
@@ -1076,7 +1093,7 @@ def init_params_bucket(self, params: Iterable[torch.nn.Parameter]) -> None:
         for bucket_id in range(start_bucket_id, end_bucket_id):
             bucket = self.state["buckets"][bucket_id]
             bucket_size = bucket.bucket_size
-            bucket.filled_size = bucket_size
+            bucket.able_to_fill = False
             ids_in_bucket = set(
                 (fragment.param_group_id, fragment.param_id)
                 for fragment in bucket.fragments
@@ -1151,7 +1168,7 @@ def _init_param_state(
             bucket_end = bucket_start + fragment_size
 
             # Create new bucket if current one is full
-            if fragment_size <= 0:
+            if fragment_size <= 0 or not bucket.able_to_fill:
                 shard_size = self.default_shard_size
                 bucket_size = shard_size * self.distributed_size
                 buffer_offset = bucket.contiguous_buffer_offset + bucket.bucket_size
diff --git a/apex/contrib/test/optimizers/test_dist_adam.py b/apex/contrib/test/optimizers/test_dist_adam.py
@@ -2,6 +2,7 @@
 import io
 from typing import Optional, Tuple
 import unittest
+import warnings
 
 import torch
 from torch.testing._internal import common_utils
@@ -43,6 +44,7 @@ def make_models(
         contiguous_buffers=False,
         store_params=False,
         store_param_remainders=False,
+        bucket_cap_mb=71/(4*1024*1024),
 ):
 
     # Construct models with same parameters
@@ -82,7 +84,7 @@ def make_models(
         adam_w_mode=adam_w_mode,
         overlap_grad_sync=overlap_communication,
         overlap_param_sync=overlap_communication,
-        bucket_cap_mb=71/(4*1024*1024),
+        bucket_cap_mb=bucket_cap_mb,
         dtype=optim_dtype,
         grad_sync_dtype=grad_sync_dtype,
         param_sync_dtype=param_sync_dtype,
@@ -131,6 +133,7 @@ def test_matches_pytorch(
             contiguous_buffers=False,
             store_params=False,
             store_param_remainders=False,
+            bucket_cap_mb=71/(4*1024*1024),
     ):
 
         torch.manual_seed(self.seed + self.rank)
@@ -149,6 +152,7 @@ def test_matches_pytorch(
             contiguous_buffers=contiguous_buffers,
             store_params=store_params,
             store_param_remainders=store_param_remainders,
+            bucket_cap_mb=bucket_cap_mb,
         )
 
         # Training loop
@@ -678,6 +682,32 @@ def test_checkpoint_bf16(self):
             ),
         )
 
+    def test_bucket_low_utilization_warning(self):
+        """Check that the low bucket utilization warning is raised only for an oversized bucket cap"""
+        layer_size = 2*1024*1024
+        num_layers = 4
+        fairish_bucket_cap_mb = 4*num_layers*layer_size/(1024*1024)  # presumably 4 bytes per element, in MiB - TODO confirm
+
+        # Doubling the bucket cap must trigger the "Consider decreasing" warning
+        with self.assertWarnsRegex(Warning, ".*Consider decreasing the bucket_cap_mb argument."):
+            self.test_matches_pytorch(
+                num_layers=num_layers,
+                layer_size=layer_size,
+                bucket_cap_mb=fairish_bucket_cap_mb * 2,
+                contiguous_buffers=True,
+            )
+
+        # Unscaled cap: warning must be absent. NOTE(review): active warning filters are not overridden here
+        with warnings.catch_warnings(record=True) as warns:
+            self.test_matches_pytorch(
+                num_layers=num_layers,
+                layer_size=layer_size,
+                bucket_cap_mb=fairish_bucket_cap_mb,
+                contiguous_buffers=True,
+            )
+            for w in warns:
+                self.assertNotRegex(str(w.message), ".*Consider decreasing the bucket_cap_mb argument.")
+
 
 if __name__ == "__main__":
     # Assume script has been run with torchrun