
Commit 059a135

mengluy0125 authored and facebook-github-bot committed
[Optimus][Auto-AC] Support activation quantization
Summary: We enable activation quantization in the forward pass, and users can customize the dtype they want to quantize to.

Test Plan:

# unit test

```
buck2 test 'fbcode//mode/dev-nosan' fbcode//caffe2/test/inductor:quantization -- test_activation_quantization_aten
```

Buck UI: https://www.internalfb.com/buck2/776d3911-bb86-4ac8-a527-540cf1510b9d
Test UI: https://www.internalfb.com/intern/testinfra/testrun/4785074873051017
Network: Up: 4.3MiB, Down: 42MiB (reSessionID-fef7e727-68b1-4645-a519-5652854df38d)
Executing actions. Remaining 0/4, 6.7s total exec time.
Command: test. Finished 2 local.
Time elapsed: 3:11.5s
Tests finished: Pass 2. Fail 0. Fatal 0. Skip 0. Build failure 0.

# E2E

### How to enable (you can override the dtype; if nothing is given, the default is fp8)

```
post_grad_fusion_options={
    "activation_quantization_aten_pass": {"quant_type": torch.float8_e5m2}
},
```

Differential Revision: D70522237
1 parent 5887a2d commit 059a135
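
For reference, a minimal sketch of enabling the pass end to end. Only the `post_grad_fusion_options` config comes from this commit; the module, shapes, and device are illustrative and mirror the unit test below:

```python
# Minimal sketch, assuming a CUDA device and bf16 inputs as in the unit test below.
# The module and shapes are illustrative, not part of this commit.
import torch

@torch._inductor.config.patch(
    post_grad_fusion_options={
        # "quant_type" is optional; the pass defaults to torch.float8_e5m2
        "activation_quantization_aten_pass": {"quant_type": torch.float8_e5m2}
    },
)
def demo():
    mod = torch.nn.Sequential(torch.nn.Linear(10, 10), torch.nn.ReLU())
    mod = mod.to(device="cuda", dtype=torch.bfloat16)
    x = torch.rand(16, 10, device="cuda", dtype=torch.bfloat16, requires_grad=True)
    out = torch.compile(mod)(x)
    out.sum().backward()  # saved activations cross the fw/bw boundary quantized
```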

File tree: 6 files changed (+221, −6 lines)


test/inductor/test_quantization.py

Lines changed: 81 additions & 0 deletions
```python
# Owner(s): ["module: inductor"]

import torch
import torch._inductor
import torch._inductor.fx_passes.group_batch_fusion
from torch._dynamo.utils import counters
from torch._inductor.test_case import run_tests, TestCase
from torch.testing._internal.inductor_utils import GPU_TYPE, requires_gpu


class TargetCPModule(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x1, x2):
        relued = torch.relu(x1)
        tanhed = torch.tanh(relued)
        tensor = torch.matmul(
            tanhed,
            x2,
        )
        return tensor


class TestQuantization(TestCase):
    def compare_dict_tensors(self, ref_dict, res_dict, rtol=1e-3, atol=1e-3):
        if len(set(ref_dict.keys())) != len(set(res_dict.keys())):
            return False
        for key1 in ref_dict.keys():
            key2 = "_orig_mod." + key1
            assert key2 in res_dict, f"{key1} does not exist in traced module"
            if not torch.allclose(ref_dict[key1], res_dict[key2], rtol=rtol, atol=atol):
                return False
        return True

    def compare_pred(self, module, traced, input, rtol=1e-3, atol=1e-3):
        ref = module(*input)
        res = traced(*input)
        self.assertEqual(ref, res, rtol=rtol, atol=atol)

    def compare_parameters(self, module, traced, rtol=1e-3, atol=1e-3):
        ref_params = dict(module.named_parameters())
        res_params = dict(traced.named_parameters())
        self.assertTrue(self.compare_dict_tensors(ref_params, res_params, rtol, atol))

    def compare_gradients(self, module, traced, rtol=1e-3, atol=1e-3):
        ref_grad = {key: param.grad for key, param in module.named_parameters()}
        res_grad = {key: param.grad for key, param in traced.named_parameters()}
        self.assertTrue(
            self.compare_dict_tensors(ref_grad, res_grad, rtol=rtol, atol=atol)
        )

    @requires_gpu()
    @torch._inductor.config.patch(
        pre_grad_fusion_options={},
        post_grad_fusion_options={
            "activation_quantization_aten_pass": {"quant_type": torch.float8_e5m2}
        },
    )
    def test_activation_quantization_aten(self):
        counters.clear()
        module = TargetCPModule().to(GPU_TYPE)
        input = [
            torch.rand((16, 10), requires_grad=True, device=GPU_TYPE, dtype=torch.bfloat16),
            torch.rand((10, 16), requires_grad=True, device=GPU_TYPE, dtype=torch.bfloat16),
        ]
        traced = torch.compile(module)
        ref = module(*input)
        res = traced(*input)
        self.compare_pred(module, traced, input)
        ref.sum().backward()
        res.sum().backward()
        self.compare_parameters(module, traced)
        self.compare_gradients(module, traced)
        self.assertEqual(counters["inductor"]["activation_quantization_aten_pass"], 2)
        self.assertTrue(torch.allclose(ref, res))
        counters.clear()


if __name__ == "__main__":
    run_tests()
```
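
Why the test expects the counter to end at 2: both saved activations (the relu and tanh outputs) feed the forward graph's output node, so the forward pass inserts one quantize node per activation. A hedged before/after sketch of the compiled forward graph (node names are illustrative):

```python
# Illustrative forward graph, before the pass (names are made up):
#   relu = aten.relu.default(x1)
#   tanh = aten.tanh.default(relu)
#   mm   = aten.mm.default(tanh, x2)
#   return (mm, tanh, relu)              # activations saved for backward in bf16
#
# After quantize_activation_fw (counter increments once per activation, hence 2):
#   tanh_q = prims.convert_element_type.default(tanh, torch.float8_e5m2)
#   relu_q = prims.convert_element_type.default(relu, torch.float8_e5m2)
#   return (mm, tanh_q, relu_q)          # saved activations now stored as fp8
```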

torch/_inductor/compile_fx.py

Lines changed: 8 additions & 4 deletions
```diff
@@ -362,16 +362,20 @@ def _recursive_joint_graph_passes(gm: GraphModule) -> None:
     joint_graph_passes(gm)
 
 
-def _recursive_post_grad_passes(gm: GraphModule, is_inference: bool = False) -> None:
+def _recursive_post_grad_passes(
+    gm: GraphModule,
+    is_inference: bool = False,
+    is_backward: bool = False,
+) -> None:
     with dynamo_timed(
         "_recursive_post_grad_passes",
         log_pt2_compile_event=True,
         dynamo_compile_column_us="post_grad_pass_time_us",
     ):
         for subgraph_name in _get_subgraph_names(gm):
             subgraph = getattr(gm, subgraph_name)
-            _recursive_post_grad_passes(subgraph, is_inference)
-        post_grad_passes(gm, is_inference)
+            _recursive_post_grad_passes(subgraph, is_inference, is_backward)
+        post_grad_passes(gm, is_inference, is_backward)
 
 
 def split_const_gm(
@@ -990,7 +994,7 @@ def log_graph_runnable() -> str:
     # has some issues with memory in training
     cuda_context = get_cuda_device_context(gm)
     with cuda_context:
-        _recursive_post_grad_passes(gm, is_inference=is_inference)
+        _recursive_post_grad_passes(gm, is_inference=is_inference, is_backward=is_backward)
     V.debug.fx_graph_transformed(gm, example_inputs)
     post_grad_graphs_log.debug(
         "%s",
```

torch/_inductor/fx_passes/post_grad.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -70,7 +70,7 @@
 ]
 
 
-def post_grad_passes(gm: torch.fx.GraphModule, is_inference: bool):
+def post_grad_passes(gm: torch.fx.GraphModule, is_inference: bool, is_backward: bool):
     """
     Passes that run on after grad. This is called once on the forwards
     graph and once on the backwards graph.
@@ -130,6 +130,7 @@ def post_grad_passes(gm: torch.fx.GraphModule, is_inference: bool):
         if pass_name in POST_GRAD_FUSIONS:
             continue
         pattern_matcher_pass = POST_GRAD_PATTERNS[pass_name]
+        pattern_matcher_pass.is_backward = is_backward
         inductor_before_change = save_inductor_dict(
             [pattern_matcher_pass.pass_name]
         )
```
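
Together with the compile_fx.py change above, the plumbing is: `is_backward` flows from the partitioned forward/backward graphs into `post_grad_passes`, which stamps the flag onto each `PatternMatcherPass` so that per-pattern extra_checks can consult it at match time. A condensed sketch (paraphrased, not verbatim library code; `configured_passes` stands in for the real iteration):

```python
# Condensed control flow, paraphrasing the diffs above (not verbatim library code).
def post_grad_passes_sketch(gm, is_inference, is_backward):
    for pass_name in configured_passes:                 # stand-in for the real loop
        pattern_matcher_pass = POST_GRAD_PATTERNS[pass_name]
        pattern_matcher_pass.is_backward = is_backward  # stamp the flag before matching
        pattern_matcher_pass.apply(gm.graph)            # extra_checks read the flag
```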

torch/_inductor/fx_passes/quantization.py

Lines changed: 113 additions & 1 deletion
```diff
@@ -11,12 +11,27 @@
 from torch._dynamo.utils import counters
 from torch.fx.experimental.symbolic_shapes import has_free_symbols
 from torch.fx.node import map_arg
+from torch.fx.passes.shape_prop import TensorMetadata
 
 from ..lowering import lowerings as L, require_channels_last
-from ..pattern_matcher import Arg, CallFunction, filter_nodes, KeywordArg, ListOf, Match
+from ..pattern_matcher import (
+    Arg,
+    CallFunction,
+    CallFunctionVarArgs,
+    filter_nodes,
+    is_backward_pattern,
+    KeywordArg,
+    ListOf,
+    Match,
+    MULTIPLE,
+    Placeholder,
+    register_graph_pattern,
+)
 from ..utils import pad_listlike
 from .freezing_patterns import register_freezing_graph_pattern
+from .group_batch_fusion import is_node_meta_valid
 from .post_grad import register_lowering_pattern
+from .split_cat import construct_pattern_matcher_pass
 
 
 aten = torch.ops.aten
@@ -3590,3 +3605,100 @@ def maybe_replace_node(n: torch.fx.Node) -> torch.fx.Node:
 
     graph_module.graph.lint()
     graph_module.recompile()
+
+
+activation_quantization_aten_pass = construct_pattern_matcher_pass(
+    "activation_quantization_aten_pass"
+)
+
+
+@register_graph_pattern(
+    CallFunctionVarArgs(
+        [
+            torch.ops.aten.relu.default,
+            torch.ops.aten.tanh.default,
+            torch.ops.aten.sigmoid.default,
+            torch.ops.aten.gelu.default,
+        ],
+        users=MULTIPLE,
+    ),
+    pass_dict=activation_quantization_aten_pass,
+    extra_check=is_backward_pattern(activation_quantization_aten_pass, False),
+)
+def quantize_activation_fw(match: Match, *args, **kwargs):
+    graph = match.graph
+    activation_nodes = match.nodes
+    quant_type = torch._inductor.config.post_grad_fusion_options[
+        "activation_quantization_aten_pass"
+    ].get("quant_type", torch.float8_e5m2)
+    for activation_node in activation_nodes:
+        # check whether the activation flows into the graph output
+        users = list(activation_node.users.keys())
+        for user in users:
+            if user.op == "output":
+                if not is_node_meta_valid(activation_node):
+                    continue
+                # insert a quantization node right after the activation
+                with graph.inserting_after(activation_node):
+                    quant_activation_node = graph.call_function(
+                        torch.ops.prims.convert_element_type.default,
+                        args=(activation_node, quant_type),
+                    )
+                quant_activation_node.meta.update(activation_node.meta)
+                quant_activation_node.meta["val"] = quant_activation_node.meta[
+                    "val"
+                ].to(quant_type)
+                quant_activation_node.meta["tensor_meta"] = TensorMetadata(
+                    shape=quant_activation_node.meta["tensor_meta"].shape,
+                    dtype=quant_type,
+                    requires_grad=quant_activation_node.meta["tensor_meta"].requires_grad,
+                    stride=quant_activation_node.meta["tensor_meta"].stride,
+                    memory_format=quant_activation_node.meta["tensor_meta"].memory_format,
+                    is_quantized=quant_activation_node.meta["tensor_meta"].is_quantized,
+                    qparams=quant_activation_node.meta["tensor_meta"].qparams,
+                )
+                # only update the return node's args; leave all other users unchanged
+                user_updated_args = tuple(
+                    quant_activation_node if node == activation_node else node
+                    for node in user.args[0]
+                )
+                user.update_arg(0, user_updated_args)
+                if len(activation_node.users) == 0:
+                    graph.erase_node(activation_node)
+                counters["inductor"]["activation_quantization_aten_pass"] += 1
+                break
+
+
+@register_graph_pattern(
+    Placeholder(["tanh", "relu", "sigmoid", "gelu"], users=MULTIPLE),
+    pass_dict=activation_quantization_aten_pass,
+    extra_check=is_backward_pattern(activation_quantization_aten_pass, True),
+)
+def quantize_activation_bw(match: Match, *args, **kwargs):
+    graph = match.graph
+    inputs = match.nodes
+    quant_type = torch._inductor.config.post_grad_fusion_options[
+        "activation_quantization_aten_pass"
+    ].get("quant_type", torch.float8_e5m2)
+    for input in inputs:
+        if not is_node_meta_valid(input):
+            continue
+        # insert a dequantization node right after the placeholder
+        with graph.inserting_after(input):
+            dequant_activation_node = graph.call_function(
+                torch.ops.prims.convert_element_type.default,
+                args=(input, input.meta["val"].dtype),
+            )
+            input.replace_all_uses_with(dequant_activation_node)
+            # restore the dequant node's own input (it was rewired above as well)
+            dequant_activation_node.replace_input_with(dequant_activation_node, input)
+            dequant_activation_node.meta.update(input.meta)
+
+        # switch the input's meta to the quant type to stay in sync with the forward pass
+        input.meta["val"] = input.meta["val"].to(quant_type)
+        input.meta["tensor_meta"] = TensorMetadata(
+            shape=input.meta["tensor_meta"].shape,
+            dtype=quant_type,
+            requires_grad=input.meta["tensor_meta"].requires_grad,
+            stride=input.meta["tensor_meta"].stride,
+            memory_format=input.meta["tensor_meta"].memory_format,
+            is_quantized=input.meta["tensor_meta"].is_quantized,
+            qparams=input.meta["tensor_meta"].qparams,
+        )
+        counters["inductor"]["activation_quantization_aten_pass"] += 1
```

torch/_inductor/fx_passes/split_cat.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -71,6 +71,7 @@
     "pad_aten_mm_pass",
     "split_cat_aten_pass",
     "select_cat_aten_pass",
+    "activation_quantization_aten_pass",
 ]
 
 for pass_name in pre_grad_pass_names:
```
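
For context, the registry above is how `construct_pattern_matcher_pass` (used in quantization.py) resolves a name to a `PatternMatcherPass` instance. A rough sketch of the presumed lookup, hedged since the helper's body is not part of this diff:

```python
# Presumed shape of the existing helper (not shown in this diff); the real
# implementation lives in split_cat.py and may differ in detail.
def construct_pattern_matcher_pass(pass_name: str) -> PatternMatcherPass:
    """Return the PatternMatcherPass registered under pass_name."""
    if pass_name in PRE_GRAD_PATTERNS:
        return PRE_GRAD_PATTERNS[pass_name]
    return POST_GRAD_PATTERNS[pass_name]
```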

torch/_inductor/pattern_matcher.py

Lines changed: 16 additions & 0 deletions
```diff
@@ -778,6 +778,10 @@ class CallModuleVarArgs(_TargetExprVarArgs):
     op = "call_module"
 
 
+class Placeholder(_TargetExprVarArgs):
+    op = "placeholder"
+
+
 class ListOf(PatternExpr):
     """
     Matches a repeated pattern
@@ -1794,12 +1798,14 @@ class PatternMatcherPass:
     def __init__(
         self,
         pass_name: Optional[str] = None,
+        is_backward: Optional[bool] = None,
     ) -> None:
         super().__init__()
         self.patterns: defaultdict[
             tuple[str, torch.fx.node.Target], list[PatternEntry]
         ] = defaultdict(list)
         self.pass_name = pass_name
+        self.is_backward = is_backward
 
         # For a particular generated pattern repr, store all of the str representations
         # of the graph used to generate them. Because we ignore certain patterns
@@ -2129,6 +2135,16 @@ def flag_check(match: Match) -> Any:
     return flag_check
 
 
+def is_backward_pattern(
+    pattern: PatternMatcherPass, backward: bool = True
+) -> Callable[[Match], Any]:
+    """Extra_check helper: match only when the pass is (or is not) running on the backward graph."""
+
+    def backward_check(match: Match) -> Any:
+        if backward:
+            return pattern.is_backward
+        return not pattern.is_backward
+
+    return backward_check
+
+
 def clone_graph(input_graph: torch.fx.GraphModule) -> torch.fx.GraphModule:
     class CopyGraph(Transformer):
         def run_node(self, old_node: torch.fx.Node) -> torch.fx.Node:
```
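
A hedged usage sketch of the new helper; the pass name and op list are made up, but the shape mirrors the registrations in quantization.py above:

```python
# Illustrative registration; "my_aten_pass" and the op list are hypothetical.
# The name would need to be registered in split_cat.py for the lookup to succeed.
my_pass = construct_pattern_matcher_pass("my_aten_pass")

@register_graph_pattern(
    CallFunctionVarArgs([torch.ops.aten.relu.default], users=MULTIPLE),
    pass_dict=my_pass,
    extra_check=is_backward_pattern(my_pass, False),  # fire on the forward graph only
)
def my_forward_rewrite(match: Match, *args, **kwargs):
    ...  # post_grad_passes stamps my_pass.is_backward before the matcher runs
```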
