Commit dc3fa73

mengluy0125 authored and facebook-github-bot committed
[Optimus][Auto-AC] Support activation quantization (#148380)
Summary:
We enable activation quantization in the forward pass, and users can customize the dtype they want to quantize to.

Test Plan:

# unit test

```
buck2 test 'fbcode//mode/dev-nosan' fbcode//caffe2/test/inductor:quantization -- test_activation_quantization_aten
```

Buck UI: https://www.internalfb.com/buck2/fc8469a3-54f7-425d-9b1f-e54840c0793a
Test UI: https://www.internalfb.com/intern/testinfra/testrun/3377699989853946
Network: Up: 10KiB Down: 0B (reSessionID-ab248457-6ac0-4b72-96da-0d3c427e260a)
Executing actions. Remaining 0/3, 0.7s exec time total
Command: test. Finished 2 local
Time elapsed: 1:03.1s
Tests finished: Pass 2. Fail 0. Fatal 0. Skip 0. Build failure 0

# E2E

### How to enable (you can override the dtype and clamp range; if nothing is given, the default is fp8)

```
post_grad_fusion_options={
    "activation_quantization_aten_pass": {
        "quant_type": torch.float8_e5m2,
        "clamp_min": -57344.0,
        "clamp_max": 57344.0,
    }
},
```

Differential Revision: D70522237
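For reference, here is a hedged end-to-end sketch of enabling the pass on an arbitrary model. The model, device, shapes, and bf16 dtype below are illustrative and not part of this commit; the config keys (`post_grad_fusion_options`, `activation_quantization_aten_pass`, `quant_type`, `clamp_min`, `clamp_max`) come from the summary and the unit test in this diff.

```python
# Sketch only: enable the activation-quantization post-grad pass and run a
# compiled forward/backward. Assumes a GPU build of PyTorch with fp8 support.
import torch
import torch._inductor.config

cfg = {
    "activation_quantization_aten_pass": {
        "quant_type": torch.float8_e5m2,  # dtype used for the saved activations
        "clamp_min": -57344.0,            # optional clamp range (fp8 e5m2 limits)
        "clamp_max": 57344.0,
    }
}

model = torch.nn.Sequential(
    torch.nn.Linear(64, 64), torch.nn.ReLU(), torch.nn.Linear(64, 1)
).to("cuda", dtype=torch.bfloat16)

with torch._inductor.config.patch(post_grad_fusion_options=cfg):
    compiled = torch.compile(model)
    out = compiled(torch.randn(16, 64, device="cuda", dtype=torch.bfloat16))
    out.sum().backward()  # backward consumes the activations saved by the forward
```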
1 parent 3d62e81 commit dc3fa73

File tree: 7 files changed (+387 / -6 lines)


test/inductor/test_quantization.py

Lines changed: 134 additions & 0 deletions
```python
# Owner(s): ["module: inductor"]

import logging

import numpy as np

import torch
import torch._inductor
import torch._inductor.fx_passes.group_batch_fusion
from torch._dynamo.utils import counters
from torch._inductor.test_case import run_tests, TestCase
from torch.testing._internal.inductor_utils import GPU_TYPE, requires_gpu

log = logging.getLogger(__name__)


class TargetCPModule(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x1, x2):
        relued = torch.relu(x1)
        tanhed = torch.tanh(relued)
        tensor = torch.matmul(
            tanhed,
            x2,
        )
        return tensor


class FeedforwardNN(torch.nn.Module):
    def __init__(self):
        super(FeedforwardNN, self).__init__()
        self.fc1 = torch.nn.Linear(1, 64)
        self.fc2 = torch.nn.Linear(64, 64)
        self.fc3 = torch.nn.Linear(64, 64)
        self.fc4 = torch.nn.Linear(64, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        tanh_x = torch.tanh(x)
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(tanh_x))
        x = self.fc4(x)
        return x


class TestQuantization(TestCase):
    def compare_dict_tensors(self, ref_dict, res_dict, rtol=1e-3, atol=1e-3):
        if len(set(ref_dict.keys())) != len(set(res_dict.keys())):
            return False
        for key1 in ref_dict.keys():
            key2 = "_orig_mod." + key1
            assert key2 in res_dict, f"{key1} does not exist in traced module"
            # if both of them are None, continue
            if (
                not isinstance(ref_dict[key1], torch.Tensor)
                and not isinstance(res_dict[key2], torch.Tensor)
                and ref_dict[key1] is None
                and res_dict[key2] is None
            ):
                log.info(
                    f"None found with key1 and value1: {key1, ref_dict[key1]}, "
                    f"key2 and value2: {key2, res_dict[key2]}"
                )
                continue
            elif not torch.allclose(
                ref_dict[key1], res_dict[key2], rtol=rtol, atol=atol, equal_nan=True
            ):
                log.info(
                    f"gradient mismatch for eager and compiled modules, "
                    f"with eager: {ref_dict[key1]} and compiled: {res_dict[key2]}"
                )
                return False
        return True

    def compare_pred(self, module, traced, input, rtol=1e-3, atol=1e-3):
        ref = module(*input)
        res = traced(*input)
        self.assertEqual(ref, res, rtol=rtol, atol=atol)

    def compare_parameters(self, module, traced, rtol=1e-3, atol=1e-3):
        ref_params = dict(module.named_parameters())
        res_params = dict(traced.named_parameters())
        self.assertTrue(self.compare_dict_tensors(ref_params, res_params, rtol, atol))

    def compare_gradients(self, module, traced, rtol=1e-3, atol=1e-3):
        ref_grad = {key: param.grad for key, param in module.named_parameters()}
        res_grad = {key: param.grad for key, param in traced.named_parameters()}
        self.assertTrue(
            self.compare_dict_tensors(ref_grad, res_grad, rtol=rtol, atol=atol)
        )

    @requires_gpu()
    @torch._inductor.config.patch(
        pre_grad_fusion_options={},
        post_grad_fusion_options={
            "activation_quantization_aten_pass": {"quant_type": torch.float8_e5m2}
        },
    )
    def test_activation_quantization_aten(self):
        counters.clear()
        module = TargetCPModule().to(GPU_TYPE)
        input = [
            torch.rand(
                (16, 10), requires_grad=True, device=GPU_TYPE, dtype=torch.bfloat16
            ),
            torch.rand(
                (10, 16), requires_grad=True, device=GPU_TYPE, dtype=torch.bfloat16
            ),
        ]
        traced = torch.compile(module)
        ref = module(*input)
        res = traced(*input)
        self.compare_pred(module, traced, input)
        ref.sum().backward()
        res.sum().backward()
        self.compare_parameters(module, traced)
        self.compare_gradients(module, traced)
        self.assertEqual(counters["inductor"]["activation_quantization_aten_pass"], 3)
        self.assertTrue(torch.allclose(ref, res))
        counters.clear()

        module = FeedforwardNN().to(GPU_TYPE)
        X = np.linspace(-10, 10, 100).reshape(-1, 1).astype(np.float32)
        input = [
            torch.from_numpy(X).to(GPU_TYPE),
        ]
        traced = torch.compile(module)
        ref = module(*input)
        res = traced(*input)
        self.compare_pred(module, traced, input)
        ref.sum().backward()
        res.sum().backward()
        self.compare_parameters(module, traced)
        self.compare_gradients(module, traced)
        self.assertEqual(counters["inductor"]["activation_quantization_aten_pass"], 4)
        self.assertTrue(torch.allclose(ref, res))
        counters.clear()


if __name__ == "__main__":
    run_tests()
```

torch/_functorch/partitioners.py

Lines changed: 8 additions & 0 deletions
```diff
@@ -1994,6 +1994,14 @@ def classify_nodes(joint_module):
         joint_module, fw_module, bw_module, len(saved_sym_nodes)
     )
     bw_module = reordering_to_mimic_autograd_engine(bw_module)
+    # tag all activation nodes as quantized nodes; we can customize this later
+    for output in fw_module.graph.find_nodes(op="output"):
+        for node in output.args[0]:
+            if node.target in [torch.ops.aten.relu.default, torch.ops.aten.tanh.default]:
+                node.meta["saved_for_quantization"] = True
+    for placeholder in bw_module.graph.find_nodes(op="placeholder"):
+        if any(name in str(placeholder.target) for name in ["relu", "tanh", "sigmoid", "gelu"]):
+            placeholder.meta["saved_for_quantization"] = True

     if AOT_PARTITIONER_DEBUG:
         # Calculate sorted sizes of saved values
```
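The partitioner change tags forward-graph outputs produced by `relu`/`tanh` (and the matching backward placeholders) so a later pass knows those saved activations are candidates for quantization. Below is a small, self-contained illustration of that tagging idea; it is not code from this commit, and the function `f` and the use of `make_fx` are only for demonstration.

```python
import torch
from torch.fx.experimental.proxy_tensor import make_fx


def f(x):
    # Two activations whose results are graph outputs, mirroring the kind of
    # saved activations the partitioner change targets.
    relu_out = torch.relu(x)
    tanh_out = torch.tanh(relu_out)
    return relu_out, tanh_out


gm = make_fx(f)(torch.randn(4))

# Same loop shape as the partitioner change: mark activation outputs so a
# later pass can decide to store them in a lower-precision dtype.
for output in gm.graph.find_nodes(op="output"):
    for node in output.args[0]:
        if node.target in [torch.ops.aten.relu.default, torch.ops.aten.tanh.default]:
            node.meta["saved_for_quantization"] = True

tagged = [n.name for n in gm.graph.nodes if n.meta.get("saved_for_quantization")]
print(tagged)  # expected to list the relu and tanh nodes, e.g. ['relu', 'tanh']
```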

torch/_inductor/compile_fx.py

Lines changed: 10 additions & 4 deletions
```diff
@@ -370,16 +370,22 @@ def _recursive_joint_graph_passes(gm: GraphModule) -> None:
     joint_graph_passes(gm)


-def _recursive_post_grad_passes(gm: GraphModule, is_inference: bool = False) -> None:
+def _recursive_post_grad_passes(
+    gm: GraphModule,
+    is_inference: bool = False,
+    is_backward: bool = False,
+) -> None:
     with dynamo_timed(
         "_recursive_post_grad_passes",
         log_pt2_compile_event=True,
         dynamo_compile_column_us="post_grad_pass_time_us",
     ):
         for subgraph_name in _get_subgraph_names(gm):
             subgraph = getattr(gm, subgraph_name)
-            _recursive_post_grad_passes(subgraph, is_inference)
-        post_grad_passes(gm, is_inference)
+            _recursive_post_grad_passes(
+                subgraph, is_inference, is_backward)
+
+        post_grad_passes(gm, is_inference, is_backward)


 def split_const_gm(
@@ -982,7 +988,7 @@ def log_graph_runnable() -> str:
         # has some issues with memory in training
         cuda_context = get_cuda_device_context(gm)
         with cuda_context:
-            _recursive_post_grad_passes(gm, is_inference=is_inference)
+            _recursive_post_grad_passes(gm, is_inference=is_inference, is_backward=is_backward)
         V.debug.fx_graph_transformed(gm, example_inputs)
         post_grad_graphs_log.debug(
             "%s",
```

torch/_inductor/fx_passes/post_grad.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -72,7 +72,7 @@
 ]


-def post_grad_passes(gm: torch.fx.GraphModule, is_inference: bool):
+def post_grad_passes(gm: torch.fx.GraphModule, is_inference: bool, is_backward: bool):
     """
     Passes that run on after grad. This is called once on the forwards
     graph and once on the backwards graph.
@@ -132,6 +132,7 @@ def post_grad_passes(gm: torch.fx.GraphModule, is_inference: bool):
         if pass_name in POST_GRAD_FUSIONS:
             continue
         pattern_matcher_pass = POST_GRAD_PATTERNS[pass_name]
+        pattern_matcher_pass.is_backward = is_backward
         inductor_before_change = save_inductor_dict(
             [pattern_matcher_pass.pass_name]
         )
```
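Together with the `compile_fx.py` change above, this threads an `is_backward` flag from `compile_fx` down to each configured pattern-matcher pass, so the same pass pipeline can distinguish the forward graph from the backward graph. A hedged illustration of what the new attribute amounts to follows; this is not code from the commit, and `run_configured_pass` is a hypothetical helper.

```python
import torch
from torch._inductor.pattern_matcher import PatternMatcherPass

# A stand-in pass object; in post_grad_passes the real pass comes from
# POST_GRAD_PATTERNS[pass_name], keyed by post_grad_fusion_options.
pm_pass = PatternMatcherPass(pass_name="activation_quantization_aten_pass")


def run_configured_pass(gm: torch.fx.GraphModule, is_backward: bool) -> int:
    # Mirrors the loop in post_grad_passes: stamp the flag so the registered
    # patterns can tell whether they are rewriting the forward or backward
    # graph, then apply the pass and return the number of matches.
    pm_pass.is_backward = is_backward
    return pm_pass.apply(gm.graph)
```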
