Commit 013fe5e

[aotd] Support saved tensors hooks in aot_autograd

ghstack-source-id: 3075c63
Pull Request resolved: #150032

1 parent 7bb9c36 commit 013fe5e
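
For context, a minimal usage sketch of what this commit works toward (an assumed illustration, not code from the commit): saved-tensors hooks wrapped around a torch.compile'd function, so that pack/unpack become part of the AOTAutograd forward and backward graphs. The aot_eager backend and fullgraph flag mirror the test added below; the function and tensor are placeholders.

    import torch

    def f(x):
        return (x.relu() + 1).sum()

    x = torch.randn(8, requires_grad=True)
    # pack compresses the saved activation to bf16; unpack restores fp32 for backward
    with torch.autograd.graph.saved_tensors_hooks(
        lambda t: t.to(torch.bfloat16),
        lambda t: t.to(torch.float32),
    ):
        y = torch.compile(f, backend="aot_eager", fullgraph=True)(x)
    y.backward()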

File tree

8 files changed: +349 -13 lines changed


aten/src/ATen/SavedTensorHooks.cpp

Lines changed: 14 additions & 6 deletions

@@ -62,22 +62,30 @@ void SavedTensorDefaultHooks::lazy_initialize() {
 
 void SavedTensorDefaultHooks::push_hooks(SafePyObject pack_hook, SafePyObject unpack_hook) {
   TORCH_INTERNAL_ASSERT(is_initialized);
   assertSavedTensorHooksNotDisabled();
-  tls.stack.emplace(std::move(pack_hook), std::move(unpack_hook));
+  tls.stack.emplace_back(std::move(pack_hook), std::move(unpack_hook));
 }
 
 std::pair<SafePyObject, SafePyObject> SavedTensorDefaultHooks::pop_hooks() {
   TORCH_INTERNAL_ASSERT(is_initialized && !tls.stack.empty());
-  std::pair<SafePyObject, SafePyObject> hooks = std::move(tls.stack.top());
-  tls.stack.pop();
+  std::pair<SafePyObject, SafePyObject> hooks = std::move(tls.stack.back());
+  tls.stack.pop_back();
   return hooks;
 }
 
-std::optional<std::pair<SafePyObject, SafePyObject>> SavedTensorDefaultHooks::get_hooks() {
+std::optional<std::pair<SafePyObject, SafePyObject>> SavedTensorDefaultHooks::get_hooks(bool ignore_is_tracing) {
   // For tls.is_tracing, see NOTE: [Deferring tensor pack/unpack hooks until runtime]
-  if (!is_initialized || tls.stack.empty() || tls.is_tracing) {
+  if (!is_initialized || tls.stack.empty() || (!ignore_is_tracing && tls.is_tracing)) {
     return std::nullopt;
   }
-  return tls.stack.top();
+  return tls.stack.back();
+}
+
+std::optional<std::vector<std::pair<SafePyObject, SafePyObject>>>
+SavedTensorDefaultHooks::get_all_hooks(bool ignore_is_tracing) {
+  if (!is_initialized || tls.stack.empty() || (!ignore_is_tracing && tls.is_tracing)) {
+    return std::nullopt;
+  }
+  return tls.stack;
 }
 
 }
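
The std::stack to std::vector switch exists so the new get_all_hooks() can return the whole hook stack without popping. A small eager-mode sketch of the stack semantics it preserves (nested saved_tensors_hooks is a real API; the payloads here are arbitrary illustrations):

    import torch

    x = torch.randn(4, requires_grad=True)
    with torch.autograd.graph.saved_tensors_hooks(lambda t: ("outer", t), lambda p: p[1]):
        with torch.autograd.graph.saved_tensors_hooks(lambda t: ("inner", t), lambda p: p[1]):
            y = (x * x).sum()  # x is packed by the innermost ("inner") pair only
    y.backward()
    assert torch.equal(x.grad, 2 * x)  # pack/unpack round-trips, grads unaffected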

aten/src/ATen/SavedTensorHooks.h

Lines changed: 4 additions & 2 deletions

@@ -15,7 +15,7 @@ namespace impl {
 
 struct TORCH_API SavedTensorDefaultHooksTLS {
   // PyObject is defined in c10/util/python_stub.h
-  std::stack<std::pair<c10::SafePyObject, c10::SafePyObject>> stack;
+  std::vector<std::pair<c10::SafePyObject, c10::SafePyObject>> stack;
 
   // See NOTE: [Disabling SavedTensorDefaultHooks] for context
   // NOTE: [disabled_error_message invariant]
@@ -36,7 +36,9 @@ struct TORCH_API SavedTensorDefaultHooks {
       c10::SafePyObject unpack_hook);
   static std::pair<c10::SafePyObject, c10::SafePyObject> pop_hooks();
   static std::optional<std::pair<c10::SafePyObject, c10::SafePyObject>>
-  get_hooks();
+  get_hooks(bool ignore_is_tracing = false);
+  static std::optional<std::vector<std::pair<SafePyObject, SafePyObject>>>
+  get_all_hooks(bool ignore_is_tracing = false);
   static void lazy_initialize();
 
   static const impl::SavedTensorDefaultHooksTLS& get_tls_state();
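
A hypothetical Python model of the TLS semantics declared above (illustration only, not PyTorch source): get_hooks returns the innermost pair, get_all_hooks the whole stack, and both return nothing during tracing unless ignore_is_tracing is set.

    class SavedTensorHooksTLSModel:
        def __init__(self):
            self.stack = []        # list of (pack_hook, unpack_hook) pairs
            self.is_tracing = False

        def get_hooks(self, ignore_is_tracing=False):
            if not self.stack or (not ignore_is_tracing and self.is_tracing):
                return None        # mirrors std::nullopt
            return self.stack[-1]  # innermost pair, mirrors tls.stack.back()

        def get_all_hooks(self, ignore_is_tracing=False):
            if not self.stack or (not ignore_is_tracing and self.is_tracing):
                return None
            return list(self.stack)  # whole stack, mirrors get_all_hooks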

test/functorch/test_aotdispatch.py

Lines changed: 122 additions & 1 deletion

@@ -10,7 +10,7 @@
 import itertools
 import unittest
 import warnings
-from contextlib import ContextDecorator, nullcontext
+from contextlib import ContextDecorator, nullcontext, ExitStack
 from functools import partial, wraps
 from typing import Any, Callable, Optional, Union
 from unittest.mock import patch
@@ -6601,6 +6601,127 @@ def _inp():
         self.assertEqual(1, len(ctx.tangent_strides))
         self.assertEqual((128, 4, 16, 1), ctx.tangent_strides[0])
 
+    def test_saved_tensors_hooks(self):
+        def _test_pack_hooks(fn, inp_fn, hooks):
+            torch._dynamo.reset()
+            with ExitStack() as stack:
+                for hook in hooks:
+                    pack, unpack = hook
+                    stack.enter_context(
+                        torch.autograd.graph.saved_tensors_hooks(pack, unpack)
+                    )
+                ref_x = inp_fn()
+                x = ref_x.detach().clone().requires_grad_()
+
+                print("XXX EAGER BEGIN")
+                ref_y = fn(ref_x)
+                ref_y.sum().backward()
+                print("XXX EAGER END")
+
+                torch._dynamo.mark_dynamic(x, 0)
+                torch._dynamo.mark_dynamic(x, 1)
+                y = torch.compile(fn, backend="aot_eager", fullgraph=True)(x)
+                y.sum().backward()
+                self.assertEqual(ref_y, y, atol=1e-2, rtol=1e-2)
+                print(f"XXX REF_X.GRAD:{ref_x.grad}")
+                print(f"XXX X.GRAD:{x.grad}")
+                self.assertEqual(ref_x.grad, x.grad, atol=1e-2, rtol=1e-2)
+
+        from torch.utils._traceback import CapturedTraceback
+
+        def _print_traceback():
+            print("".join(CapturedTraceback.extract(cpp=True).format()))
+
+        class SAF(torch.autograd.Function):
+            @staticmethod
+            def forward(ctx, x):
+                ctx.save_for_backward(x)
+                return x
+
+            @staticmethod
+            def backward(ctx, gx):
+                (saved_x,) = ctx.saved_tensors
+                return gx + saved_x
+
+        class AF(torch.autograd.Function):
+            @staticmethod
+            def forward(ctx, x):
+                ctx.save_for_backward(x)
+                ctx.d1 = x.size(1)
+                return x
+
+            @staticmethod
+            def backward(ctx, gx):
+                (saved_x,) = ctx.saved_tensors
+                d1 = ctx.d1
+                return gx + saved_x * d1
+
+        def fn(x):
+            x = x.relu()
+            x = x + 1
+            x = 2 * x
+            x = AF.apply(x)
+            return x
+
+        def simple_fn(x):
+            x = x + 1
+            x = SAF.apply(x)
+            return x
+
+        device = torch.device("cuda:0")
+
+        def inp_fn():
+            return torch.ones(2, 3, device=device, requires_grad=True)
+
+        def pack_dev_sym_cpu(x):
+            return (x.device, x.size(0), x.cpu())
+
+        def unpack_dev_sym_cpu(packed):
+            device, dim0, tensor = packed
+            ret = tensor.to(device=device) * dim0
+            return ret
+
+        def pack_tensor(x):
+            return x.cpu()
+
+        def unpack_tensor(packed):
+            t_cpu = packed
+            return t_cpu.to(device=device)
+
+        def pack_bf16(x):
+            print("XXX PACK_BF16")
+            return x.to(dtype=torch.bfloat16)
+
+        def unpack_bf16(x):
+            print("XXX UNPACK_BF16")
+            return x.to(dtype=torch.float)
+
+        def pack_mul2(x):
+            print("XXX PACK_MUL2")
+            return x * 2
+
+        def unpack_mul2(x):
+            print("XXX UNPACK_MUL2")
+            return x / 2
+
+        def pack_two_tensor(x):
+            return TwoTensor(x, x)
+
+        def unpack_two_tensor(x):
+            return x.a
+
+        for test_fn in [simple_fn]:
+            # print("XXX 0")
+            # _test_pack_hooks(test_fn, inp_fn, [(pack_bf16, unpack_bf16)])
+            # print("XXX 1")
+            # _test_pack_hooks(test_fn, inp_fn, [(pack_mul2, unpack_mul2)])
+            # print("XXX 2")
+            # _test_pack_hooks(test_fn, inp_fn, [(pack_mul2, unpack_mul2), (pack_bf16, unpack_bf16)])
+            print("XXX 3")
+            _test_pack_hooks(test_fn, inp_fn, [(pack_dev_sym_cpu, unpack_dev_sym_cpu)])
+            # print("XXX 4")
+            # _test_pack_hooks(test_fn, inp_fn, [(pack_tensor, unpack_tensor)])
+            # print("XXX 5")
+            # _test_pack_hooks(test_fn, inp_fn, [(pack_two_tensor, unpack_two_tensor)])
+        # TODO XXX: Add packing to subclasses
+
 
 # entries in here don't work and need to be fixed.
 # Each one of these is a bug (or needs to be investigated)
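
The eager-mode baseline the test compares against can be reproduced without CUDA. A minimal sketch with the device swapped to cpu (hook names mirror the test's pack_tensor/unpack_tensor; the arithmetic is only for illustration):

    import torch

    device = torch.device("cpu")

    def pack_tensor(x):
        return x.cpu()                   # offload the saved activation

    def unpack_tensor(packed):
        return packed.to(device=device)  # restore it for backward

    x = torch.ones(2, 3, requires_grad=True)
    with torch.autograd.graph.saved_tensors_hooks(pack_tensor, unpack_tensor):
        y = (x + 1) ** 2                 # (x + 1) is saved via pack_tensor
    y.sum().backward()
    assert torch.equal(x.grad, torch.full((2, 3), 4.0))  # 2 * (x + 1) = 4 at x = 1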

torch/_dynamo/variables/builder.py

Lines changed: 5 additions & 1 deletion

@@ -2458,7 +2458,11 @@ def _wrap_fx_proxy(
     assert "example_value" not in proxy.node.meta, f"{proxy.node.meta['example_value']}"
 
     # See NOTE: [Deferring tensor pack/unpack hooks until runtime]
-    with torch._dynamo.utils._disable_saved_tensors_hooks_during_tracing():
+    import contextlib
+
+    with (
+        contextlib.nullcontext()
+    ):  # torch._dynamo.utils._disable_saved_tensors_hooks_during_tracing():
         # with preserve_rng_state():
         # only allow_non_graph_fake in this instance because we handle the non-fake
         # cases properly below.
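
The hunk above swaps the hook-disabling guard for contextlib.nullcontext(), a no-op context manager, keeping the with-block structure while leaving saved-tensor hooks visible during tracing. A sketch of the pattern (the wrapper name and flag are hypothetical):

    import contextlib

    import torch

    def tracing_guard(disable_hooks: bool):
        # nullcontext() does nothing on enter/exit, so with disable_hooks=False
        # the wrapped region runs with saved-tensor hooks left active.
        if disable_hooks:
            return torch._dynamo.utils._disable_saved_tensors_hooks_during_tracing()
        return contextlib.nullcontext()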

torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py

Lines changed: 152 additions & 3 deletions

@@ -21,6 +21,7 @@
 from typing import Any, Callable, Optional, TYPE_CHECKING
 
 import torch
+import torch.utils._pytree as pytree
 import torch.utils.dlpack
 from torch import Tensor
 from torch._dynamo.utils import detect_fake_mode, lazy_format_graph_code
@@ -29,12 +30,13 @@
 from torch._subclasses import FakeTensor
 from torch._subclasses.meta_utils import is_sparse_any
 from torch.fx.experimental._backward_state import BackwardState
-from torch.fx.experimental.proxy_tensor import is_sym_node
+from torch.fx.experimental.proxy_tensor import is_sym_node, make_fx
 from torch.fx.experimental.symbolic_shapes import fx_placeholder_vals
 from torch.fx.graph_module import GraphModule
 from torch.fx.passes._tensorify_python_scalars import tensorify_python_scalars
 from torch.multiprocessing.reductions import StorageWeakRef
 from torchgen.utils import dataclass_repr
+from torch.types import py_sym_types
 
 from .. import config
 from .autograd_cache import (
@@ -877,13 +879,160 @@ def aot_dispatch_autograd(
     # we only need to bookkeep the symints that are saved for bw, not any symints
     # the user forward might have returned in its own output
     fw_outs_saved_for_bw = fw_outs[num_inner_fwd_outputs:]
-    num_fw_outs_saved_for_bw = len(fw_outs_saved_for_bw)
+    num_saved_for_bw = len(fw_outs_saved_for_bw)
     symint_outs_saved_for_bw = [
         n for n in fw_outs_saved_for_bw if is_sym_node(n)
     ]
+    num_symints_saved_for_bw = len(symint_outs_saved_for_bw)
+    num_saved_tensors = len(fw_outs_saved_for_bw) - num_symints_saved_for_bw
+
+    # TODO XXX: Note about handling saved_tensors_hooks
+    #
+    hooks = torch._C._autograd._top_saved_tensors_default_hooks(True)
+
+    print(f"XXX JIT_COMP_RUNTIME_WRAP hooks:{hooks}")
+    # TODO XXX: Set compilation guards on hooks py objects, to recompile if previous hooks changed
+    if hooks:
+        # TODO: XXX Support stacked hooks
+        pack_hook, unpack_hook = hooks
+        assert pack_hook and unpack_hook
+        fw_g = fw_module.graph
+        bw_g = bw_module.graph
+        bw_g_inputs = bw_g.find_nodes(op="placeholder")
+        print(f"XXX FW_GRAPH BEFORE:{fw_g}")
+        print(f"XXX BW_GRAPH BEFORE:{bw_g}")
+
+        fw_out_n = fw_g.output_node()
+        fw_out_args = list(fw_out_n.args[0])
+        fw_outs_insert_tensors = []
+        fw_outs_insert_non_tensors = []
+        for saved in fw_outs_saved_for_bw:
+            val = saved.meta["val"]
+            if isinstance(val, torch.Tensor):
+                pack_gm = make_fx(pack_hook)(val)
+                pack_g = pack_gm.graph
+                print(f"XXX PACK_GRAPH:{pack_g}")
+                pack_out_val = pack_gm(val)
+                # Install pack_g as epilogue of fw_module and replace saved outputs with pack_g outputs
+                pack_g_inputs = pack_g.find_nodes(op="placeholder")
+                assert len(pack_g_inputs) == 1
+                env = {pack_g_inputs[0]: saved}
+                with fw_g.inserting_after(saved):
+                    new_out_n = None
+                    for node in pack_g.nodes:
+                        if node.op == "placeholder":
+                            continue
+                        new_n = fw_g.node_copy(node, lambda n: env[n])
+                        env[node] = new_n
+                        if node.op == "output":
+                            new_out_n = new_n
+
+                assert new_out_n
+                for n in pytree.tree_leaves(new_out_n.args[0]):
+                    if not isinstance(n, torch.fx.Node):
+                        continue
+
+                    out_val = n.meta["val"]
+                    if isinstance(out_val, torch.Tensor):
+                        fw_outs_insert_tensors.append(n)
+                    elif is_sym_node(n):
+                        fw_outs_insert_non_tensors.append(n)
+
+                fw_g.erase_node(new_out_n)
+
+                # Install unpack_g as prologue of bw_module
+                unpack_gm = make_fx(unpack_hook)(pack_out_val)
+                unpack_out_val = unpack_gm(pack_out_val)
+                unpack_g = unpack_gm.graph
+                print(f"XXX PACK_OUT_VAL:{pack_out_val}")
+                print(f"XXX UNPACK_OUT_VAL:{unpack_out_val}")
+                print(f"XXX UNPACK_GRAPH:{unpack_g}")

+                def find_saved_in_bw_inputs(bw_inputs):
+                    for n in bw_inputs:
+                        # TODO: XXX Recheck validity of this identification :)
+                        if n.name == saved.name:
+                            return n
+
+                bw_g_input = find_saved_in_bw_inputs(bw_g_inputs)
+                assert bw_g_input
+                # Replace bw_g input with copy of output of pack
+
+                unpack_g_inputs = unpack_g.find_nodes(op="placeholder")
+                env = {}
+                # Adding unpack inputs to bw graph instead of saved
+                for unp_in_n, val in zip(
+                    unpack_g_inputs, pytree.tree_leaves(pack_out_val)
+                ):
+                    is_sym = isinstance(val, py_sym_types)
+                    if isinstance(val, torch.Tensor) or is_sym:
+                        new_node_name = bw_g_input.name + "_" + unp_in_n.name
+                        # Backward calling convention: ctx_symints...ctx_saved_tensors...
+                        if is_sym:
+                            with bw_g.inserting_before(bw_g_inputs[0]):
+                                new_n = bw_g.placeholder(new_node_name)
+                        else:
+                            with bw_g.inserting_before(bw_g_inputs[num_saved_for_bw]):
+                                new_n = bw_g.placeholder(new_node_name)
+                        new_n.meta["val"] = val
+                        env[unp_in_n] = new_n
+                    else:
+                        # Inline values of non-Tensor, non-SymScalars
+                        env[unp_in_n] = val
+
+                new_out_n = None
+                with bw_g.inserting_before(bw_g_inputs[-1]):
+                    for node in unpack_g.nodes:
+                        if node.op == "placeholder":
+                            continue
+                        new_n = bw_g.node_copy(node, lambda n: env[n])
+                        env[node] = new_n
+                        if node.op == "output":
+                            new_out_n = new_n
+
+                # TODO XXX: Debug why unpack graph produces [node] instead of node
+                # For unpack function
+                # def unpack_dev_sym_cpu(packed):
+                #     device, dim0, tensor = packed
+                #     return tensor.to(device=device) * dim0
+                #
+                # assert len(new_out_n.args) == 1
+                # print(f"XXX NEW_OUT_N.ARGS:{new_out_n.args}")
+                # unpack_saved_tensor_n = new_out_n.args[0]
+                unpack_saved_tensor_n = pytree.tree_leaves(new_out_n.args)[0]
+
+                bw_g_input.replace_all_uses_with(unpack_saved_tensor_n)
+                bw_g.erase_node(new_out_n)
+                bw_g.erase_node(bw_g_input)
+        fw_out_n.args = (
+            tuple(
+                pytree.tree_leaves((
+                    fw_outs[:num_inner_fwd_outputs],
+                    fw_outs_insert_tensors,
+                    fw_outs_insert_non_tensors,
+                    symint_outs_saved_for_bw,
+                ))
+            ),
+        )
+        print(f"\nXXX FW_GRAPH AFTER:{fw_g}")
+        print(f"\nXXX BW_GRAPH AFTER:{bw_g}")
+        fw_g.lint()
+        bw_g.lint()
+        # TODO: Refactor compute below to deduplicate
+        fw_outs = next(iter(fw_module.graph.find_nodes(op="output"))).args[0]
+        fw_outs_saved_for_bw = fw_outs[num_inner_fwd_outputs:]
+        num_fw_outs_saved_for_bw = len(fw_outs_saved_for_bw)
+        num_saved_for_bw = len(fw_outs_saved_for_bw)
+        symint_outs_saved_for_bw = [
+            n for n in fw_outs_saved_for_bw if is_sym_node(n)
+        ]
+        num_symints_saved_for_bw = len(symint_outs_saved_for_bw)
+        num_saved_tensors = len(fw_outs_saved_for_bw) - num_symints_saved_for_bw
+
     fw_metadata.num_symints_saved_for_bw = len(symint_outs_saved_for_bw)
     inner_meta.num_symints_saved_for_bw = len(symint_outs_saved_for_bw)
-    num_symints_saved_for_bw = len(symint_outs_saved_for_bw)
+
 
     if torch._functorch.config.donated_buffer:
         fw_metadata.bw_donated_idxs = collect_bw_donated_buffer_idxs(
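
A self-contained sketch of the splicing technique this hunk relies on (the toy hook and names are illustrative, not the commit's code): trace a pack-style hook with make_fx, then inline its nodes into a host graph via node_copy and an env map, the same pattern as the forward-epilogue install above.

    import torch
    from torch.fx.experimental.proxy_tensor import make_fx

    def hook(t):
        return t.to(torch.bfloat16)   # stands in for a pack hook

    def host_fn(x):
        return x.relu()

    host_gm = make_fx(host_fn)(torch.randn(3))
    hook_gm = make_fx(hook)(torch.randn(3))

    host_g = host_gm.graph
    out_n = host_g.output_node()
    anchor = out_n.args[0]            # the value we pretend is "saved for backward"

    # Map the hook's placeholder onto the anchor, then copy the remaining nodes
    # into the host graph: the node_copy/env pattern used in the hunk above.
    env = {next(iter(hook_gm.graph.find_nodes(op="placeholder"))): anchor}
    for node in hook_gm.graph.nodes:
        if node.op == "placeholder":
            continue
        if node.op == "output":
            out_n.args = (env[node.args[0]],)   # rewire host output to packed value
            continue
        with host_g.inserting_before(out_n):
            env[node] = host_g.node_copy(node, lambda n: env[n])

    host_g.lint()
    host_gm.recompile()
    print(host_gm(torch.randn(3)).dtype)  # torch.bfloat16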
