10 | 10 | import itertools |
11 | 11 | import unittest |
12 | 12 | import warnings |
13 | | -from contextlib import ContextDecorator, nullcontext |
| 13 | +from contextlib import ContextDecorator, ExitStack, nullcontext |
14 | 14 | from functools import partial, wraps |
15 | 15 | from typing import Any, Callable, Optional, Union |
16 | 16 | from unittest.mock import patch |
@@ -6601,6 +6601,116 @@ def _inp(): |
6601 | 6601 | self.assertEqual(1, len(ctx.tangent_strides)) |
6602 | 6602 | self.assertEqual((128, 4, 16, 1), ctx.tangent_strides[0]) |
6603 | 6603 |
|
| 6604 | + @unittest.skipIf(not torch.cuda.is_available(), "CUDA is unavailable") |
| 6605 | + def test_saved_tensors_hooks(self): |
| 6606 | + def _test_pack_hooks(fn, inp_fn, hooks): |
| 6607 | + # TODO XXX: Add Dynamo ID_MATCH guards on hooks |
| 6608 | + torch._dynamo.reset() |
| 6609 | + with ExitStack() as stack: |
| 6610 | + for hook in hooks: |
| 6611 | + pack, unpack = hook |
| 6612 | + stack.enter_context( |
| 6613 | + torch.autograd.graph.saved_tensors_hooks(pack, unpack) |
| 6614 | + ) |
| 6615 | + ref_x = inp_fn() |
| 6616 | + x = ref_x.detach().clone().requires_grad_() |
| 6617 | + |
| 6618 | + ref_y = fn(ref_x) |
| 6619 | + ref_y.sum().backward() |
| 6620 | + |
| 6621 | + torch._dynamo.mark_dynamic(x, 0) |
| 6622 | + torch._dynamo.mark_dynamic(x, 1) |
| 6623 | + y = torch.compile(fn, backend="aot_eager", fullgraph=True)(x) |
| 6624 | + y.sum().backward() |
| 6625 | + self.assertEqual(ref_y, y, atol=1e-2, rtol=1e-2) |
| 6626 | + self.assertEqual(ref_x.grad, x.grad, atol=1e-2, rtol=1e-2) |
| 6627 | + |
| 6628 | + class SAF(torch.autograd.Function): |
| 6629 | + @staticmethod |
| 6630 | + def forward(ctx, x): |
| 6631 | + ctx.save_for_backward(x) |
| 6632 | + return x |
| 6633 | + |
| 6634 | + @staticmethod |
| 6635 | + def backward(ctx, gx): |
| 6636 | + (saved_x,) = ctx.saved_tensors |
| 6637 | + return gx + saved_x |
| 6638 | + |
| 6639 | + class AF(torch.autograd.Function): |
| 6640 | + @staticmethod |
| 6641 | + def forward(ctx, x): |
| 6642 | + ctx.save_for_backward(x) |
| 6643 | + ctx.d1 = x.size(1) |
| 6644 | + return x |
| 6645 | + |
| 6646 | + @staticmethod |
| 6647 | + def backward(ctx, gx): |
| 6648 | + (saved_x,) = ctx.saved_tensors |
| 6649 | + d1 = ctx.d1 |
| 6650 | + return gx + saved_x * d1 |
| 6651 | + |
| 6652 | + def fn(x): |
| 6653 | + x = x.relu() |
| 6654 | + x = x + 1 |
| 6655 | + x = 2 * x |
| 6656 | + x = AF.apply(x) |
| 6657 | + return x |
| 6658 | + |
| 6659 | + def simple_fn(x): |
| 6660 | + x = x + 1 |
| 6661 | + x = SAF.apply(x) |
| 6662 | + return x |
| 6663 | + |
| 6664 | + device = torch.device("cuda:0") |
| 6665 | + |
| 6666 | + def inp_fn(): |
| 6667 | + return torch.ones(2, 3, device=device, requires_grad=True) |
| 6668 | + |
| 6669 | + def pack_dev_sym_cpu(x): |
| 6670 | + return (x.device, x.size(0), 10 * x.cpu()) |
| 6671 | + |
| 6672 | + def unpack_dev_sym_cpu(packed): |
| 6673 | + device, dim0, tensor = packed |
| 6674 | + ret = tensor.to(device=device) * dim0 |
| 6675 | + return ret |
| 6676 | + |
| 6677 | + def pack_tensor(x): |
| 6678 | + return x.cpu() |
| 6679 | + |
| 6680 | + def unpack_tensor(packed): |
| 6681 | + t_cpu = packed |
| 6682 | + return t_cpu.to(device=device) |
| 6683 | + |
| 6684 | + def pack_bf16(x): |
| 6685 | + return x.to(dtype=torch.bfloat16) |
| 6686 | + |
| 6687 | + def unpack_bf16(x): |
| 6688 | + return x.to(dtype=torch.float) |
| 6689 | + |
| 6690 | + def pack_mul2(x): |
| 6691 | + return x * 2 |
| 6692 | + |
| 6693 | + def unpack_mul2(x): |
| 6694 | + return x / 2 |
| 6695 | + |
| 6696 | + def pack_float8(x): |
| 6697 | + return (x.dtype, x.to(torch.float8_e4m3fn)) |
| 6698 | + |
| 6699 | + def unpack_float8(packed): |
| 6700 | + dtype, tensor = packed |
| 6701 | + return tensor.to(dtype) |
| 6702 | + |
| 6703 | + for test_fn in [simple_fn, fn]: |
| 6704 | + _test_pack_hooks(test_fn, inp_fn, [(pack_bf16, unpack_bf16)]) |
| 6705 | + _test_pack_hooks(test_fn, inp_fn, [(pack_mul2, unpack_mul2)]) |
| 6706 | + _test_pack_hooks( |
| 6707 | + test_fn, inp_fn, [(pack_mul2, unpack_mul2), (pack_bf16, unpack_bf16)] |
| 6708 | + ) |
| 6709 | + _test_pack_hooks(test_fn, inp_fn, [(pack_float8, unpack_float8)]) |
| 6710 | + _test_pack_hooks(test_fn, inp_fn, [(pack_tensor, unpack_tensor)]) |
| 6711 | + _test_pack_hooks(test_fn, inp_fn, [(pack_dev_sym_cpu, unpack_dev_sym_cpu)]) |
| 6712 | + # TODO XXX: Test packing/unpacking to subclasses |
| 6713 | + |
6604 | 6714 |
|
6605 | 6715 | # entries in here don't work and need to be fixed. |
6606 | 6716 | # Each one of these is a bug (or needs to be investigated) |
|
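For reference, a minimal standalone sketch of the torch.autograd.graph.saved_tensors_hooks API that the added test exercises (not part of the diff; the hook names mirror the pack_bf16/unpack_bf16 pair above and are illustrative only):

import torch

# pack runs when autograd saves a tensor for the backward pass; unpack runs
# when backward retrieves it. Here the saved activation is kept in bfloat16.
def pack_bf16(t):
    return t.to(dtype=torch.bfloat16)

def unpack_bf16(t):
    return t.to(dtype=torch.float32)

x = torch.randn(2, 3, requires_grad=True)
with torch.autograd.graph.saved_tensors_hooks(pack_bf16, unpack_bf16):
    y = (x * x).sum()  # x is saved for backward through the hooks
y.backward()           # unpack_bf16 restores the saved tensor here

The new test wraps this context manager in an ExitStack so that several pack/unpack pairs can be nested, then compares eager gradients against torch.compile(backend="aot_eager") with dynamic dims marked on the input.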