NVIDIA
diff --git a/‎apex/contrib/torchsched/__init__.py‎
Lines changed: 2 additions & 2 deletions b/‎apex/contrib/torchsched/__init__.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎apex/contrib/torchsched/backend.py‎
Lines changed: 42 additions & 20 deletions b/‎apex/contrib/torchsched/backend.py‎
Lines changed: 42 additions & 20 deletions
diff --git a/‎apex/contrib/torchsched/config.py‎
Lines changed: 52 additions & 0 deletions b/‎apex/contrib/torchsched/config.py‎
Lines changed: 52 additions & 0 deletions
diff --git a/‎apex/contrib/torchsched/inductor/_utils.py‎
Lines changed: 1 addition & 1 deletion b/‎apex/contrib/torchsched/inductor/_utils.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎apex/contrib/torchsched/inductor/event.py‎
Lines changed: 33 additions & 10 deletions b/‎apex/contrib/torchsched/inductor/event.py‎
Lines changed: 33 additions & 10 deletions
@@ -13,8 +13,8 @@
 from .backend import get_backend
 
 if TYPE_CHECKING:
+    from collections.abc import Callable
     from typing import Any
-    from typing import Callable
 
     from torch._ops import OpOverload
 
@@ -50,7 +50,7 @@ def set_default_backend(backend: str) -> None:
     Parameters:
         backend (str): The backend to use as the default for torch.compile.
     """
-    global _SUPPORTED_BACKENDS, _DEFAULT_BACKEND
+    global _DEFAULT_BACKEND
     assert backend in _SUPPORTED_BACKENDS, f"Unknown backend {backend}"
     _DEFAULT_BACKEND = backend
 
 
@@ -5,27 +5,23 @@
 import functools
 from copy import copy
 from typing import TYPE_CHECKING
-from typing import Callable
 from typing import ParamSpec
 from typing import TypeVar
 
 if TYPE_CHECKING:
+    from collections.abc import Callable
     from types import NotImplementedType
 
 import torch
 from torch import Tensor
 from torch import _TorchCompileInductorWrapper
 from torch._dynamo import lookup_backend
-from torch._inductor.codegen.common import register_backend_for_device
-from torch._inductor.codegen.cuda_combined_scheduling import CUDACombinedScheduling
-from torch._inductor.codegen.wrapper import PythonWrapperCodegen
 from torch._inductor.compile_fx import compile_fx
 from torch._inductor.compile_fx import compile_fx_inner
 from torch._inductor.decomposition import select_decomp_table
 
 import apex.contrib.torchsched.config as config
 from apex.contrib.torchsched.inductor import patch_graph_lowering
-from apex.contrib.torchsched.inductor.wrapper import MultiStreamWrapperCodegen
 from apex.contrib.torchsched.passes import pre_grad_custom_pass
 
 aten = torch.ops.aten
@@ -43,21 +39,9 @@ def enable_multi_stream_scheduling(compile_fn: Callable[P, R]) -> Callable[P, R]
 
     @functools.wraps(compile_fn)
     def _compile_wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
-        register_backend_for_device("cuda", CUDACombinedScheduling, MultiStreamWrapperCodegen)
         patch_graph_lowering(patch=True)
-
-        # torch.compile explicitly calls `write_get_raw_stream` via wrapper's class method in its
-        # lowering process to walk around the wrapper-stream LRU cache mechanism. To be compatible
-        # with this, we got to patch wrapper's class method as well.
-        _origin_write_get_raw_stream = PythonWrapperCodegen.write_get_raw_stream
-        PythonWrapperCodegen.write_get_raw_stream = MultiStreamWrapperCodegen._write_get_raw_stream
-
         compile_results = compile_fn(*args, **kwargs)
-
-        register_backend_for_device("cuda", CUDACombinedScheduling, PythonWrapperCodegen)
         patch_graph_lowering(patch=False)
-        PythonWrapperCodegen.write_get_raw_stream = _origin_write_get_raw_stream
-
         return compile_results
 
     return _compile_wrapper
@@ -302,11 +286,49 @@ def get_backend(
     if backend == "torch":
         return lookup_backend("inductor")
 
+    # [NOTE] Disable buffer reuse and inplace buffers to avoid inter-stream conflicts.
+    #
+    # In PyTorch Inductor, the safety of buffer reuse and in-place buffer update is ensured by the
+    # program's single-stream, serial execution. That is, if op2 is launched only after op1 has
+    # completed execution, then these cases are safe:
+    #
+    #   Case 1: Safe to reuse buffer `workspace1` as `op2`'s workspace.
+    #
+    #         op1   ->   op2              op1   ->   op2
+    #          ↕          ↕       ⇒        ↕          ↑
+    #     workspace1 workspace2       workspace1 ←----┘
+    #
+    #   Case 2: Safe to write `op1`'s output in place into `buf1`, then pass it to `op2` as input.
+    #
+    #     buf1 -> op1 -> buf2 -> op2  ⇒  buf1 ↔ op1
+    #                                     └---> op2
+    #
+    # However, if operators are dispatched to distinct CUDA Streams and execute in parallel, the
+    # above cases are no longer safe:
+    #
+    #   Counter example 1: Case 1 is not safe if op1 and op2 are in parallel.
+    #
+    #        op1
+    #         ↕
+    #     workspace1 (Buffer modified concurrently by op1 and op2.)
+    #         ↕
+    #        op2
+    #
+    #   Counter example 2: Case 2 is not safe if op1 and op2 are in parallel.
+    #
+    #     buf1 <--> op1
+    #      └------> op2 (Op2 could read op1's input data.)
+    #
+    # Thus, for now, we disable both buffer reuse and in-place buffer updates to ensure
+    # multi-stream correctness.
+    #
+    # TODO(@davidli): Add cross-stream dependencies to Inductor scheduling's dependency system so
+    # we can safely reuse buffers and update them in place even in multi-stream scenarios.
+
     if scheme == "dwb":
         return DecompositionsWrapper(
             mode="default",
-            # TODO(@davidli): Elegantly solve cross-stream buffer reusing conflicts.
-            options={"allow_buffer_reuse": False},
+            options={"allow_buffer_reuse": False, "inplace_buffers": False},
             dynamic=False,
             decompositions={
                 aten.convolution_backward.default: convolution_backward_decomp_dwb,
@@ -315,7 +337,7 @@ def get_backend(
     elif scheme == "wbd":
         return DecompositionsWrapper(
             mode="default",
-            options={"allow_buffer_reuse": False},
+            options={"allow_buffer_reuse": False, "inplace_buffers": False},
             dynamic=False,
             decompositions={
                 aten.convolution_backward.default: convolution_backward_decomp_wbd,
 
@@ -1,6 +1,8 @@
 """Configurations for graph scheduler."""
 
+import functools
 import os
+import re
 import sys
 
 # Debug info and dump grpahs
@@ -17,6 +19,56 @@
 # scheduled to other streams in a round-robin way.
 num_streams = int(os.getenv("TORCH_SCHED_NUM_STREAMS", "8"))
 
+
+def _get_skip_post_grad_graph_ids() -> set[int]:
+    # Parse e.g. "1,2,3-5" into {1, 2, 3, 4, 5}; blank items (stray commas) are ignored.
+    result: set[int] = set()
+    for part in os.environ.get("TORCH_SCHED_SKIP_GRAPH_IDS", "").split(","):
+        if not part.strip():
+            continue
+        if "-" in part:
+            start, end = map(int, part.split("-"))
+            result.update(range(start, end + 1))
+        else:
+            result.add(int(part))
+    return result
+
+
+# IDs of post AOT-autograd graphs that should be skipped for multi-stream scheduling. Can be
+# specified via TORCH_SCHED_SKIP_GRAPH_IDS environment variable in a SLURM-like scheme, e.g.,
+# TORCH_SCHED_SKIP_GRAPH_IDS=1,2,3-5,7-10
+skip_post_grad_graph_ids: set[int] = _get_skip_post_grad_graph_ids()
+
+# Reduce the number of allocated CUDA Events in the generated program by:
+# 1. Track the reference count of each CUDA Event in the scheduling phase. Skip generating CUDA
+#    Events whose reference count is zero, i.e., that are never waited on by other streams;
+# 2. Reuse allocated CUDA Events when feasible.
+# This option is enabled by default.
+reuse_cuda_event: bool = os.getenv("TORCH_SCHED_REUSE_CUDA_EVENT", "1") == "1"
+
+
+@functools.lru_cache
+def __get_dump_code_backends_and_dir(dump_code: str | None) -> tuple[list[str], str | None]:
+    # Parse "[+<backend>,]<dir>" into (backends to dump, absolute dump directory).
+    spec = r"(?:\+(?P<backend>\w+),)?(?P<dir>[\w\/\.\-\s@#~]+)"
+    parsed = re.match(spec, dump_code) if dump_code else None
+    if parsed is None:
+        return ["torchsched"], None
+    names = ["torchsched", *filter(None, [parsed.group("backend")])]
+    return names, os.path.abspath(parsed.group("dir"))
+
+
+# Specify dump code backend types and output directory by::
+#
+#   TORCH_SCHED_DUMP_CODE='+inductor,/dir/to/save/code'
+#
+# Where `+inductor` enables dumping both Inductor and torchsched code. If omitted, only
+# torchsched code is dumped. `/dir/to/save/code` specifies the directory to dump code to.
+(
+    dump_code_backends,
+    dump_code_dir,
+) = __get_dump_code_backends_and_dir(os.getenv("TORCH_SCHED_DUMP_CODE"))
+
 from torch.utils._config_module import install_config_module  # noqa: E402
 
 # adds patch, save_config, etc
 
@@ -26,7 +26,7 @@
 STREAM_NAME_TEMPLATE: str = "stream{stream_idx:d}"
 
 
-@functools.lru_cache()
+@functools.lru_cache
 def get_stream_name(stream_idx: int) -> str:
     """Generate CUDA Stream name from stream index number.
 
 
@@ -18,6 +18,8 @@
 from torch._inductor.codegen.wrapper import IndentedBuffer
 from torch._inductor.codegen.wrapper import WrapperLine
 
+import apex.contrib.torchsched.config as torchsched_config
+from apex.contrib.torchsched.inductor._utils import DEFAULT_STREAM_IDX
 from apex.contrib.torchsched.inductor._utils import ENTRANCE_EVENT
 from apex.contrib.torchsched.inductor._utils import EVENT_NAME_TEMPLATE
 from apex.contrib.torchsched.inductor._utils import get_stream_name
@@ -31,6 +33,7 @@ class CudaEventSym:
     Args:
         factory: The CUDAEventFactory that generate this event.
         idx: Indexing number assigned in chronological order during scheduling.
+        originate_stream_idx: The index of the CUDA stream that this event originated from.
         ref_count: Reference count of this event instance.
         materialized_event: The actual CUDA Event name that will be used in the final PyTorch
             program. Only symbolic event with reference count larger than one will be materialized.
@@ -42,22 +45,30 @@ class CudaEventSym:
 
     factory: CudaEventFactory
     idx: int
+    originate_stream_idx: int
     ref_count: int = 0
     materialized_event: str | None = None
 
     def __lt__(self, rhs: CudaEventSym) -> bool:
         """Whether the current event is generated before the rhs event."""
-        return self.idx < rhs.idx and self.factory is rhs.factory
+        if self.factory is not rhs.factory:
+            return NotImplemented
+        return (self.idx, self.originate_stream_idx) < (rhs.idx, rhs.originate_stream_idx)
 
     def __eq__(self, rhs: object) -> bool:
         """Whether the current event is identical to the rhs event."""
         if not isinstance(rhs, CudaEventSym):
             return NotImplemented
-        return self.idx == rhs.idx and self.factory is rhs.factory
+        return (
+            self.idx == rhs.idx
+            and self.originate_stream_idx == rhs.originate_stream_idx
+            and self.factory is rhs.factory
+        )
 
     def __str__(self) -> str:
         """Represent this symbolic event in string."""
         ret = f"{self.__class__.__name__} (idx={self.idx}"
+        ret += f", originate_stream_idx={self.originate_stream_idx}"
         if self.ref_count:
             ret += f", ref_count={self.ref_count}"
         if self.materialized_event:
@@ -67,7 +78,7 @@ def __str__(self) -> str:
 
     def __hash__(self) -> int:
         """Hash this symbolic event."""
-        return hash(f"{id(self.factory)=},{self.idx=}")
+        return hash((id(self.factory), self.idx, self.originate_stream_idx))
 
     def record(self, stream_idx: int) -> _CudaEventRecordLine:
         """Record this event on a given stream.
@@ -103,6 +114,7 @@ def wait(self, stream_idx: int) -> _CudaEventWaitLine:
             the reference count of this event. If an event object has called this method, it is
             guaranteed to be generated in the final program.
         """
+        assert stream_idx != self.originate_stream_idx
         self.ref_count += 1
         stream = get_stream_name(stream_idx)
         return _CudaEventWaitLine(self, stream)
@@ -113,11 +125,12 @@ class _CudaEventRecordLine(WrapperLine):
 
     event: CudaEventSym
     stream: str
+    _reuse_cuda_event: bool = torchsched_config.reuse_cuda_event
 
     def codegen(self, code: IndentedBuffer) -> None:
         assert 0 <= self.event.ref_count
         assert self.event.materialized_event is None
-        if self.event.ref_count:
+        if self.event.ref_count or not self._reuse_cuda_event:
             self.event.materialized_event = self.event.factory.get_materialized_event(code)
             code.writeline(f"{self.event.materialized_event}.record({self.stream})")
 
@@ -131,12 +144,13 @@ class _CudaEventWaitLine(WrapperLine):
     def codegen(self, code: IndentedBuffer) -> None:
         assert 0 < self.event.ref_count
         assert self.event.materialized_event is not None
-        code.writeline(f"{self.event.materialized_event}.wait({self.stream})")
+        code_line = f"{self.event.materialized_event}.wait({self.stream})"
         self.event.ref_count -= 1
         if self.event.ref_count == 0:
             self.event.factory.deposit_materialized_event(self.event.materialized_event)
             self.event.materialized_event = None
-            code.writeline(f"# End lifecycle of {self.event}")
+            code_line += f"  # End lifecycle of {self.event}"
+        code.writeline(code_line)
 
 
 class CudaEventFactory:
@@ -153,23 +167,32 @@ def __init__(self) -> None:
         self.materialized_event_idx: itertools.count = itertools.count(start=1)
         self.available_materialized_events: set[str] = set()
         self._entrance_event: CudaEventSym | None = None
+        self._reuse_cuda_event: bool = torchsched_config.reuse_cuda_event
 
     def get_entrance_event(self) -> CudaEventSym:
         """Return the cuda event that corresponding to compute graph entering."""
         if self._entrance_event is None:
-            self._entrance_event = CudaEventSym(factory=self, idx=0)
+            self._entrance_event = CudaEventSym(
+                factory=self,
+                idx=0,
+                originate_stream_idx=DEFAULT_STREAM_IDX,
+            )
             # Code-gen for entrance event is almost hard-coded in device guard enter so the
             # materialization is slightly different here.
             self._entrance_event.materialized_event = ENTRANCE_EVENT
         return self._entrance_event
 
-    def get_sym_event(self) -> CudaEventSym:
+    def get_sym_event(self, originate_stream_idx: int) -> CudaEventSym:
         """Allocate a symbolic cuda event."""
-        return CudaEventSym(factory=self, idx=next(self.symbolic_event_idx))
+        return CudaEventSym(
+            factory=self,
+            idx=next(self.symbolic_event_idx),
+            originate_stream_idx=originate_stream_idx,
+        )
 
     def get_materialized_event(self, code: IndentedBuffer) -> str:
         """Allocate or reuse a materialized cuda event."""
-        if self.available_materialized_events:
+        if self._reuse_cuda_event and self.available_materialized_events:
             return self.available_materialized_events.pop()
         else:
             event = EVENT_NAME_TEMPLATE.format(event_idx=next(self.materialized_event_idx))