From ddebad40e874e2d294a80873e06114a7d310d98b Mon Sep 17 00:00:00 2001
From: apoorvkh
Date: Mon, 10 Mar 2025 15:59:39 -0400
Subject: [PATCH 01/11] updated links

---
 docs/source/examples/accelerate.md   | 2 +-
 docs/source/examples/deepspeed.md    | 2 +-
 docs/source/examples/lightning.md    | 2 +-
 docs/source/examples/transformers.md | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/source/examples/accelerate.md b/docs/source/examples/accelerate.md
index cb5e371..0aa88ac 100644
--- a/docs/source/examples/accelerate.md
+++ b/docs/source/examples/accelerate.md
@@ -2,7 +2,7 @@
 
 Here's an example script that uses `torchrunx` with [Accelerate](https://huggingface.co/docs/accelerate/en/index) to fine-tune any causal language model (from `transformers`) on any text dataset (from `datasets`) with any number of GPUs or nodes.
 
-[https://torchrun.xyz/accelerate_train.py](https://raw.githubusercontent.com/apoorvkh/torchrunx/refs/heads/main/docs/source/examples/scripts/accelerate_train.py)
+[https://torchrun.xyz/accelerate_train.py](https://raw.githubusercontent.com/apoorvkh/torchrunx/refs/heads/main/scripts/examples/accelerate_train.py)
 
 python accelerate_train.py --help
 
diff --git a/docs/source/examples/deepspeed.md b/docs/source/examples/deepspeed.md
index e4cea9b..55cfcd4 100644
--- a/docs/source/examples/deepspeed.md
+++ b/docs/source/examples/deepspeed.md
@@ -2,7 +2,7 @@
 
 Here's an example script that uses `torchrunx` with [DeepSpeed](https://www.deepspeed.ai) to fine-tune any causal language model (from `transformers`) on any text dataset (from `datasets`) with any number of GPUs or nodes.
 
-[https://torchrun.xyz/deepspeed_train.py](https://raw.githubusercontent.com/apoorvkh/torchrunx/refs/heads/main/docs/source/examples/scripts/deepspeed_train.py)
+[https://torchrun.xyz/deepspeed_train.py](https://raw.githubusercontent.com/apoorvkh/torchrunx/refs/heads/main/scripts/examples/deepspeed_train.py)
 
 python deepspeed_train.py --help
 
diff --git a/docs/source/examples/lightning.md b/docs/source/examples/lightning.md
index 7814de2..c5eae53 100644
--- a/docs/source/examples/lightning.md
+++ b/docs/source/examples/lightning.md
@@ -2,7 +2,7 @@
 
 Here's an example script that uses `torchrunx` with [PyTorch Lightning](https://lightning.ai/docs/pytorch/stable/) to fine-tune any causal language model (from `transformers`) on any text dataset (from `datasets`) with any number of GPUs or nodes.
 
-[https://torchrun.xyz/lightning_train.py](https://raw.githubusercontent.com/apoorvkh/torchrunx/refs/heads/main/docs/source/examples/scripts/lightning_train.py)
+[https://torchrun.xyz/lightning_train.py](https://raw.githubusercontent.com/apoorvkh/torchrunx/refs/heads/main/scripts/examples/lightning_train.py)
 
 python lightning_train.py --help
 
diff --git a/docs/source/examples/transformers.md b/docs/source/examples/transformers.md
index 097483d..28c1945 100644
--- a/docs/source/examples/transformers.md
+++ b/docs/source/examples/transformers.md
@@ -2,7 +2,7 @@
 
 Here's an example script that uses `torchrunx` with [`transformers.Trainer`](https://huggingface.co/docs/transformers/en/main_classes/trainer) to fine-tune any causal language model (from `transformers`) on any text dataset (from `datasets`) with any number of GPUs or nodes.
 
-[https://torchrun.xyz/transformers_train.py](https://raw.githubusercontent.com/apoorvkh/torchrunx/refs/heads/main/docs/source/examples/scripts/transformers_train.py)
+[https://torchrun.xyz/transformers_train.py](https://raw.githubusercontent.com/apoorvkh/torchrunx/refs/heads/main/scripts/examples/transformers_train.py)
 
 python transformers_train.py --help
 
From 820ad6365db9df0aed3de9d51e645895db58dabb Mon Sep 17 00:00:00 2001
From: apoorvkh
Date: Mon, 10 Mar 2025 16:07:50 -0400
Subject: [PATCH 02/11] split release workflow

---
 .github/workflows/publish_docs.yml | 28 ++++++++++++++++++++++++++++
 .github/workflows/release.yml      | 21 ---------------------
 2 files changed, 28 insertions(+), 21 deletions(-)
 create mode 100644 .github/workflows/publish_docs.yml

diff --git a/.github/workflows/publish_docs.yml b/.github/workflows/publish_docs.yml
new file mode 100644
index 0000000..67ac957
--- /dev/null
+++ b/.github/workflows/publish_docs.yml
@@ -0,0 +1,28 @@
+name: release
+
+on:
+  release:
+    types: [published]
+
+jobs:
+
+  publish-docs:
+    runs-on: ubuntu-latest
+    permissions:
+      pages: write
+      id-token: write
+    environment:
+      name: github-pages
+      url: ${{ steps.deployment.outputs.page_url }}
+    steps:
+      - uses: actions/checkout@v4
+      - uses: astral-sh/setup-uv@v5
+        with:
+          version: "0.5.29"
+      - run: source ./scripts/build_docs.sh
+      - uses: actions/configure-pages@v5
+      - uses: actions/upload-pages-artifact@v3
+        with:
+          path: docs/_build/html
+      - id: deployment
+        uses: actions/deploy-pages@v4
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 82f6637..b26287e 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -16,24 +16,3 @@ jobs:
           version: "0.5.29"
       - run: uv build
       - run: uv publish
-
-  publish-docs:
-    runs-on: ubuntu-latest
-    permissions:
-      pages: write
-      id-token: write
-    environment:
-      name: github-pages
-      url: ${{ steps.deployment.outputs.page_url }}
-    steps:
-      - uses: actions/checkout@v4
-      - uses: astral-sh/setup-uv@v5
-        with:
-          version: "0.5.29"
-      - run: source ./scripts/build_docs.sh
-      - uses: actions/configure-pages@v5
-      - uses: actions/upload-pages-artifact@v3
-        with:
-          path: docs/_build/html
-      - id: deployment
-        uses: actions/deploy-pages@v4

From aca125937bf61dad0dd732669cbe3b1a414b12ea Mon Sep 17 00:00:00 2001
From: apoorvkh
Date: Mon, 10 Mar 2025 16:08:56 -0400
Subject: [PATCH 03/11] update workflows

---
 .github/workflows/main.yml                             | 2 +-
 .github/workflows/publish_docs.yml                     | 2 +-
 .github/workflows/{release.yml => release_to_pypi.yml} | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)
 rename .github/workflows/{release.yml => release_to_pypi.yml} (93%)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 46e6727..4fb52e7 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -1,4 +1,4 @@
-name: Run checks on push or PR to main
+name: main
 
 on:
   push:
diff --git a/.github/workflows/publish_docs.yml b/.github/workflows/publish_docs.yml
index 67ac957..6a2a443 100644
--- a/.github/workflows/publish_docs.yml
+++ b/.github/workflows/publish_docs.yml
@@ -1,4 +1,4 @@
-name: release
+name: publish_docs
 
 on:
   release:
diff --git a/.github/workflows/release.yml b/.github/workflows/release_to_pypi.yml
similarity index 93%
rename from .github/workflows/release.yml
rename to .github/workflows/release_to_pypi.yml
index b26287e..04d9d2b 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release_to_pypi.yml
@@ -1,4 +1,4 @@
-name: release
+name: release_to_pypi
 
 on:
   release:

From 67d2d2ac3eb32b15d0aa1fdedd2aea78e014dc78 Mon Sep 17 00:00:00 2001
From: apoorvkh
Date: Mon, 10 Mar 2025 16:11:06 -0400
Subject: [PATCH 04/11] added workflow_dispatch to publish_docs

---
 .github/workflows/publish_docs.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/publish_docs.yml b/.github/workflows/publish_docs.yml
index 6a2a443..e703b36 100644
--- a/.github/workflows/publish_docs.yml
+++ b/.github/workflows/publish_docs.yml
@@ -3,6 +3,7 @@ name: publish_docs
 on:
   release:
     types: [published]
+  workflow_dispatch:
 
 jobs:
 
From 5ecc1787f11122478f6ba1321d2670c4d46f49ea Mon Sep 17 00:00:00 2001
From: Apoorv Khandelwal
Date: Sat, 15 Mar 2025 11:43:39 -0400
Subject: [PATCH 05/11] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index d86e0e5..09a10f9 100644
--- a/README.md
+++ b/README.md
@@ -136,6 +136,6 @@ torch.save(trained_model.state_dict(), "output/model.pth")
 > - [Automatic detection of SLURM environments.](https://torchrun.xyz/usage/slurm.html)
 > - Start multi-node training from Python notebooks!
 > - Our library is fully typed!
-> - Custom, fine-grained handling of logging, environment variables, and exception propagation. We have nice defaults too: no more interleaved logs and irrelevant exceptions!
+> - Custom, fine-grained handling of [logging](https://torchrun.xyz/usage/logging.html), [environment variables](https://torchrun.xyz/usage/general.html#environment-variables), and [exception propagation](https://torchrun.xyz/usage/general.html#exceptions). We have nice defaults too: no more interleaved logs and irrelevant exceptions!
 
 **On our [roadmap](https://github.com/apoorvkh/torchrunx/issues?q=is%3Aopen+is%3Aissue+label%3Aenhancement): higher-order parallelism, support for debuggers, and more!**

From a13ffbd8db47e9ec156389cfae050f903850cc6e Mon Sep 17 00:00:00 2001
From: apoorvkh
Date: Sat, 15 Mar 2025 23:27:15 -0400
Subject: [PATCH 06/11] fix examples in docs

---
 scripts/examples/accelerate_train.py   | 2 +-
 scripts/examples/deepspeed_train.py    | 2 +-
 scripts/examples/lightning_train.py    | 2 +-
 scripts/examples/transformers_train.py | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/scripts/examples/accelerate_train.py b/scripts/examples/accelerate_train.py
index a085168..5db3a45 100644
--- a/scripts/examples/accelerate_train.py
+++ b/scripts/examples/accelerate_train.py
@@ -117,7 +117,7 @@ def main(
     train_dataset = load_training_data(tokenizer_name=model_config.name, dataset_config=dataset_config)
 
     # Launch training
-    results = launcher.run(train, (model, train_dataset, batch_size, output_dir))
+    results = launcher.run(train, model, train_dataset, batch_size, output_dir)
 
     # Loading trained model from checkpoint
     checkpoint_path = results.rank(0)
diff --git a/scripts/examples/deepspeed_train.py b/scripts/examples/deepspeed_train.py
index 273961f..6967bb2 100644
--- a/scripts/examples/deepspeed_train.py
+++ b/scripts/examples/deepspeed_train.py
@@ -111,7 +111,7 @@ def main(
     train_dataset = load_training_data(tokenizer_name=model_name, dataset_config=dataset_config)
 
     # Launch training
-    launcher.run(train, (model, train_dataset, str(deepspeed_config), str(checkpoint_dir)))
+    launcher.run(train, model, train_dataset, str(deepspeed_config), str(checkpoint_dir))
 
     # Loading trained model from checkpoint
     state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir)
diff --git a/scripts/examples/lightning_train.py b/scripts/examples/lightning_train.py
index 1684eb6..fc89336 100644
--- a/scripts/examples/lightning_train.py
+++ b/scripts/examples/lightning_train.py
@@ -126,7 +126,7 @@ def main(
     )
 
     # Launch training
-    results = launcher.run(train, (model, train_dataset))
+    results = launcher.run(train, model, train_dataset)
 
     # Loading trained model from checkpoint
     checkpoint_path = results.rank(0)
diff --git a/scripts/examples/transformers_train.py b/scripts/examples/transformers_train.py
index f93eca6..cd61a1e 100644
--- a/scripts/examples/transformers_train.py
+++ b/scripts/examples/transformers_train.py
@@ -102,7 +102,7 @@ def main(
     )
 
     # Launch training
-    results = launcher.run(train, (model, train_dataset, training_args))
+    results = launcher.run(train, model, train_dataset, training_args)
 
     # Loading trained model from checkpoint
     checkpoint_path = results.rank(0)
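Note on PATCH 06: the fix is to the calling convention of `Launcher.run`, which forwards the training function's arguments positionally rather than accepting them bundled into a single tuple. A minimal sketch of the convention, with an illustrative function; the `Launcher` keyword arguments shown are assumed example values, not taken from these patches:

```python
import torchrunx

def greet(name: str) -> str:
    # Runs on every worker process; return values are collected per rank.
    return f"hello, {name}"

# hostnames/workers_per_host are assumed values, for illustration only.
launcher = torchrunx.Launcher(hostnames=["localhost"], workers_per_host=2)

# Before this patch, the examples passed a single tuple: launcher.run(greet, ("world",))
# After the fix, arguments are passed positionally:
results = launcher.run(greet, "world")
print(results.rank(0))  # return value from the rank-0 worker, as in the example scripts
```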
From 7882427e981950bf2996be408fc00e05acaac733 Mon Sep 17 00:00:00 2001
From: apoorvkh
Date: Sat, 15 Mar 2025 23:42:07 -0400
Subject: [PATCH 07/11] added logging to examples

---
 scripts/examples/accelerate_train.py   | 11 +++++++++--
 scripts/examples/deepspeed_train.py    |  3 +++
 scripts/examples/lightning_train.py    |  4 +++-
 scripts/examples/transformers_train.py |  2 ++
 4 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/scripts/examples/accelerate_train.py b/scripts/examples/accelerate_train.py
index 5db3a45..2a0eba7 100644
--- a/scripts/examples/accelerate_train.py
+++ b/scripts/examples/accelerate_train.py
@@ -13,6 +13,7 @@ from __future__ import annotations
 
 import functools
+import logging
 import os
 from dataclasses import dataclass
 from pathlib import Path
 
@@ -27,6 +28,8 @@
 
 import torchrunx
 
+logging.basicConfig(level=logging.INFO)
+
 
 @dataclass
 class ModelConfig:
@@ -114,14 +117,18 @@ def main(
     output_dir: Path,
 ):
     model = AutoModelForCausalLM.from_pretrained(model_config.name)
-    train_dataset = load_training_data(tokenizer_name=model_config.name, dataset_config=dataset_config)
+    train_dataset = load_training_data(
+        tokenizer_name=model_config.name, dataset_config=dataset_config
+    )
 
     # Launch training
     results = launcher.run(train, model, train_dataset, batch_size, output_dir)
 
     # Loading trained model from checkpoint
     checkpoint_path = results.rank(0)
-    trained_model = AutoModelForCausalLM.from_pretrained(model_config.name, state_dict=torch.load(checkpoint_path))
+    trained_model = AutoModelForCausalLM.from_pretrained(
+        model_config.name, state_dict=torch.load(checkpoint_path)
+    )
 
 
 if __name__ == "__main__":
diff --git a/scripts/examples/deepspeed_train.py b/scripts/examples/deepspeed_train.py
index 6967bb2..5ed085f 100644
--- a/scripts/examples/deepspeed_train.py
+++ b/scripts/examples/deepspeed_train.py
@@ -15,6 +15,7 @@ from __future__ import annotations
 
 import functools
+import logging
 import os
 from dataclasses import dataclass
 from pathlib import Path
 
@@ -30,6 +31,8 @@
 
 import torchrunx
 
+logging.basicConfig(level=logging.INFO)
+
 
 @dataclass
 class DatasetConfig:
diff --git a/scripts/examples/lightning_train.py b/scripts/examples/lightning_train.py
index fc89336..1a91e52 100644
--- a/scripts/examples/lightning_train.py
+++ b/scripts/examples/lightning_train.py
@@ -14,13 +14,13 @@ from __future__ import annotations
 
 import functools
+import logging
 import os
 from dataclasses import dataclass
 from typing import Annotated
 
 import lightning as L
 import torch
-
 import tyro
 from datasets import load_dataset
 from torch.utils.data import Dataset
 
@@ -29,6 +29,8 @@
 import torchrunx
 from torchrunx.integrations.lightning import TorchrunxClusterEnvironment
 
+logging.basicConfig(level=logging.INFO)
+
 
 @dataclass
 class ModelConfig:
diff --git a/scripts/examples/transformers_train.py b/scripts/examples/transformers_train.py
index cd61a1e..498c4c1 100644
--- a/scripts/examples/transformers_train.py
+++ b/scripts/examples/transformers_train.py
@@ -13,6 +13,7 @@ from __future__ import annotations
 
 import functools
+import logging
 import os
 from dataclasses import dataclass
 from typing import Annotated
 
@@ -30,6 +31,7 @@
 
 import torchrunx
 
+logging.basicConfig(level=logging.INFO)
 
 @dataclass
 class ModelConfig:

From ac6fac9344f1414ae40a95fd9962321d4c3d00b3 Mon Sep 17 00:00:00 2001
From: apoorvkh
Date: Sun, 16 Mar 2025 15:28:27 -0400
Subject: [PATCH 08/11] initial code for capturing stdout/err from fd instead
 of sys.stdout/err

---
 src/torchrunx/utils/log_streaming.py | 49 +++++++++++++++++-----------
 1 file changed, 30 insertions(+), 19 deletions(-)

diff --git a/src/torchrunx/utils/log_streaming.py b/src/torchrunx/utils/log_streaming.py
index af5ff52..69f8d58 100644
--- a/src/torchrunx/utils/log_streaming.py
+++ b/src/torchrunx/utils/log_streaming.py
@@ -10,17 +10,17 @@
 ]
 
 import logging
+import os
 import pickle
 import signal
 import struct
 import sys
-from contextlib import redirect_stderr, redirect_stdout
 from dataclasses import dataclass
-from io import StringIO
 from logging import Handler, Logger
 from logging.handlers import SocketHandler
 from multiprocessing.synchronize import Event as EventClass
 from socketserver import StreamRequestHandler, ThreadingTCPServer
+from threading import Thread
 from typing import Callable
 
 import cloudpickle
@@ -129,24 +129,35 @@ def start_logging_server(serialized_args: bytes, stop_event: EventClass) -> None
 
 def redirect_stdio_to_logger(logger: Logger) -> None:
     """Redirect stderr/stdout: send output to logger at every flush."""
-
-    class _LoggingStream(StringIO):
-        def __init__(self, logger: Logger, level: int = logging.NOTSET) -> None:
-            super().__init__()
-            self.logger = logger
-            self.level = level
-
-        def flush(self) -> None:
-            super().flush()  # At "flush" to avoid logs of partial bytes
-            value = self.getvalue()
-            if value != "":
-                self.logger.log(self.level, value)
-                self.truncate(0)
-                self.seek(0)
-
     logging.captureWarnings(capture=True)
-    redirect_stderr(_LoggingStream(logger, level=logging.ERROR)).__enter__()
-    redirect_stdout(_LoggingStream(logger, level=logging.INFO)).__enter__()
+
+    def redirect_fd_to_logger(read_fd: int, level: int) -> None:
+        for line in os.fdopen(read_fd):
+            logger.log(level, line.rstrip())
+
+    # create (r, w) pipe and start logging all outputs from r
+    read_out_fd, write_out_fd = os.pipe()
+    Thread(
+        target=redirect_fd_to_logger,
+        kwargs={"read_fd": read_out_fd, "level": logging.INFO},
+        daemon=True,
+    ).start()
+    # flush buffer before redirecting stdout
+    sys.stdout.flush()
+    # pipe: r <-> stdout instead of r <-> w
+    os.dup2(write_out_fd, sys.stdout.fileno())  # set stdout fd to pipe
+    os.close(write_out_fd)
+
+    # repeat for stderr
+    read_err_fd, write_err_fd = os.pipe()
+    Thread(
+        target=redirect_fd_to_logger,
+        kwargs={"read_fd": read_err_fd, "level": logging.ERROR},
+        daemon=True,
+    ).start()
+    sys.stderr.flush()
+    os.dup2(write_err_fd, sys.stderr.fileno())
+    os.close(write_err_fd)
 
 
 @dataclass
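Context on PATCH 08: `contextlib.redirect_stdout`/`redirect_stderr` only intercept writes that go through the Python-level `sys.stdout`/`sys.stderr` objects. The new code redirects at the file-descriptor level instead: each standard stream's descriptor is `dup2`'d onto the write end of a pipe, and a daemon thread forwards each line read from the pipe to the logger. This also captures output written directly to fds 1 and 2 by C extensions and subprocesses. Below is a self-contained sketch of the same pattern, independent of `torchrunx` (all names are illustrative):

```python
import logging
import os
import sys
import time
from threading import Thread

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("capture")

# Pipe: anything written to write_fd can be read back from read_fd.
read_fd, write_fd = os.pipe()

def pump(fd: int) -> None:
    # os.fdopen wraps the descriptor in a file object; iterating yields
    # lines until every write end of the pipe has been closed.
    for line in os.fdopen(fd):
        logger.info(line.rstrip())

Thread(target=pump, args=(read_fd,), daemon=True).start()

sys.stdout.flush()                      # push out already-buffered output first
os.dup2(write_fd, sys.stdout.fileno())  # fd 1 now points at the pipe
os.close(write_fd)                      # fd 1 keeps the write end alive

print("hello from fd 1")                # reaches the logger, not the terminal
sys.stdout.flush()
time.sleep(0.1)                         # let the reader thread drain the pipe
```

Note the flush before `dup2`: anything still sitting in the stream's userspace buffer when the descriptor is swapped would otherwise be lost or misrouted.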
From 10e5d33794c9152ba7315792e3d70574957c0282 Mon Sep 17 00:00:00 2001
From: apoorvkh
Date: Sun, 16 Mar 2025 15:29:40 -0400
Subject: [PATCH 09/11] bump version to 0.3.1

---
 pyproject.toml | 2 +-
 uv.lock        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index dfffaf3..4210d4e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "torchrunx"
-version = "0.3.0"
+version = "0.3.1"
 authors = [
     { name = "Apoorv Khandelwal", email = "mail@apoorvkh.com" },
     { name = "Peter Curtin", email = "peter_curtin@brown.edu" },
diff --git a/uv.lock b/uv.lock
index 13fe003..c10535d 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1803,7 +1803,7 @@ wheels = [
 
 [[package]]
 name = "torchrunx"
-version = "0.3.0"
+version = "0.3.1"
 source = { editable = "." }
 dependencies = [
     { name = "cloudpickle" },
From b0e1657ecaac645fee28661690f28469cba1be5c Mon Sep 17 00:00:00 2001
From: apoorvkh
Date: Mon, 17 Mar 2025 01:52:42 -0400
Subject: [PATCH 10/11] added logging info to readme example

---
 README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/README.md b/README.md
index 09a10f9..969d2a5 100644
--- a/README.md
+++ b/README.md
@@ -69,6 +69,9 @@ def distributed_training(model: nn.Module, num_steps: int = 10) -> nn.Module | N
 We can distribute and run this function (e.g. on 2 machines x 2 GPUs) using **`torchrunx`**!
 
 ```python
+import logging
+logging.basicConfig(level=logging.INFO)
+
 import torchrunx
 
 launcher = torchrunx.Launcher(

From 62bd030833c909643f5e9f3b83d4480dd8cbb059 Mon Sep 17 00:00:00 2001
From: apoorvkh
Date: Mon, 17 Mar 2025 01:55:08 -0400
Subject: [PATCH 11/11] edit logging docs

---
 docs/source/usage/logging.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/usage/logging.md b/docs/source/usage/logging.md
index 0e76493..2352db1 100644
--- a/docs/source/usage/logging.md
+++ b/docs/source/usage/logging.md
@@ -1,6 +1,6 @@
 # Custom Logging
 
-We forward all agent and worker logs (i.e. from {mod}`logging`, {obj}`sys.stdout`, and {obj}`sys.stderr`) to the launcher process.
+We forward all agent and worker logs (i.e. from {mod}`logging`, `stdout`, and `stderr`) to the launcher process.
 
 ## Defaults
 
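PATCH 07 and PATCH 10 make the same point as the logging docs edited in PATCH 11: forwarded agent/worker output reaches the launcher process as {mod}`logging` records, so the launcher should configure a handler (at `INFO` level or lower) before launching; otherwise Python's last-resort handler only displays warnings and above. A minimal end-to-end sketch (the `Launcher` arguments are again assumed example values, and `hello_world` is illustrative):

```python
import logging
logging.basicConfig(level=logging.INFO)  # configure the root logger *before* launching

import torchrunx

def hello_world() -> None:
    # stdout/stderr and logging records from each worker are forwarded to
    # the launcher and emitted through the handler configured above.
    print("hello from a worker")

launcher = torchrunx.Launcher(hostnames=["localhost"], workers_per_host=2)
launcher.run(hello_world)
```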