diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 46e6727..4fb52e7 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -1,4 +1,4 @@
-name: Run checks on push or PR to main
+name: main
 
 on:
   push:
diff --git a/.github/workflows/release.yml b/.github/workflows/publish_docs.yml
similarity index 69%
rename from .github/workflows/release.yml
rename to .github/workflows/publish_docs.yml
index 82f6637..e703b36 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/publish_docs.yml
@@ -1,21 +1,11 @@
-name: release
+name: publish_docs
 
 on:
   release:
     types: [published]
+  workflow_dispatch:
 
 jobs:
-  release-to-pypi:
-    runs-on: ubuntu-latest
-    permissions:
-      id-token: write
-    steps:
-      - uses: actions/checkout@v4
-      - uses: astral-sh/setup-uv@v5
-        with:
-          version: "0.5.29"
-      - run: uv build
-      - run: uv publish
 
   publish-docs:
     runs-on: ubuntu-latest
diff --git a/.github/workflows/release_to_pypi.yml b/.github/workflows/release_to_pypi.yml
new file mode 100644
index 0000000..04d9d2b
--- /dev/null
+++ b/.github/workflows/release_to_pypi.yml
@@ -0,0 +1,18 @@
+name: release_to_pypi
+
+on:
+  release:
+    types: [published]
+
+jobs:
+  release-to-pypi:
+    runs-on: ubuntu-latest
+    permissions:
+      id-token: write
+    steps:
+      - uses: actions/checkout@v4
+      - uses: astral-sh/setup-uv@v5
+        with:
+          version: "0.5.29"
+      - run: uv build
+      - run: uv publish
diff --git a/README.md b/README.md
index d86e0e5..969d2a5 100644
--- a/README.md
+++ b/README.md
@@ -69,6 +69,9 @@ def distributed_training(model: nn.Module, num_steps: int = 10) -> nn.Module | N
 We can distribute and run this function (e.g. on 2 machines x 2 GPUs) using **`torchrunx`**!
 
 ```python
+import logging
+logging.basicConfig(level=logging.INFO)
+
 import torchrunx
 
 launcher = torchrunx.Launcher(
@@ -136,6 +139,6 @@ torch.save(trained_model.state_dict(), "output/model.pth")
 > - [Automatic detection of SLURM environments.](https://torchrun.xyz/usage/slurm.html)
 > - Start multi-node training from Python notebooks!
 > - Our library is fully typed!
-> - Custom, fine-grained handling of logging, environment variables, and exception propagation. We have nice defaults too: no more interleaved logs and irrelevant exceptions!
+> - Custom, fine-grained handling of [logging](https://torchrun.xyz/usage/logging.html), [environment variables](https://torchrun.xyz/usage/general.html#environment-variables), and [exception propagation](https://torchrun.xyz/usage/general.html#exceptions). We have nice defaults too: no more interleaved logs and irrelevant exceptions!
 
 **On our [roadmap](https://github.com/apoorvkh/torchrunx/issues?q=is%3Aopen+is%3Aissue+label%3Aenhancement): higher-order parallelism, support for debuggers, and more!**
diff --git a/docs/source/examples/accelerate.md b/docs/source/examples/accelerate.md
index cb5e371..0aa88ac 100644
--- a/docs/source/examples/accelerate.md
+++ b/docs/source/examples/accelerate.md
@@ -2,7 +2,7 @@
 
 Here's an example script that uses `torchrunx` with [Accelerate](https://huggingface.co/docs/accelerate/en/index) to fine-tune any causal language model (from `transformers`) on any text dataset (from `datasets`) with any number of GPUs or nodes.
 
-[https://torchrun.xyz/accelerate_train.py](https://raw.githubusercontent.com/apoorvkh/torchrunx/refs/heads/main/docs/source/examples/scripts/accelerate_train.py)
+[https://torchrun.xyz/accelerate_train.py](https://raw.githubusercontent.com/apoorvkh/torchrunx/refs/heads/main/scripts/examples/accelerate_train.py)

python accelerate_train.py --help

diff --git a/docs/source/examples/deepspeed.md b/docs/source/examples/deepspeed.md
index e4cea9b..55cfcd4 100644
--- a/docs/source/examples/deepspeed.md
+++ b/docs/source/examples/deepspeed.md
@@ -2,7 +2,7 @@
 
 Here's an example script that uses `torchrunx` with [DeepSpeed](https://www.deepspeed.ai) to fine-tune any causal language model (from `transformers`) on any text dataset (from `datasets`) with any number of GPUs or nodes.
 
-[https://torchrun.xyz/deepspeed_train.py](https://raw.githubusercontent.com/apoorvkh/torchrunx/refs/heads/main/docs/source/examples/scripts/deepspeed_train.py)
+[https://torchrun.xyz/deepspeed_train.py](https://raw.githubusercontent.com/apoorvkh/torchrunx/refs/heads/main/scripts/examples/deepspeed_train.py)

python deepspeed_train.py --help

diff --git a/docs/source/examples/lightning.md b/docs/source/examples/lightning.md
index 7814de2..c5eae53 100644
--- a/docs/source/examples/lightning.md
+++ b/docs/source/examples/lightning.md
@@ -2,7 +2,7 @@
 
 Here's an example script that uses `torchrunx` with [PyTorch Lightning](https://lightning.ai/docs/pytorch/stable/) to fine-tune any causal language model (from `transformers`) on any text dataset (from `datasets`) with any number of GPUs or nodes.
 
-[https://torchrun.xyz/lightning_train.py](https://raw.githubusercontent.com/apoorvkh/torchrunx/refs/heads/main/docs/source/examples/scripts/lightning_train.py)
+[https://torchrun.xyz/lightning_train.py](https://raw.githubusercontent.com/apoorvkh/torchrunx/refs/heads/main/scripts/examples/lightning_train.py)

python lightning_train.py --help

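Note: the `lightning_train.py` script linked above integrates `torchrunx` with Lightning via `torchrunx.integrations.lightning.TorchrunxClusterEnvironment` (see its diff further below). As a rough sketch of how a cluster environment is typically attached to a Lightning `Trainer` — the exact wiring in the script may differ, and the constructor/`Trainer` arguments here are assumptions:

```python
import lightning as L

from torchrunx.integrations.lightning import TorchrunxClusterEnvironment

# Cluster environments are passed to the Trainer through `plugins`;
# assuming TorchrunxClusterEnvironment takes no constructor arguments.
trainer = L.Trainer(
    accelerator="gpu",
    devices="auto",
    strategy="ddp",
    plugins=[TorchrunxClusterEnvironment()],
    max_epochs=1,
)
```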
diff --git a/docs/source/examples/transformers.md b/docs/source/examples/transformers.md
index 097483d..28c1945 100644
--- a/docs/source/examples/transformers.md
+++ b/docs/source/examples/transformers.md
@@ -2,7 +2,7 @@
 
 Here's an example script that uses `torchrunx` with [`transformers.Trainer`](https://huggingface.co/docs/transformers/en/main_classes/trainer) to fine-tune any causal language model (from `transformers`) on any text dataset (from `datasets`) with any number of GPUs or nodes.
 
-[https://torchrun.xyz/transformers_train.py](https://raw.githubusercontent.com/apoorvkh/torchrunx/refs/heads/main/docs/source/examples/scripts/transformers_train.py)
+[https://torchrun.xyz/transformers_train.py](https://raw.githubusercontent.com/apoorvkh/torchrunx/refs/heads/main/scripts/examples/transformers_train.py)

python transformers_train.py --help

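Note: in the script diffs below, arguments are passed to `launcher.run` positionally (rather than bundled into a single tuple), and each script calls `logging.basicConfig` so that forwarded worker logs are printed by the launcher process. A minimal sketch of that calling convention — the hostnames, worker count, and toy function are illustrative assumptions, not taken from the scripts:

```python
import logging

import torchrunx

# show INFO-level records (including forwarded worker logs) on the launcher
logging.basicConfig(level=logging.INFO)


def greet(name: str, count: int) -> str:
    return f"hello {name} x{count}"


launcher = torchrunx.Launcher(hostnames=["localhost"], workers_per_host=2)

# function arguments are passed directly after the function, not as a tuple
results = launcher.run(greet, "world", 3)
print(results.rank(0))  # return value from the rank-0 worker
```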
diff --git a/docs/source/usage/logging.md b/docs/source/usage/logging.md
index 0e76493..2352db1 100644
--- a/docs/source/usage/logging.md
+++ b/docs/source/usage/logging.md
@@ -1,6 +1,6 @@
 # Custom Logging
 
-We forward all agent and worker logs (i.e. from {mod}`logging`, {obj}`sys.stdout`, and {obj}`sys.stderr`) to the launcher process.
+We forward all agent and worker logs (i.e. from {mod}`logging`, `stdout`, and `stderr`) to the launcher process.
 
 ## Defaults
diff --git a/pyproject.toml b/pyproject.toml
index dfffaf3..4210d4e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "torchrunx"
-version = "0.3.0"
+version = "0.3.1"
 authors = [
   { name = "Apoorv Khandelwal", email = "mail@apoorvkh.com" },
   { name = "Peter Curtin", email = "peter_curtin@brown.edu" },
diff --git a/scripts/examples/accelerate_train.py b/scripts/examples/accelerate_train.py
index a085168..2a0eba7 100644
--- a/scripts/examples/accelerate_train.py
+++ b/scripts/examples/accelerate_train.py
@@ -13,6 +13,7 @@
 from __future__ import annotations
 
 import functools
+import logging
 import os
 from dataclasses import dataclass
 from pathlib import Path
@@ -27,6 +28,8 @@
 
 import torchrunx
 
+logging.basicConfig(level=logging.INFO)
+
 
 @dataclass
 class ModelConfig:
@@ -114,14 +117,18 @@ def main(
     output_dir: Path,
 ):
     model = AutoModelForCausalLM.from_pretrained(model_config.name)
-    train_dataset = load_training_data(tokenizer_name=model_config.name, dataset_config=dataset_config)
+    train_dataset = load_training_data(
+        tokenizer_name=model_config.name, dataset_config=dataset_config
+    )
 
     # Launch training
-    results = launcher.run(train, (model, train_dataset, batch_size, output_dir))
+    results = launcher.run(train, model, train_dataset, batch_size, output_dir)
 
     # Loading trained model from checkpoint
     checkpoint_path = results.rank(0)
-    trained_model = AutoModelForCausalLM.from_pretrained(model_config.name, state_dict=torch.load(checkpoint_path))
+    trained_model = AutoModelForCausalLM.from_pretrained(
+        model_config.name, state_dict=torch.load(checkpoint_path)
+    )
 
 
 if __name__ == "__main__":
diff --git a/scripts/examples/deepspeed_train.py b/scripts/examples/deepspeed_train.py
index 273961f..5ed085f 100644
--- a/scripts/examples/deepspeed_train.py
+++ b/scripts/examples/deepspeed_train.py
@@ -15,6 +15,7 @@
 from __future__ import annotations
 
 import functools
+import logging
 import os
 from dataclasses import dataclass
 from pathlib import Path
@@ -30,6 +31,8 @@
 
 import torchrunx
 
+logging.basicConfig(level=logging.INFO)
+
 
 @dataclass
 class DatasetConfig:
@@ -111,7 +114,7 @@ def main(
     train_dataset = load_training_data(tokenizer_name=model_name, dataset_config=dataset_config)
 
     # Launch training
-    launcher.run(train, (model, train_dataset, str(deepspeed_config), str(checkpoint_dir)))
+    launcher.run(train, model, train_dataset, str(deepspeed_config), str(checkpoint_dir))
 
     # Loading trained model from checkpoint
     state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir)
diff --git a/scripts/examples/lightning_train.py b/scripts/examples/lightning_train.py
index 1684eb6..1a91e52 100644
--- a/scripts/examples/lightning_train.py
+++ b/scripts/examples/lightning_train.py
@@ -14,13 +14,13 @@
 from __future__ import annotations
 
 import functools
+import logging
 import os
 from dataclasses import dataclass
 from typing import Annotated
 
 import lightning as L
 import torch
-
 import tyro
 from datasets import load_dataset
 from torch.utils.data import Dataset
@@ -29,6 +29,8 @@
 import torchrunx
 from torchrunx.integrations.lightning import TorchrunxClusterEnvironment
 
+logging.basicConfig(level=logging.INFO)
+
 
 @dataclass
 class ModelConfig:
@@ -126,7 +128,7 @@ def main(
     )
 
     # Launch training
-    results = launcher.run(train, (model, train_dataset))
+    results = launcher.run(train, model, train_dataset)
 
     # Loading trained model from checkpoint
     checkpoint_path = results.rank(0)
diff --git a/scripts/examples/transformers_train.py b/scripts/examples/transformers_train.py
index f93eca6..498c4c1 100644
--- a/scripts/examples/transformers_train.py
+++ b/scripts/examples/transformers_train.py
@@ -13,6 +13,7 @@
 from __future__ import annotations
 
 import functools
+import logging
 import os
 from dataclasses import dataclass
 from typing import Annotated
@@ -30,6 +31,7 @@
 
 import torchrunx
 
+logging.basicConfig(level=logging.INFO)
 
 @dataclass
 class ModelConfig:
@@ -102,7 +104,7 @@ def main(
     )
 
     # Launch training
-    results = launcher.run(train, (model, train_dataset, training_args))
+    results = launcher.run(train, model, train_dataset, training_args)
 
     # Loading trained model from checkpoint
     checkpoint_path = results.rank(0)
diff --git a/src/torchrunx/utils/log_streaming.py b/src/torchrunx/utils/log_streaming.py
index af5ff52..69f8d58 100644
--- a/src/torchrunx/utils/log_streaming.py
+++ b/src/torchrunx/utils/log_streaming.py
@@ -10,17 +10,17 @@
 ]
 
 import logging
+import os
 import pickle
 import signal
 import struct
 import sys
-from contextlib import redirect_stderr, redirect_stdout
 from dataclasses import dataclass
-from io import StringIO
 from logging import Handler, Logger
 from logging.handlers import SocketHandler
 from multiprocessing.synchronize import Event as EventClass
 from socketserver import StreamRequestHandler, ThreadingTCPServer
+from threading import Thread
 from typing import Callable
 
 import cloudpickle
@@ -129,24 +129,35 @@ def start_logging_server(serialized_args: bytes, stop_event: EventClass) -> None
 
 def redirect_stdio_to_logger(logger: Logger) -> None:
     """Redirect stderr/stdout: send output to logger at every flush."""
-
-    class _LoggingStream(StringIO):
-        def __init__(self, logger: Logger, level: int = logging.NOTSET) -> None:
-            super().__init__()
-            self.logger = logger
-            self.level = level
-
-        def flush(self) -> None:
-            super().flush()  # At "flush" to avoid logs of partial bytes
-            value = self.getvalue()
-            if value != "":
-                self.logger.log(self.level, value)
-                self.truncate(0)
-                self.seek(0)
-
     logging.captureWarnings(capture=True)
-    redirect_stderr(_LoggingStream(logger, level=logging.ERROR)).__enter__()
-    redirect_stdout(_LoggingStream(logger, level=logging.INFO)).__enter__()
+
+    def redirect_fd_to_logger(read_fd: int, level: int) -> None:
+        for line in os.fdopen(read_fd):
+            logger.log(level, line.rstrip())
+
+    # create (r, w) pipe and start logging all outputs from r
+    read_out_fd, write_out_fd = os.pipe()
+    Thread(
+        target=redirect_fd_to_logger,
+        kwargs={"read_fd": read_out_fd, "level": logging.INFO},
+        daemon=True,
+    ).start()
+    # flush buffer before redirecting stdout
+    sys.stdout.flush()
+    # pipe: r <-> stdout instead of r <-> w
+    os.dup2(write_out_fd, sys.stdout.fileno())  # set stdout fd to pipe
+    os.close(write_out_fd)
+
+    # repeat for stderr
+    read_err_fd, write_err_fd = os.pipe()
+    Thread(
+        target=redirect_fd_to_logger,
+        kwargs={"read_fd": read_err_fd, "level": logging.ERROR},
+        daemon=True,
+    ).start()
+    sys.stderr.flush()
+    os.dup2(write_err_fd, sys.stderr.fileno())
+    os.close(write_err_fd)
 
 
 @dataclass
diff --git a/uv.lock b/uv.lock
index 13fe003..c10535d 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1803,7 +1803,7 @@ wheels = [
 
 [[package]]
 name = "torchrunx"
-version = "0.3.0"
+version = "0.3.1"
 source = { editable = "." }
 dependencies = [
     { name = "cloudpickle" },
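
Note: the rewritten `redirect_stdio_to_logger` above replaces the `StringIO`-based `redirect_stdout`/`redirect_stderr` approach with an OS-level pipe: the stdout/stderr file descriptors are pointed at the pipe's write end via `os.dup2`, and a daemon thread reads the other end line by line and forwards each line to the logger. A self-contained sketch of that pattern (the logger name and demo output are illustrative only, not part of the library):

```python
import logging
import os
import sys
import time
from threading import Thread

logging.basicConfig(level=logging.INFO)  # the handler writes to the real stderr
logger = logging.getLogger("stdio_demo")


def forward_fd_to_logger(read_fd: int, level: int) -> None:
    # read the pipe line by line and emit each line as a log record
    for line in os.fdopen(read_fd):
        logger.log(level, line.rstrip())


read_fd, write_fd = os.pipe()
Thread(target=forward_fd_to_logger, args=(read_fd, logging.INFO), daemon=True).start()

sys.stdout.flush()                      # flush anything already buffered
os.dup2(write_fd, sys.stdout.fileno())  # the stdout fd now points at the pipe
os.close(write_fd)

print("captured via the pipe", flush=True)  # arrives as an INFO log record
time.sleep(0.1)  # give the daemon reader thread a moment before exit
```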