From ddebad40e874e2d294a80873e06114a7d310d98b Mon Sep 17 00:00:00 2001
From: apoorvkh
Date: Mon, 10 Mar 2025 15:59:39 -0400
Subject: [PATCH 01/11] updated links

---
 docs/source/examples/accelerate.md   | 2 +-
 docs/source/examples/deepspeed.md    | 2 +-
 docs/source/examples/lightning.md    | 2 +-
 docs/source/examples/transformers.md | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/source/examples/accelerate.md b/docs/source/examples/accelerate.md
index cb5e371..0aa88ac 100644
--- a/docs/source/examples/accelerate.md
+++ b/docs/source/examples/accelerate.md
@@ -2,7 +2,7 @@
 
 Here's an example script that uses `torchrunx` with [Accelerate](https://huggingface.co/docs/accelerate/en/index) to fine-tune any causal language model (from `transformers`) on any text dataset (from `datasets`) with any number of GPUs or nodes.
 
-[https://torchrun.xyz/accelerate_train.py](https://raw.githubusercontent.com/apoorvkh/torchrunx/refs/heads/main/docs/source/examples/scripts/accelerate_train.py)
+[https://torchrun.xyz/accelerate_train.py](https://raw.githubusercontent.com/apoorvkh/torchrunx/refs/heads/main/scripts/examples/accelerate_train.py)
 
 python accelerate_train.py --help
 
diff --git a/docs/source/examples/deepspeed.md b/docs/source/examples/deepspeed.md
index e4cea9b..55cfcd4 100644
--- a/docs/source/examples/deepspeed.md
+++ b/docs/source/examples/deepspeed.md
@@ -2,7 +2,7 @@
 
 Here's an example script that uses `torchrunx` with [DeepSpeed](https://www.deepspeed.ai) to fine-tune any causal language model (from `transformers`) on any text dataset (from `datasets`) with any number of GPUs or nodes.
 
-[https://torchrun.xyz/deepspeed_train.py](https://raw.githubusercontent.com/apoorvkh/torchrunx/refs/heads/main/docs/source/examples/scripts/deepspeed_train.py)
+[https://torchrun.xyz/deepspeed_train.py](https://raw.githubusercontent.com/apoorvkh/torchrunx/refs/heads/main/scripts/examples/deepspeed_train.py)
 
 python deepspeed_train.py --help
 
diff --git a/docs/source/examples/lightning.md b/docs/source/examples/lightning.md
index 7814de2..c5eae53 100644
--- a/docs/source/examples/lightning.md
+++ b/docs/source/examples/lightning.md
@@ -2,7 +2,7 @@
 
 Here's an example script that uses `torchrunx` with [PyTorch Lightning](https://lightning.ai/docs/pytorch/stable/) to fine-tune any causal language model (from `transformers`) on any text dataset (from `datasets`) with any number of GPUs or nodes.
 
-[https://torchrun.xyz/lightning_train.py](https://raw.githubusercontent.com/apoorvkh/torchrunx/refs/heads/main/docs/source/examples/scripts/lightning_train.py)
+[https://torchrun.xyz/lightning_train.py](https://raw.githubusercontent.com/apoorvkh/torchrunx/refs/heads/main/scripts/examples/lightning_train.py)
 
 python lightning_train.py --help
 
diff --git a/docs/source/examples/transformers.md b/docs/source/examples/transformers.md
index 097483d..28c1945 100644
--- a/docs/source/examples/transformers.md
+++ b/docs/source/examples/transformers.md
@@ -2,7 +2,7 @@
 
 Here's an example script that uses `torchrunx` with [`transformers.Trainer`](https://huggingface.co/docs/transformers/en/main_classes/trainer) to fine-tune any causal language model (from `transformers`) on any text dataset (from `datasets`) with any number of GPUs or nodes.
 
-[https://torchrun.xyz/transformers_train.py](https://raw.githubusercontent.com/apoorvkh/torchrunx/refs/heads/main/docs/source/examples/scripts/transformers_train.py)
+[https://torchrun.xyz/transformers_train.py](https://raw.githubusercontent.com/apoorvkh/torchrunx/refs/heads/main/scripts/examples/transformers_train.py)
 
 python transformers_train.py --help
 
From 820ad6365db9df0aed3de9d51e645895db58dabb Mon Sep 17 00:00:00 2001
From: apoorvkh
Date: Mon, 10 Mar 2025 16:07:50 -0400
Subject: [PATCH 02/11] split release workflow

---
 .github/workflows/publish_docs.yml | 28 ++++++++++++++++++++++++++++
 .github/workflows/release.yml      | 21 ---------------------
 2 files changed, 28 insertions(+), 21 deletions(-)
 create mode 100644 .github/workflows/publish_docs.yml

diff --git a/.github/workflows/publish_docs.yml b/.github/workflows/publish_docs.yml
new file mode 100644
index 0000000..67ac957
--- /dev/null
+++ b/.github/workflows/publish_docs.yml
@@ -0,0 +1,28 @@
+name: release
+
+on:
+  release:
+    types: [published]
+
+jobs:
+
+  publish-docs:
+    runs-on: ubuntu-latest
+    permissions:
+      pages: write
+      id-token: write
+    environment:
+      name: github-pages
+      url: ${{ steps.deployment.outputs.page_url }}
+    steps:
+      - uses: actions/checkout@v4
+      - uses: astral-sh/setup-uv@v5
+        with:
+          version: "0.5.29"
+      - run: source ./scripts/build_docs.sh
+      - uses: actions/configure-pages@v5
+      - uses: actions/upload-pages-artifact@v3
+        with:
+          path: docs/_build/html
+      - id: deployment
+        uses: actions/deploy-pages@v4
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 82f6637..b26287e 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -16,24 +16,3 @@ jobs:
           version: "0.5.29"
       - run: uv build
       - run: uv publish
-
-  publish-docs:
-    runs-on: ubuntu-latest
-    permissions:
-      pages: write
-      id-token: write
-    environment:
-      name: github-pages
-      url: ${{ steps.deployment.outputs.page_url }}
-    steps:
-      - uses: actions/checkout@v4
-      - uses: astral-sh/setup-uv@v5
-        with:
-          version: "0.5.29"
-      - run: source ./scripts/build_docs.sh
-      - uses: actions/configure-pages@v5
-      - uses: actions/upload-pages-artifact@v3
-        with:
-          path: docs/_build/html
-      - id: deployment
-        uses: actions/deploy-pages@v4

From aca125937bf61dad0dd732669cbe3b1a414b12ea Mon Sep 17 00:00:00 2001
From: apoorvkh
Date: Mon, 10 Mar 2025 16:08:56 -0400
Subject: [PATCH 03/11] update workflows

---
 .github/workflows/main.yml                             | 2 +-
 .github/workflows/publish_docs.yml                     | 2 +-
 .github/workflows/{release.yml => release_to_pypi.yml} | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)
 rename .github/workflows/{release.yml => release_to_pypi.yml} (93%)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 46e6727..4fb52e7 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -1,4 +1,4 @@
-name: Run checks on push or PR to main
+name: main
 
 on:
   push:
diff --git a/.github/workflows/publish_docs.yml b/.github/workflows/publish_docs.yml
index 67ac957..6a2a443 100644
--- a/.github/workflows/publish_docs.yml
+++ b/.github/workflows/publish_docs.yml
@@ -1,4 +1,4 @@
-name: release
+name: publish_docs
 
 on:
   release:
diff --git a/.github/workflows/release.yml b/.github/workflows/release_to_pypi.yml
similarity index 93%
rename from .github/workflows/release.yml
rename to .github/workflows/release_to_pypi.yml
index b26287e..04d9d2b 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release_to_pypi.yml
@@ -1,4 +1,4 @@
-name: release
+name: release_to_pypi
 
 on:
   release:

From 67d2d2ac3eb32b15d0aa1fdedd2aea78e014dc78 Mon Sep 17 00:00:00 2001
From: apoorvkh
Date: Mon, 10 Mar 2025 16:11:06 -0400
Subject: [PATCH 04/11] added workflow_dispatch to publish_docs

---
 .github/workflows/publish_docs.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/publish_docs.yml b/.github/workflows/publish_docs.yml
index 6a2a443..e703b36 100644
--- a/.github/workflows/publish_docs.yml
+++ b/.github/workflows/publish_docs.yml
@@ -3,6 +3,7 @@ name: publish_docs
 on:
   release:
     types: [published]
+  workflow_dispatch:
 
 jobs:
 
From 5ecc1787f11122478f6ba1321d2670c4d46f49ea Mon Sep 17 00:00:00 2001
From: Apoorv Khandelwal
Date: Sat, 15 Mar 2025 11:43:39 -0400
Subject: [PATCH 05/11] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index d86e0e5..09a10f9 100644
--- a/README.md
+++ b/README.md
@@ -136,6 +136,6 @@ torch.save(trained_model.state_dict(), "output/model.pth")
 > - [Automatic detection of SLURM environments.](https://torchrun.xyz/usage/slurm.html)
 > - Start multi-node training from Python notebooks!
 > - Our library is fully typed!
-> - Custom, fine-grained handling of logging, environment variables, and exception propagation. We have nice defaults too: no more interleaved logs and irrelevant exceptions!
+> - Custom, fine-grained handling of [logging](https://torchrun.xyz/usage/logging.html), [environment variables](https://torchrun.xyz/usage/general.html#environment-variables), and [exception propagation](https://torchrun.xyz/usage/general.html#exceptions). We have nice defaults too: no more interleaved logs and irrelevant exceptions!
 
 **On our [roadmap](https://github.com/apoorvkh/torchrunx/issues?q=is%3Aopen+is%3Aissue+label%3Aenhancement): higher-order parallelism, support for debuggers, and more!**

From a13ffbd8db47e9ec156389cfae050f903850cc6e Mon Sep 17 00:00:00 2001
From: apoorvkh
Date: Sat, 15 Mar 2025 23:27:15 -0400
Subject: [PATCH 06/11] fix examples in docs

---
 scripts/examples/accelerate_train.py   | 2 +-
 scripts/examples/deepspeed_train.py    | 2 +-
 scripts/examples/lightning_train.py    | 2 +-
 scripts/examples/transformers_train.py | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/scripts/examples/accelerate_train.py b/scripts/examples/accelerate_train.py
index a085168..5db3a45 100644
--- a/scripts/examples/accelerate_train.py
+++ b/scripts/examples/accelerate_train.py
@@ -117,7 +117,7 @@ def main(
     train_dataset = load_training_data(tokenizer_name=model_config.name, dataset_config=dataset_config)
 
     # Launch training
-    results = launcher.run(train, (model, train_dataset, batch_size, output_dir))
+    results = launcher.run(train, model, train_dataset, batch_size, output_dir)
 
     # Loading trained model from checkpoint
     checkpoint_path = results.rank(0)
diff --git a/scripts/examples/deepspeed_train.py b/scripts/examples/deepspeed_train.py
index 273961f..6967bb2 100644
--- a/scripts/examples/deepspeed_train.py
+++ b/scripts/examples/deepspeed_train.py
@@ -111,7 +111,7 @@ def main(
     train_dataset = load_training_data(tokenizer_name=model_name, dataset_config=dataset_config)
 
     # Launch training
-    launcher.run(train, (model, train_dataset, str(deepspeed_config), str(checkpoint_dir)))
+    launcher.run(train, model, train_dataset, str(deepspeed_config), str(checkpoint_dir))
 
     # Loading trained model from checkpoint
     state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir)
diff --git a/scripts/examples/lightning_train.py b/scripts/examples/lightning_train.py
index 1684eb6..fc89336 100644
--- a/scripts/examples/lightning_train.py
+++ b/scripts/examples/lightning_train.py
@@ -126,7 +126,7 @@ def main(
     )
 
     # Launch training
-    results = launcher.run(train, (model, train_dataset))
+    results = launcher.run(train, model, train_dataset)
 
     # Loading trained model from checkpoint
     checkpoint_path = results.rank(0)
diff --git a/scripts/examples/transformers_train.py b/scripts/examples/transformers_train.py
index f93eca6..cd61a1e 100644
--- a/scripts/examples/transformers_train.py
+++ b/scripts/examples/transformers_train.py
@@ -102,7 +102,7 @@ def main(
     )
 
     # Launch training
-    results = launcher.run(train, (model, train_dataset, training_args))
+    results = launcher.run(train, model, train_dataset, training_args)
 
     # Loading trained model from checkpoint
     checkpoint_path = results.rank(0)
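Note on PATCH 06: the fix is to the calling convention of `Launcher.run`, which forwards the training function's arguments positionally rather than accepting them bundled into a single tuple. A minimal sketch of the convention, with an illustrative function; the `Launcher` keyword arguments shown are assumed example values, not taken from these patches:

```python
import torchrunx

def greet(name: str) -> str:
    # Runs on every worker process; return values are collected per rank.
    return f"hello, {name}"

# hostnames/workers_per_host are assumed values, for illustration only.
launcher = torchrunx.Launcher(hostnames=["localhost"], workers_per_host=2)

# Before this patch, the examples passed a single tuple: launcher.run(greet, ("world",))
# After the fix, arguments are passed positionally:
results = launcher.run(greet, "world")
print(results.rank(0))  # return value from the rank-0 worker, as in the example scripts
```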
From 7882427e981950bf2996be408fc00e05acaac733 Mon Sep 17 00:00:00 2001
From: apoorvkh
Date: Sat, 15 Mar 2025 23:42:07 -0400
Subject: [PATCH 07/11] added logging to examples

---
 scripts/examples/accelerate_train.py   | 11 +++++++++--
 scripts/examples/deepspeed_train.py    |  3 +++
 scripts/examples/lightning_train.py    |  4 +++-
 scripts/examples/transformers_train.py |  2 ++
 4 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/scripts/examples/accelerate_train.py b/scripts/examples/accelerate_train.py
index 5db3a45..2a0eba7 100644
--- a/scripts/examples/accelerate_train.py
+++ b/scripts/examples/accelerate_train.py
@@ -13,6 +13,7 @@ from __future__ import annotations
 
 import functools
+import logging
 import os
 from dataclasses import dataclass
 from pathlib import Path
 
@@ -27,6 +28,8 @@
 
 import torchrunx
 
+logging.basicConfig(level=logging.INFO)
+
 
 @dataclass
 class ModelConfig:
@@ -114,14 +117,18 @@ def main(
     output_dir: Path,
 ):
     model = AutoModelForCausalLM.from_pretrained(model_config.name)
-    train_dataset = load_training_data(tokenizer_name=model_config.name, dataset_config=dataset_config)
+    train_dataset = load_training_data(
+        tokenizer_name=model_config.name, dataset_config=dataset_config
+    )
 
     # Launch training
     results = launcher.run(train, model, train_dataset, batch_size, output_dir)
 
     # Loading trained model from checkpoint
     checkpoint_path = results.rank(0)
-    trained_model = AutoModelForCausalLM.from_pretrained(model_config.name, state_dict=torch.load(checkpoint_path))
+    trained_model = AutoModelForCausalLM.from_pretrained(
+        model_config.name, state_dict=torch.load(checkpoint_path)
+    )
 
 
 if __name__ == "__main__":
diff --git a/scripts/examples/deepspeed_train.py b/scripts/examples/deepspeed_train.py
index 6967bb2..5ed085f 100644
--- a/scripts/examples/deepspeed_train.py
+++ b/scripts/examples/deepspeed_train.py
@@ -15,6 +15,7 @@ from __future__ import annotations
 
 import functools
+import logging
 import os
 from dataclasses import dataclass
 from pathlib import Path
 
@@ -30,6 +31,8 @@
 
 import torchrunx
 
+logging.basicConfig(level=logging.INFO)
+
 
 @dataclass
 class DatasetConfig:
diff --git a/scripts/examples/lightning_train.py b/scripts/examples/lightning_train.py
index fc89336..1a91e52 100644
--- a/scripts/examples/lightning_train.py
+++ b/scripts/examples/lightning_train.py
@@ -14,13 +14,13 @@ from __future__ import annotations
 
 import functools
+import logging
 import os
 from dataclasses import dataclass
 from typing import Annotated
 
 import lightning as L
 import torch
-
 import tyro
 from datasets import load_dataset
 from torch.utils.data import Dataset
 
@@ -29,6 +29,8 @@
 import torchrunx
 from torchrunx.integrations.lightning import TorchrunxClusterEnvironment
 
+logging.basicConfig(level=logging.INFO)
+
 
 @dataclass
 class ModelConfig:
diff --git a/scripts/examples/transformers_train.py b/scripts/examples/transformers_train.py
index cd61a1e..498c4c1 100644
--- a/scripts/examples/transformers_train.py
+++ b/scripts/examples/transformers_train.py
@@ -13,6 +13,7 @@ from __future__ import annotations
 
 import functools
+import logging
 import os
 from dataclasses import dataclass
 from typing import Annotated
 
@@ -30,6 +31,7 @@
 
 import torchrunx
 
+logging.basicConfig(level=logging.INFO)
 
 @dataclass
 class ModelConfig:

From ac6fac9344f1414ae40a95fd9962321d4c3d00b3 Mon Sep 17 00:00:00 2001
From: apoorvkh
Date: Sun, 16 Mar 2025 15:28:27 -0400
Subject: [PATCH 08/11] initial code for capturing stdout/err from fd instead
 of sys.stdout/err

---
 src/torchrunx/utils/log_streaming.py | 49 +++++++++++++++++-----------
 1 file changed, 30 insertions(+), 19 deletions(-)

diff --git a/src/torchrunx/utils/log_streaming.py b/src/torchrunx/utils/log_streaming.py
index af5ff52..69f8d58 100644
--- a/src/torchrunx/utils/log_streaming.py
+++ b/src/torchrunx/utils/log_streaming.py
@@ -10,17 +10,17 @@
 ]
 
 import logging
+import os
 import pickle
 import signal
 import struct
 import sys
-from contextlib import redirect_stderr, redirect_stdout
 from dataclasses import dataclass
-from io import StringIO
 from logging import Handler, Logger
 from logging.handlers import SocketHandler
 from multiprocessing.synchronize import Event as EventClass
 from socketserver import StreamRequestHandler, ThreadingTCPServer
+from threading import Thread
 from typing import Callable
 
 import cloudpickle
@@ -129,24 +129,35 @@ def start_logging_server(serialized_args: bytes, stop_event: EventClass) -> None
 
 def redirect_stdio_to_logger(logger: Logger) -> None:
     """Redirect stderr/stdout: send output to logger at every flush."""
-
-    class _LoggingStream(StringIO):
-        def __init__(self, logger: Logger, level: int = logging.NOTSET) -> None:
-            super().__init__()
-            self.logger = logger
-            self.level = level
-
-        def flush(self) -> None:
-            super().flush()  # At "flush" to avoid logs of partial bytes
-            value = self.getvalue()
-            if value != "":
-                self.logger.log(self.level, value)
-                self.truncate(0)
-                self.seek(0)
-
     logging.captureWarnings(capture=True)
-    redirect_stderr(_LoggingStream(logger, level=logging.ERROR)).__enter__()
-    redirect_stdout(_LoggingStream(logger, level=logging.INFO)).__enter__()
+
+    def redirect_fd_to_logger(read_fd: int, level: int) -> None:
+        for line in os.fdopen(read_fd):
+            logger.log(level, line.rstrip())
+
+    # create (r, w) pipe and start logging all outputs from r
+    read_out_fd, write_out_fd = os.pipe()
+    Thread(
+        target=redirect_fd_to_logger,
+        kwargs={"read_fd": read_out_fd, "level": logging.INFO},
+        daemon=True,
+    ).start()
+    # flush buffer before redirecting stdout
+    sys.stdout.flush()
+    # pipe: r <-> stdout instead of r <-> w
+    os.dup2(write_out_fd, sys.stdout.fileno())  # set stdout fd to pipe
+    os.close(write_out_fd)
+
+    # repeat for stderr
+    read_err_fd, write_err_fd = os.pipe()
+    Thread(
+        target=redirect_fd_to_logger,
+        kwargs={"read_fd": read_err_fd, "level": logging.ERROR},
+        daemon=True,
+    ).start()
+    sys.stderr.flush()
+    os.dup2(write_err_fd, sys.stderr.fileno())
+    os.close(write_err_fd)
 
 
 @dataclass
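Context on PATCH 08: `contextlib.redirect_stdout`/`redirect_stderr` only intercept writes that go through the Python-level `sys.stdout`/`sys.stderr` objects. The new code redirects at the file-descriptor level instead: each standard stream's descriptor is `dup2`'d onto the write end of a pipe, and a daemon thread forwards each line read from the pipe to the logger. This also captures output written directly to fds 1 and 2 by C extensions and subprocesses. Below is a self-contained sketch of the same pattern, independent of `torchrunx` (all names are illustrative):

```python
import logging
import os
import sys
import time
from threading import Thread

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("capture")

# Pipe: anything written to write_fd can be read back from read_fd.
read_fd, write_fd = os.pipe()

def pump(fd: int) -> None:
    # os.fdopen wraps the descriptor in a file object; iterating yields
    # lines until every write end of the pipe has been closed.
    for line in os.fdopen(fd):
        logger.info(line.rstrip())

Thread(target=pump, args=(read_fd,), daemon=True).start()

sys.stdout.flush()                      # push out already-buffered output first
os.dup2(write_fd, sys.stdout.fileno())  # fd 1 now points at the pipe
os.close(write_fd)                      # fd 1 keeps the write end alive

print("hello from fd 1")                # reaches the logger, not the terminal
sys.stdout.flush()
time.sleep(0.1)                         # let the reader thread drain the pipe
```

Note the flush before `dup2`: anything still sitting in the stream's userspace buffer when the descriptor is swapped would otherwise be lost or misrouted.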
From 10e5d33794c9152ba7315792e3d70574957c0282 Mon Sep 17 00:00:00 2001
From: apoorvkh
Date: Sun, 16 Mar 2025 15:29:40 -0400
Subject: [PATCH 09/11] bump version to 0.3.1

---
 pyproject.toml | 2 +-
 uv.lock        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index dfffaf3..4210d4e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "torchrunx"
-version = "0.3.0"
+version = "0.3.1"
 authors = [
     { name = "Apoorv Khandelwal", email = "mail@apoorvkh.com" },
     { name = "Peter Curtin", email = "peter_curtin@brown.edu" },
diff --git a/uv.lock b/uv.lock
index 13fe003..c10535d 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1803,7 +1803,7 @@ wheels = [
 
 [[package]]
 name = "torchrunx"
-version = "0.3.0"
+version = "0.3.1"
 source = { editable = "." }
 dependencies = [
     { name = "cloudpickle" },
From b0e1657ecaac645fee28661690f28469cba1be5c Mon Sep 17 00:00:00 2001
From: apoorvkh
Date: Mon, 17 Mar 2025 01:52:42 -0400
Subject: [PATCH 10/11] added logging info to readme example

---
 README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/README.md b/README.md
index 09a10f9..969d2a5 100644
--- a/README.md
+++ b/README.md
@@ -69,6 +69,9 @@ def distributed_training(model: nn.Module, num_steps: int = 10) -> nn.Module | N
 We can distribute and run this function (e.g. on 2 machines x 2 GPUs) using **`torchrunx`**!
 
 ```python
+import logging
+logging.basicConfig(level=logging.INFO)
+
 import torchrunx
 
 launcher = torchrunx.Launcher(

From 62bd030833c909643f5e9f3b83d4480dd8cbb059 Mon Sep 17 00:00:00 2001
From: apoorvkh
Date: Mon, 17 Mar 2025 01:55:08 -0400
Subject: [PATCH 11/11] edit logging docs

---
 docs/source/usage/logging.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/usage/logging.md b/docs/source/usage/logging.md
index 0e76493..2352db1 100644
--- a/docs/source/usage/logging.md
+++ b/docs/source/usage/logging.md
@@ -1,6 +1,6 @@
 # Custom Logging
 
-We forward all agent and worker logs (i.e. from {mod}`logging`, {obj}`sys.stdout`, and {obj}`sys.stderr`) to the launcher process.
+We forward all agent and worker logs (i.e. from {mod}`logging`, `stdout`, and `stderr`) to the launcher process.
 
 ## Defaults
 
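PATCH 07 and PATCH 10 make the same point as the logging docs edited in PATCH 11: forwarded agent/worker output reaches the launcher process as {mod}`logging` records, so the launcher should configure a handler (at `INFO` level or lower) before launching; otherwise Python's last-resort handler only displays warnings and above. A minimal end-to-end sketch (the `Launcher` arguments are again assumed example values, and `hello_world` is illustrative):

```python
import logging
logging.basicConfig(level=logging.INFO)  # configure the root logger *before* launching

import torchrunx

def hello_world() -> None:
    # stdout/stderr and logging records from each worker are forwarded to
    # the launcher and emitted through the handler configured above.
    print("hello from a worker")

launcher = torchrunx.Launcher(hostnames=["localhost"], workers_per_host=2)
launcher.run(hello_world)
```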