diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 46e6727..4fb52e7 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -1,4 +1,4 @@
-name: Run checks on push or PR to main
+name: main
 
 on:
   push:
diff --git a/.github/workflows/release.yml b/.github/workflows/publish_docs.yml
similarity index 69%
rename from .github/workflows/release.yml
rename to .github/workflows/publish_docs.yml
index 82f6637..e703b36 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/publish_docs.yml
@@ -1,21 +1,11 @@
-name: release
+name: publish_docs
 
 on:
   release:
     types: [published]
+  workflow_dispatch:
 
 jobs:
-  release-to-pypi:
-    runs-on: ubuntu-latest
-    permissions:
-      id-token: write
-    steps:
-      - uses: actions/checkout@v4
-      - uses: astral-sh/setup-uv@v5
-        with:
-          version: "0.5.29"
-      - run: uv build
-      - run: uv publish
 
   publish-docs:
     runs-on: ubuntu-latest
diff --git a/.github/workflows/release_to_pypi.yml b/.github/workflows/release_to_pypi.yml
new file mode 100644
index 0000000..04d9d2b
--- /dev/null
+++ b/.github/workflows/release_to_pypi.yml
@@ -0,0 +1,18 @@
+name: release_to_pypi
+
+on:
+  release:
+    types: [published]
+
+jobs:
+  release-to-pypi:
+    runs-on: ubuntu-latest
+    permissions:
+      id-token: write
+    steps:
+      - uses: actions/checkout@v4
+      - uses: astral-sh/setup-uv@v5
+        with:
+          version: "0.5.29"
+      - run: uv build
+      - run: uv publish
diff --git a/README.md b/README.md
index d86e0e5..969d2a5 100644
--- a/README.md
+++ b/README.md
@@ -69,6 +69,9 @@ def distributed_training(model: nn.Module, num_steps: int = 10) -> nn.Module | N
 We can distribute and run this function (e.g. on 2 machines x 2 GPUs) using **`torchrunx`**!
 
 ```python
+import logging
+logging.basicConfig(level=logging.INFO)
+
 import torchrunx
 
 launcher = torchrunx.Launcher(
@@ -136,6 +139,6 @@ torch.save(trained_model.state_dict(), "output/model.pth")
 > - [Automatic detection of SLURM environments.](https://torchrun.xyz/usage/slurm.html)
 > - Start multi-node training from Python notebooks!
 > - Our library is fully typed!
-> - Custom, fine-grained handling of logging, environment variables, and exception propagation. We have nice defaults too: no more interleaved logs and irrelevant exceptions!
+> - Custom, fine-grained handling of [logging](https://torchrun.xyz/usage/logging.html), [environment variables](https://torchrun.xyz/usage/general.html#environment-variables), and [exception propagation](https://torchrun.xyz/usage/general.html#exceptions). We have nice defaults too: no more interleaved logs and irrelevant exceptions!
 
 **On our [roadmap](https://github.com/apoorvkh/torchrunx/issues?q=is%3Aopen+is%3Aissue+label%3Aenhancement): higher-order parallelism, support for debuggers, and more!**
diff --git a/docs/source/examples/accelerate.md b/docs/source/examples/accelerate.md
index cb5e371..0aa88ac 100644
--- a/docs/source/examples/accelerate.md
+++ b/docs/source/examples/accelerate.md
@@ -2,7 +2,7 @@
 
 Here's an example script that uses `torchrunx` with [Accelerate](https://huggingface.co/docs/accelerate/en/index) to fine-tune any causal language model (from `transformers`) on any text dataset (from `datasets`) with any number of GPUs or nodes.
 
-[https://torchrun.xyz/accelerate_train.py](https://raw.githubusercontent.com/apoorvkh/torchrunx/refs/heads/main/docs/source/examples/scripts/accelerate_train.py)
+[https://torchrun.xyz/accelerate_train.py](https://raw.githubusercontent.com/apoorvkh/torchrunx/refs/heads/main/scripts/examples/accelerate_train.py)

python accelerate_train.py --help

diff --git a/docs/source/examples/deepspeed.md b/docs/source/examples/deepspeed.md
index e4cea9b..55cfcd4 100644
--- a/docs/source/examples/deepspeed.md
+++ b/docs/source/examples/deepspeed.md
@@ -2,7 +2,7 @@
 
 Here's an example script that uses `torchrunx` with [DeepSpeed](https://www.deepspeed.ai) to fine-tune any causal language model (from `transformers`) on any text dataset (from `datasets`) with any number of GPUs or nodes.
 
-[https://torchrun.xyz/deepspeed_train.py](https://raw.githubusercontent.com/apoorvkh/torchrunx/refs/heads/main/docs/source/examples/scripts/deepspeed_train.py)
+[https://torchrun.xyz/deepspeed_train.py](https://raw.githubusercontent.com/apoorvkh/torchrunx/refs/heads/main/scripts/examples/deepspeed_train.py)

python deepspeed_train.py --help

diff --git a/docs/source/examples/lightning.md b/docs/source/examples/lightning.md
index 7814de2..c5eae53 100644
--- a/docs/source/examples/lightning.md
+++ b/docs/source/examples/lightning.md
@@ -2,7 +2,7 @@
 
 Here's an example script that uses `torchrunx` with [PyTorch Lightning](https://lightning.ai/docs/pytorch/stable/) to fine-tune any causal language model (from `transformers`) on any text dataset (from `datasets`) with any number of GPUs or nodes.
 
-[https://torchrun.xyz/lightning_train.py](https://raw.githubusercontent.com/apoorvkh/torchrunx/refs/heads/main/docs/source/examples/scripts/lightning_train.py)
+[https://torchrun.xyz/lightning_train.py](https://raw.githubusercontent.com/apoorvkh/torchrunx/refs/heads/main/scripts/examples/lightning_train.py)

python lightning_train.py --help

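Note: the `lightning_train.py` script linked above integrates `torchrunx` with Lightning via `torchrunx.integrations.lightning.TorchrunxClusterEnvironment` (see its diff further below). As a rough sketch of how a cluster environment is typically attached to a Lightning `Trainer` — the exact wiring in the script may differ, and the constructor/`Trainer` arguments here are assumptions:

```python
import lightning as L

from torchrunx.integrations.lightning import TorchrunxClusterEnvironment

# Cluster environments are passed to the Trainer through `plugins`;
# assuming TorchrunxClusterEnvironment takes no constructor arguments.
trainer = L.Trainer(
    accelerator="gpu",
    devices="auto",
    strategy="ddp",
    plugins=[TorchrunxClusterEnvironment()],
    max_epochs=1,
)
```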
diff --git a/docs/source/examples/transformers.md b/docs/source/examples/transformers.md
index 097483d..28c1945 100644
--- a/docs/source/examples/transformers.md
+++ b/docs/source/examples/transformers.md
@@ -2,7 +2,7 @@
 
 Here's an example script that uses `torchrunx` with [`transformers.Trainer`](https://huggingface.co/docs/transformers/en/main_classes/trainer) to fine-tune any causal language model (from `transformers`) on any text dataset (from `datasets`) with any number of GPUs or nodes.
 
-[https://torchrun.xyz/transformers_train.py](https://raw.githubusercontent.com/apoorvkh/torchrunx/refs/heads/main/docs/source/examples/scripts/transformers_train.py)
+[https://torchrun.xyz/transformers_train.py](https://raw.githubusercontent.com/apoorvkh/torchrunx/refs/heads/main/scripts/examples/transformers_train.py)

python transformers_train.py --help

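Note: in the script diffs below, arguments are passed to `launcher.run` positionally (rather than bundled into a single tuple), and each script calls `logging.basicConfig` so that forwarded worker logs are printed by the launcher process. A minimal sketch of that calling convention — the hostnames, worker count, and toy function are illustrative assumptions, not taken from the scripts:

```python
import logging

import torchrunx

# show INFO-level records (including forwarded worker logs) on the launcher
logging.basicConfig(level=logging.INFO)


def greet(name: str, count: int) -> str:
    return f"hello {name} x{count}"


launcher = torchrunx.Launcher(hostnames=["localhost"], workers_per_host=2)

# function arguments are passed directly after the function, not as a tuple
results = launcher.run(greet, "world", 3)
print(results.rank(0))  # return value from the rank-0 worker
```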
diff --git a/docs/source/usage/logging.md b/docs/source/usage/logging.md
index 0e76493..2352db1 100644
--- a/docs/source/usage/logging.md
+++ b/docs/source/usage/logging.md
@@ -1,6 +1,6 @@
 # Custom Logging
 
-We forward all agent and worker logs (i.e. from {mod}`logging`, {obj}`sys.stdout`, and {obj}`sys.stderr`) to the launcher process.
+We forward all agent and worker logs (i.e. from {mod}`logging`, `stdout`, and `stderr`) to the launcher process.
 
 ## Defaults
diff --git a/pyproject.toml b/pyproject.toml
index dfffaf3..4210d4e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "torchrunx"
-version = "0.3.0"
+version = "0.3.1"
 authors = [
   { name = "Apoorv Khandelwal", email = "mail@apoorvkh.com" },
   { name = "Peter Curtin", email = "peter_curtin@brown.edu" },
diff --git a/scripts/examples/accelerate_train.py b/scripts/examples/accelerate_train.py
index a085168..2a0eba7 100644
--- a/scripts/examples/accelerate_train.py
+++ b/scripts/examples/accelerate_train.py
@@ -13,6 +13,7 @@
 from __future__ import annotations
 
 import functools
+import logging
 import os
 from dataclasses import dataclass
 from pathlib import Path
@@ -27,6 +28,8 @@
 
 import torchrunx
 
+logging.basicConfig(level=logging.INFO)
+
 
 @dataclass
 class ModelConfig:
@@ -114,14 +117,18 @@ def main(
     output_dir: Path,
 ):
     model = AutoModelForCausalLM.from_pretrained(model_config.name)
-    train_dataset = load_training_data(tokenizer_name=model_config.name, dataset_config=dataset_config)
+    train_dataset = load_training_data(
+        tokenizer_name=model_config.name, dataset_config=dataset_config
+    )
 
     # Launch training
-    results = launcher.run(train, (model, train_dataset, batch_size, output_dir))
+    results = launcher.run(train, model, train_dataset, batch_size, output_dir)
 
     # Loading trained model from checkpoint
     checkpoint_path = results.rank(0)
-    trained_model = AutoModelForCausalLM.from_pretrained(model_config.name, state_dict=torch.load(checkpoint_path))
+    trained_model = AutoModelForCausalLM.from_pretrained(
+        model_config.name, state_dict=torch.load(checkpoint_path)
+    )
 
 
 if __name__ == "__main__":
diff --git a/scripts/examples/deepspeed_train.py b/scripts/examples/deepspeed_train.py
index 273961f..5ed085f 100644
--- a/scripts/examples/deepspeed_train.py
+++ b/scripts/examples/deepspeed_train.py
@@ -15,6 +15,7 @@
 from __future__ import annotations
 
 import functools
+import logging
 import os
 from dataclasses import dataclass
 from pathlib import Path
@@ -30,6 +31,8 @@
 
 import torchrunx
 
+logging.basicConfig(level=logging.INFO)
+
 
 @dataclass
 class DatasetConfig:
@@ -111,7 +114,7 @@ def main(
     train_dataset = load_training_data(tokenizer_name=model_name, dataset_config=dataset_config)
 
     # Launch training
-    launcher.run(train, (model, train_dataset, str(deepspeed_config), str(checkpoint_dir)))
+    launcher.run(train, model, train_dataset, str(deepspeed_config), str(checkpoint_dir))
 
     # Loading trained model from checkpoint
     state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir)
diff --git a/scripts/examples/lightning_train.py b/scripts/examples/lightning_train.py
index 1684eb6..1a91e52 100644
--- a/scripts/examples/lightning_train.py
+++ b/scripts/examples/lightning_train.py
@@ -14,13 +14,13 @@
 from __future__ import annotations
 
 import functools
+import logging
 import os
 from dataclasses import dataclass
 from typing import Annotated
 
 import lightning as L
 import torch
-
 import tyro
 from datasets import load_dataset
 from torch.utils.data import Dataset
@@ -29,6 +29,8 @@
 import torchrunx
 from torchrunx.integrations.lightning import TorchrunxClusterEnvironment
 
+logging.basicConfig(level=logging.INFO)
+
 
 @dataclass
 class ModelConfig:
@@ -126,7 +128,7 @@ def main(
     )
 
     # Launch training
-    results = launcher.run(train, (model, train_dataset))
+    results = launcher.run(train, model, train_dataset)
 
     # Loading trained model from checkpoint
     checkpoint_path = results.rank(0)
diff --git a/scripts/examples/transformers_train.py b/scripts/examples/transformers_train.py
index f93eca6..498c4c1 100644
--- a/scripts/examples/transformers_train.py
+++ b/scripts/examples/transformers_train.py
@@ -13,6 +13,7 @@
 from __future__ import annotations
 
 import functools
+import logging
 import os
 from dataclasses import dataclass
 from typing import Annotated
@@ -30,6 +31,7 @@
 
 import torchrunx
 
+logging.basicConfig(level=logging.INFO)
 
 @dataclass
 class ModelConfig:
@@ -102,7 +104,7 @@ def main(
     )
 
     # Launch training
-    results = launcher.run(train, (model, train_dataset, training_args))
+    results = launcher.run(train, model, train_dataset, training_args)
 
     # Loading trained model from checkpoint
     checkpoint_path = results.rank(0)
diff --git a/src/torchrunx/utils/log_streaming.py b/src/torchrunx/utils/log_streaming.py
index af5ff52..69f8d58 100644
--- a/src/torchrunx/utils/log_streaming.py
+++ b/src/torchrunx/utils/log_streaming.py
@@ -10,17 +10,17 @@
 ]
 
 import logging
+import os
 import pickle
 import signal
 import struct
 import sys
-from contextlib import redirect_stderr, redirect_stdout
 from dataclasses import dataclass
-from io import StringIO
 from logging import Handler, Logger
 from logging.handlers import SocketHandler
 from multiprocessing.synchronize import Event as EventClass
 from socketserver import StreamRequestHandler, ThreadingTCPServer
+from threading import Thread
 from typing import Callable
 
 import cloudpickle
@@ -129,24 +129,35 @@ def start_logging_server(serialized_args: bytes, stop_event: EventClass) -> None
 
 def redirect_stdio_to_logger(logger: Logger) -> None:
     """Redirect stderr/stdout: send output to logger at every flush."""
-
-    class _LoggingStream(StringIO):
-        def __init__(self, logger: Logger, level: int = logging.NOTSET) -> None:
-            super().__init__()
-            self.logger = logger
-            self.level = level
-
-        def flush(self) -> None:
-            super().flush()  # At "flush" to avoid logs of partial bytes
-            value = self.getvalue()
-            if value != "":
-                self.logger.log(self.level, value)
-                self.truncate(0)
-                self.seek(0)
-
     logging.captureWarnings(capture=True)
-    redirect_stderr(_LoggingStream(logger, level=logging.ERROR)).__enter__()
-    redirect_stdout(_LoggingStream(logger, level=logging.INFO)).__enter__()
+
+    def redirect_fd_to_logger(read_fd: int, level: int) -> None:
+        for line in os.fdopen(read_fd):
+            logger.log(level, line.rstrip())
+
+    # create (r, w) pipe and start logging all outputs from r
+    read_out_fd, write_out_fd = os.pipe()
+    Thread(
+        target=redirect_fd_to_logger,
+        kwargs={"read_fd": read_out_fd, "level": logging.INFO},
+        daemon=True,
+    ).start()
+    # flush buffer before redirecting stdout
+    sys.stdout.flush()
+    # pipe: r <-> stdout instead of r <-> w
+    os.dup2(write_out_fd, sys.stdout.fileno())  # set stdout fd to pipe
+    os.close(write_out_fd)
+
+    # repeat for stderr
+    read_err_fd, write_err_fd = os.pipe()
+    Thread(
+        target=redirect_fd_to_logger,
+        kwargs={"read_fd": read_err_fd, "level": logging.ERROR},
+        daemon=True,
+    ).start()
+    sys.stderr.flush()
+    os.dup2(write_err_fd, sys.stderr.fileno())
+    os.close(write_err_fd)
 
 
 @dataclass
diff --git a/uv.lock b/uv.lock
index 13fe003..c10535d 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1803,7 +1803,7 @@ wheels = [
 
 [[package]]
 name = "torchrunx"
-version = "0.3.0"
+version = "0.3.1"
 source = { editable = "." }
 dependencies = [
     { name = "cloudpickle" },
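
Note: the rewritten `redirect_stdio_to_logger` above replaces the `StringIO`-based `redirect_stdout`/`redirect_stderr` approach with an OS-level pipe: the stdout/stderr file descriptors are pointed at the pipe's write end via `os.dup2`, and a daemon thread reads the other end line by line and forwards each line to the logger. A self-contained sketch of that pattern (the logger name and demo output are illustrative only, not part of the library):

```python
import logging
import os
import sys
import time
from threading import Thread

logging.basicConfig(level=logging.INFO)  # the handler writes to the real stderr
logger = logging.getLogger("stdio_demo")


def forward_fd_to_logger(read_fd: int, level: int) -> None:
    # read the pipe line by line and emit each line as a log record
    for line in os.fdopen(read_fd):
        logger.log(level, line.rstrip())


read_fd, write_fd = os.pipe()
Thread(target=forward_fd_to_logger, args=(read_fd, logging.INFO), daemon=True).start()

sys.stdout.flush()                      # flush anything already buffered
os.dup2(write_fd, sys.stdout.fileno())  # the stdout fd now points at the pipe
os.close(write_fd)

print("captured via the pipe", flush=True)  # arrives as an INFO log record
time.sleep(0.1)  # give the daemon reader thread a moment before exit
```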