From 5d1ca04f7609c88da2d3186f2503725690b16ac8 Mon Sep 17 00:00:00 2001 From: Peter Curtin <98424367+pmcurtin@users.noreply.github.com> Date: Mon, 30 Sep 2024 14:11:09 -0400 Subject: [PATCH 01/50] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 51f1f772..6ba8cdab 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ Shared filesystem & SSH access if using multiple machines ## Minimal example -Here's a simple example where we distribute `distributed_function` to two hosts (with 2 GPUs each): +Here's a simple example where we distribute `train_model` to two hosts (with 2 GPUs each): ```python def train_model(model, dataset): From c2a46a0d4d6c4aec4121033784a398025bf7db58 Mon Sep 17 00:00:00 2001 From: Peter Curtin <98424367+pmcurtin@users.noreply.github.com> Date: Mon, 30 Sep 2024 14:42:25 -0400 Subject: [PATCH 02/50] Update launcher.py update docstrings in laucher.py --- src/torchrunx/launcher.py | 40 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/src/torchrunx/launcher.py b/src/torchrunx/launcher.py index cd4a6098..879dd011 100644 --- a/src/torchrunx/launcher.py +++ b/src/torchrunx/launcher.py @@ -182,13 +182,13 @@ def run( # noqa: C901, PLR0912 :param func: The distributed function to call on all workers :type func: Callable - :param func_args: Any positional arguments to be provided when calling ``func`` - :type func_args: tuple[Any] - :param func_kwargs: Any keyword arguments to be provided when calling ``func`` - :type func_kwargs: dict[str, Any] + :param func_args: Any positional arguments to be provided when calling ``func``, defaults to None + :type func_args: tuple[Any] | None + :param func_kwargs: Any keyword arguments to be provided when calling ``func``, defaults to None + :type func_kwargs: dict[str, Any] | None :raises RuntimeError: May fail due to misconfiguration, or errors thrown by ``func`` :return: A dictionary mapping worker ranks to their output - :rtype: dict[int, Any] + :rtype: LaunchResult """ if not dist.is_available(): msg = "The torch.distributed package is not available." @@ -335,31 +335,29 @@ def launch( :param func: The distributed function to call on all workers :type func: Callable - :param func_args: Any positional arguments to be provided when calling ``func`` - :type func_args: tuple[Any] - :param func_kwargs: Any keyword arguments to be provided when calling ``func`` - :type func_kwargs: dict[str, Any] - :param auto: Automatically determine allocation sizes, supports Slurm allocation. ``hostnames`` and ``workers_per_host`` are automatically assigned if they're set to ``None``, defaults to None - :type auto: bool, optional - :param hostnames: A list of node hostnames to start workers on, defaults to ["localhost"] - :type hostnames: list[str] | Literal["auto", "slurm"] | None, optional - :param workers_per_host: The number of workers per node. 
Providing an ``int`` implies all nodes should have ``workers_per_host`` workers, meanwhile providing a list causes node ``i`` to have ``worker_per_host[i]`` workers, defaults to 1 + :param func_args: Any positional arguments to be provided when calling ``func``, defaults to None + :type func_args: tuple[Any] | None, optional + :param func_kwargs: Any keyword arguments to be provided when calling ``func``, defaults to None + :type func_kwargs: dict[str, Any] | None, optional + :param hostnames: A list of node hostnames to start workers on, or a string indicating whether to use slurm or automatically decide the hosts, defaults to "auto" + :type hostnames: list[str] | Literal["auto", "slurm"], optional + :param workers_per_host: The number of workers per node. Providing an ``int`` implies all nodes should have ``workers_per_host`` workers, meanwhile providing a list causes node ``i`` to have ``worker_per_host[i]`` workers, defaults to auto :type workers_per_host: int | list[int] | Literal["auto", "slurm"] | None, optional :param ssh_config_file: An SSH configuration file to use when connecting to nodes, defaults to None :type ssh_config_file: str | os.PathLike | None, optional - :param backend: A ``torch.distributed`` `backend string `_, defaults to None - :type backend: Literal['mpi', 'gloo', 'nccl', 'ucc', None], optional - :param log_handlers: A list of handlers to manage agent and worker logs, defaults to [] - :type log_handlers: list[Handler] | Literal["auto"], optional - :param env_vars: A list of environmental variables to be copied from the launcher environment to workers. Allows for bash pattern matching syntax, defaults to ["PATH", "LD_LIBRARY", "LIBRARY_PATH", "PYTHON*", "CUDA*", "TORCH*", "PYTORCH*", "NCCL*"] - :type env_vars: list[str], optional + :param backend: A ``torch.distributed`` `backend string `_. If ``None``, doesn't initialize a process group for you, defaults to "auto" + :type backend: Literal['nccl', 'gloo', 'mpi', 'ucc', 'auto'] | None, optional + :param log_handlers: A list of handlers to manage agent and worker logs, or ``"auto"`` to use an automatic basic logging scheme, defaults to "auto" + :type log_handlers: list[Handler] | Literal["auto"] | None, optional + :param env_vars: A list of environmental variables to be copied from the launcher environment to workers. Allows for bash pattern matching syntax, defaults to ("PATH", "LD_LIBRARY", "LIBRARY_PATH", "PYTHON*", "CUDA*", "TORCH*", "PYTORCH*", "NCCL*") + :type env_vars: Tuple[str], optional :param env_file: An additional environment file that will be sourced prior to executing ``func``, defaults to None :type env_file: str | os.PathLike | None, optional :param timeout: Worker process group timeout, defaults to 600 :type timeout: int, optional :raises RuntimeError: May fail due to misconfiguration, or errors thrown by ``func`` :return: A dictionary mapping worker ranks to their output - :rtype: dict[int, Any] + :rtype: LaunchResult """ # noqa: E501 return Launcher( hostnames=hostnames, From 8121683de2eb416916e9925a70aff5628360b54b Mon Sep 17 00:00:00 2001 From: Peter Curtin <98424367+pmcurtin@users.noreply.github.com> Date: Mon, 30 Sep 2024 14:48:10 -0400 Subject: [PATCH 03/50] Update api.rst --- docs/source/api.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/api.rst b/docs/source/api.rst index 5726f26b..293b6b52 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -5,4 +5,4 @@ API TODO: examples, environmental variables available to workers (e.g. RANK, LOCAL_RANK) .. 
automodule:: torchrunx - :members: launch, slurm_hosts, slurm_workers \ No newline at end of file + :members: launch(func), slurm_hosts, slurm_workers From 44bd676ccf9e3a47e224a66e6f35a2ad187ce2d1 Mon Sep 17 00:00:00 2001 From: Peter Curtin <98424367+pmcurtin@users.noreply.github.com> Date: Mon, 30 Sep 2024 14:50:33 -0400 Subject: [PATCH 04/50] Update api.rst --- docs/source/api.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/source/api.rst b/docs/source/api.rst index 293b6b52..5a184eba 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -4,5 +4,7 @@ API .. TODO: examples, environmental variables available to workers (e.g. RANK, LOCAL_RANK) +.. autofunction:: torchrunx.launch(func) + .. automodule:: torchrunx - :members: launch(func), slurm_hosts, slurm_workers + :members: slurm_hosts, slurm_workers From 855235978f371b670b27c7560956c594da6c9869 Mon Sep 17 00:00:00 2001 From: Peter Curtin <98424367+pmcurtin@users.noreply.github.com> Date: Mon, 30 Sep 2024 14:52:20 -0400 Subject: [PATCH 05/50] Update api.rst --- docs/source/api.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/api.rst b/docs/source/api.rst index 5a184eba..17b9a97d 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -4,7 +4,7 @@ API .. TODO: examples, environmental variables available to workers (e.g. RANK, LOCAL_RANK) -.. autofunction:: torchrunx.launch(func) +.. autofunction:: torchrunx.launch(func: Callable) .. automodule:: torchrunx :members: slurm_hosts, slurm_workers From 2a1061b140539d57ed15def9245865019ae1ec5d Mon Sep 17 00:00:00 2001 From: Peter Curtin <98424367+pmcurtin@users.noreply.github.com> Date: Mon, 30 Sep 2024 14:54:12 -0400 Subject: [PATCH 06/50] Update api.rst --- docs/source/api.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/api.rst b/docs/source/api.rst index 17b9a97d..9490944d 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -4,7 +4,7 @@ API .. TODO: examples, environmental variables available to workers (e.g. RANK, LOCAL_RANK) -.. autofunction:: torchrunx.launch(func: Callable) +.. autofunction:: torchrunx.launch(func: Callable, ...) .. automodule:: torchrunx :members: slurm_hosts, slurm_workers From 15411b58e5e02c557cad4b0a417107790f7ff65d Mon Sep 17 00:00:00 2001 From: Peter Curtin <98424367+pmcurtin@users.noreply.github.com> Date: Mon, 30 Sep 2024 14:58:23 -0400 Subject: [PATCH 07/50] Update api.rst --- docs/source/api.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/source/api.rst b/docs/source/api.rst index 9490944d..aed80e9c 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -1,10 +1,11 @@ API ============= -.. - TODO: examples, environmental variables available to workers (e.g. RANK, LOCAL_RANK) +.. TODO: examples .. autofunction:: torchrunx.launch(func: Callable, ...) +.. autoclass:: torchrunx.LaunchResult + .. automodule:: torchrunx :members: slurm_hosts, slurm_workers From de0b14012c67f83692bd9b3f93ca544df85f1870 Mon Sep 17 00:00:00 2001 From: Peter Curtin <98424367+pmcurtin@users.noreply.github.com> Date: Mon, 30 Sep 2024 15:00:03 -0400 Subject: [PATCH 08/50] Update api.rst --- docs/source/api.rst | 3 --- 1 file changed, 3 deletions(-) diff --git a/docs/source/api.rst b/docs/source/api.rst index aed80e9c..baa8d18f 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -6,6 +6,3 @@ API .. autofunction:: torchrunx.launch(func: Callable, ...) .. autoclass:: torchrunx.LaunchResult - -.. 
automodule:: torchrunx - :members: slurm_hosts, slurm_workers From 9419554ad5e69910eaed7f55391bb28cdb780352 Mon Sep 17 00:00:00 2001 From: Peter Curtin <98424367+pmcurtin@users.noreply.github.com> Date: Mon, 30 Sep 2024 15:04:11 -0400 Subject: [PATCH 09/50] Update __init__.py --- src/torchrunx/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/torchrunx/__init__.py b/src/torchrunx/__init__.py index 74214cb8..e9bc8fd1 100644 --- a/src/torchrunx/__init__.py +++ b/src/torchrunx/__init__.py @@ -1,9 +1,10 @@ -from .launcher import Launcher, launch +from .launcher import Launcher, launch, LaunchResult from .logging_utils import add_filter_to_handler, file_handler, stream_handler __all__ = [ "Launcher", "launch", + "LaunchResult", "add_filter_to_handler", "file_handler", "stream_handler", From dd10c4939a40305fb021ab3fb62736daf8eecea4 Mon Sep 17 00:00:00 2001 From: Peter Curtin <98424367+pmcurtin@users.noreply.github.com> Date: Mon, 30 Sep 2024 15:07:16 -0400 Subject: [PATCH 10/50] Update api.rst --- docs/source/api.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/api.rst b/docs/source/api.rst index baa8d18f..87c494f1 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -6,3 +6,4 @@ API .. autofunction:: torchrunx.launch(func: Callable, ...) .. autoclass:: torchrunx.LaunchResult + :members: From e9747557aae2c749a7ba3e95648879f9f83c7c85 Mon Sep 17 00:00:00 2001 From: Peter Curtin <98424367+pmcurtin@users.noreply.github.com> Date: Mon, 30 Sep 2024 15:09:29 -0400 Subject: [PATCH 11/50] Update advanced.rst --- docs/source/advanced.rst | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/source/advanced.rst b/docs/source/advanced.rst index dd8a7179..9d05deed 100644 --- a/docs/source/advanced.rst +++ b/docs/source/advanced.rst @@ -21,19 +21,19 @@ In addition to ``torchrunx.launch``, we provide the ``torchrunx.Launcher`` datac Logging ------- - -Logs are generated at the worker and agent level, and are specified to :mod:`torchrunx.launch` via the ``log_spec`` argument. By default, a :mod:`torchrunx.DefaultLogSpec` is instantiated, causing logs at the worker and agent levels to be logged to files under ``'./logs'``, and the rank 0 worker's output streams are streamed to the launcher ``stdout``. Logs are prefixed with a timestamp by default. Agent logs have the format ``{timestamp}-{agent hostname}.log`` and workers have the format ``{timestamp}-{agent hostname}[{worker local rank}].log``. - -Custom logging classes can be subclassed from the :mod:`torchrunx.LogSpec` class. Any subclass must have a ``get_map`` method returning a dictionary mapping logger names to lists of :mod:`logging.Handler` objects, in order to be passed to :mod:`torchrunx.launch`. The logger names are of the format ``{agent hostname}`` for agents and ``{agent hostname}[{worker local rank}]`` for workers. The :mod:`torchrunx.DefaultLogSpec` maps all the loggers to :mod:`logging.Filehandler` object pointing to the files mentioned in the previous paragraph. It additionally maps the global rank 0 worker to a :mod:`logging.StreamHandler`, which writes logs the launcher's ``stdout`` stream. - -.. autoclass:: torchrunx.LogSpec - :members: - -.. autoclass:: torchrunx.DefaultLogSpec - :members: - -.. - TODO: example log structure +.. + Logs are generated at the worker and agent level, and are specified to :mod:`torchrunx.launch` via the ``log_spec`` argument. 
By default, a :mod:`torchrunx.DefaultLogSpec` is instantiated, causing logs at the worker and agent levels to be logged to files under ``'./logs'``, and the rank 0 worker's output streams are streamed to the launcher ``stdout``. Logs are prefixed with a timestamp by default. Agent logs have the format ``{timestamp}-{agent hostname}.log`` and workers have the format ``{timestamp}-{agent hostname}[{worker local rank}].log``. + + Custom logging classes can be subclassed from the :mod:`torchrunx.LogSpec` class. Any subclass must have a ``get_map`` method returning a dictionary mapping logger names to lists of :mod:`logging.Handler` objects, in order to be passed to :mod:`torchrunx.launch`. The logger names are of the format ``{agent hostname}`` for agents and ``{agent hostname}[{worker local rank}]`` for workers. The :mod:`torchrunx.DefaultLogSpec` maps all the loggers to :mod:`logging.Filehandler` object pointing to the files mentioned in the previous paragraph. It additionally maps the global rank 0 worker to a :mod:`logging.StreamHandler`, which writes logs the launcher's ``stdout`` stream. + + .. autoclass:: torchrunx.LogSpec + :members: + + .. autoclass:: torchrunx.DefaultLogSpec + :members: + + .. + TODO: example log structure Worker environment ------------------ From e3c0665bdafa8ad40e5005c340efa6b811152e3b Mon Sep 17 00:00:00 2001 From: Apoorv Khandelwal Date: Wed, 2 Oct 2024 22:25:45 -0400 Subject: [PATCH 12/50] Update README.md --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 6ba8cdab..12d0a543 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ # torchrunx 🔥 [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/torchrunx)](https://github.com/apoorvkh/torchrunx/blob/main/pyproject.toml) +[![PyTorch Version](https://img.shields.io/badge/torch-%3E%3D2.0-orange)](https://github.com/pytorch/pytorch) [![PyPI - Version](https://img.shields.io/pypi/v/torchrunx)](https://pypi.org/project/torchrunx/) ![Tests](https://img.shields.io/github/actions/workflow/status/apoorvkh/torchrunx/.github%2Fworkflows%2Fmain.yml) [![Docs](https://readthedocs.org/projects/torchrunx/badge/?version=stable)](https://torchrunx.readthedocs.io) @@ -16,9 +17,7 @@ By [Apoorv Khandelwal](http://apoorvkh.com) and [Peter Curtin](https://github.co pip install torchrunx ``` -Requires: Linux, Python >= 3.8.1, PyTorch >= 2.0 - -Shared filesystem & SSH access if using multiple machines +**Requires:** Linux. Shared filesystem & SSH access if using multiple machines. ## Minimal example From c48edafe0731674db71e87c1fe687a1c6fccc730 Mon Sep 17 00:00:00 2001 From: Apoorv Khandelwal Date: Wed, 2 Oct 2024 22:28:04 -0400 Subject: [PATCH 13/50] Update index.rst --- docs/source/index.rst | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 19063776..87c96dee 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,20 +1,2 @@ -Getting Started -=============== - .. include:: ../../README.md :parser: myst_parser.sphinx_ - -Contents --------- - -.. toctree:: - :maxdepth: 2 - - api - advanced - how_it_works - contributing - -.. 
sidebar-links:: - :github: - :pypi: torchrunx \ No newline at end of file From 66933e712af292f889912206cc9ff59193b96f54 Mon Sep 17 00:00:00 2001 From: Apoorv Khandelwal Date: Wed, 2 Oct 2024 22:30:20 -0400 Subject: [PATCH 14/50] Update index.rst --- docs/source/index.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/source/index.rst b/docs/source/index.rst index 87c96dee..25542fc8 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,2 +1,6 @@ .. include:: ../../README.md :parser: myst_parser.sphinx_ + +.. sidebar-links:: + :github: + :pypi: torchrunx From e2536fc07d916374db90f6adb52afb1edfb2122c Mon Sep 17 00:00:00 2001 From: Apoorv Khandelwal Date: Wed, 2 Oct 2024 22:36:15 -0400 Subject: [PATCH 15/50] Update index.rst --- docs/source/index.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/source/index.rst b/docs/source/index.rst index 25542fc8..ca9640a3 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,6 +1,14 @@ .. include:: ../../README.md :parser: myst_parser.sphinx_ +.. toctree:: + :maxdepth: 1 + + api + advanced + how_it_works + contributing + .. sidebar-links:: :github: :pypi: torchrunx From f47b49230bfb03944df42b9f69bbcfd9275bd211 Mon Sep 17 00:00:00 2001 From: Peter Curtin <98424367+pmcurtin@users.noreply.github.com> Date: Fri, 4 Oct 2024 14:12:26 -0400 Subject: [PATCH 16/50] Update requirements.txt --- docs/requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 30373d03..7c43c2f7 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,5 @@ sphinx==6.2.1 furo myst-parser -sphinx-toolbox \ No newline at end of file +sphinx-toolbox +sphinx-autodoc-typehints From f24f936ff6ef4b6ac4ebcd65d42707a592b5e7dd Mon Sep 17 00:00:00 2001 From: Peter Curtin <98424367+pmcurtin@users.noreply.github.com> Date: Fri, 4 Oct 2024 14:12:49 -0400 Subject: [PATCH 17/50] Update conf.py --- docs/source/conf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 2edb7aee..da68aad9 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -20,6 +20,7 @@ 'myst_parser', 'sphinx_toolbox.sidebar_links', 'sphinx_toolbox.github', + "sphinx_autodoc_typehints", ] github_username = 'apoorvkh' @@ -43,4 +44,4 @@ epub_show_urls = 'footnote' # code block syntax highlighting -#pygments_style = 'sphinx' \ No newline at end of file +#pygments_style = 'sphinx' From fec0f40bd35fa5a09bfcba58710bcc8122c3d81c Mon Sep 17 00:00:00 2001 From: Peter Curtin <98424367+pmcurtin@users.noreply.github.com> Date: Fri, 4 Oct 2024 14:23:18 -0400 Subject: [PATCH 18/50] Update conf.py --- docs/source/conf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/conf.py b/docs/source/conf.py index da68aad9..b4a353f7 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -23,6 +23,8 @@ "sphinx_autodoc_typehints", ] +typehints_defaults = 'comma' + github_username = 'apoorvkh' github_repository = 'torchrunx' From fb35066b07b29ceb2be8fb90e206ea3f94d5a05a Mon Sep 17 00:00:00 2001 From: Peter Curtin <98424367+pmcurtin@users.noreply.github.com> Date: Fri, 4 Oct 2024 14:26:39 -0400 Subject: [PATCH 19/50] Update launcher.py --- src/torchrunx/launcher.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/torchrunx/launcher.py b/src/torchrunx/launcher.py index 879dd011..d6ab2b7a 100644 --- a/src/torchrunx/launcher.py +++ b/src/torchrunx/launcher.py @@ -398,6 +398,11 @@ def all(self, by: 
Literal["hostname", "rank"] = "hostname") -> dict[str, list[An raise TypeError(msg) def values(self, hostname: str) -> list[Any]: + """ + Get worker return values for host ``hostname``. + + :param hostname: The host to get return values from + """ host_idx = self.hostnames.index(hostname) return self.return_values[host_idx] From 9ecd9976276196b8029c626b107fd33d27b70868 Mon Sep 17 00:00:00 2001 From: Peter Curtin <98424367+pmcurtin@users.noreply.github.com> Date: Fri, 4 Oct 2024 14:36:46 -0400 Subject: [PATCH 20/50] Update conf.py --- docs/source/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/conf.py b/docs/source/conf.py index b4a353f7..4304cb8f 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -23,6 +23,7 @@ "sphinx_autodoc_typehints", ] +autodoc_typehints = "none" typehints_defaults = 'comma' github_username = 'apoorvkh' From 1085c6365fbe422824ebd0961e46764e2ddc1216 Mon Sep 17 00:00:00 2001 From: Peter Curtin <98424367+pmcurtin@users.noreply.github.com> Date: Fri, 4 Oct 2024 14:38:58 -0400 Subject: [PATCH 21/50] Update conf.py --- docs/source/conf.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 4304cb8f..6ea3a8b2 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -20,11 +20,12 @@ 'myst_parser', 'sphinx_toolbox.sidebar_links', 'sphinx_toolbox.github', - "sphinx_autodoc_typehints", + 'sphinx.ext.autodoc.typehints', + #"sphinx_autodoc_typehints", ] -autodoc_typehints = "none" -typehints_defaults = 'comma' +autodoc_typehints = "both" +#typehints_defaults = 'comma' github_username = 'apoorvkh' github_repository = 'torchrunx' From 74be93c5d146aabfbeb2e8c17cc91639da94c422 Mon Sep 17 00:00:00 2001 From: Peter Curtin <98424367+pmcurtin@users.noreply.github.com> Date: Fri, 4 Oct 2024 14:43:17 -0400 Subject: [PATCH 22/50] remove requirement --- docs/requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 7c43c2f7..06ac352c 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -2,4 +2,3 @@ sphinx==6.2.1 furo myst-parser sphinx-toolbox -sphinx-autodoc-typehints From afd980803efa73f7289a5bc5aa11d7418ae02683 Mon Sep 17 00:00:00 2001 From: Peter Curtin <98424367+pmcurtin@users.noreply.github.com> Date: Fri, 4 Oct 2024 14:50:18 -0400 Subject: [PATCH 23/50] Update launcher.py --- src/torchrunx/launcher.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/torchrunx/launcher.py b/src/torchrunx/launcher.py index d6ab2b7a..2a04d742 100644 --- a/src/torchrunx/launcher.py +++ b/src/torchrunx/launcher.py @@ -389,6 +389,11 @@ def all(self, by: Literal["rank"]) -> list[Any]: pass def all(self, by: Literal["hostname", "rank"] = "hostname") -> dict[str, list[Any]] | list[Any]: + """ + Get all worker return values. + + :param by: ``"hostname"``: Return a dictionary mapping hostname to local worker return values. ``"rank"``: Return a list of return values sorted by global worker rank. + """ if by == "hostname": return dict(zip(self.hostnames, self.return_values)) elif by == "rank": # noqa: RET505 @@ -407,6 +412,11 @@ def values(self, hostname: str) -> list[Any]: return self.return_values[host_idx] def value(self, rank: int) -> Any: + """ + Get worker return value based on global rank. 
+ + :param rank: Global worker rank to get return value from + """ if rank < 0: msg = f"Rank {rank} must be larger than 0" raise ValueError(msg) From 4b8275297fceb39d2e93a8605434ce12041aba34 Mon Sep 17 00:00:00 2001 From: Peter Curtin <98424367+pmcurtin@users.noreply.github.com> Date: Fri, 4 Oct 2024 14:56:53 -0400 Subject: [PATCH 24/50] Update launcher.py --- src/torchrunx/launcher.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/torchrunx/launcher.py b/src/torchrunx/launcher.py index 2a04d742..51144c25 100644 --- a/src/torchrunx/launcher.py +++ b/src/torchrunx/launcher.py @@ -382,18 +382,19 @@ def all(self) -> dict[str, list[Any]]: @overload def all(self, by: Literal["hostname"]) -> dict[str, list[Any]]: + """ + Get all worker return values by hostname. + """ pass @overload def all(self, by: Literal["rank"]) -> list[Any]: + """ + Get all worker return values by rank. + """ pass def all(self, by: Literal["hostname", "rank"] = "hostname") -> dict[str, list[Any]] | list[Any]: - """ - Get all worker return values. - - :param by: ``"hostname"``: Return a dictionary mapping hostname to local worker return values. ``"rank"``: Return a list of return values sorted by global worker rank. - """ if by == "hostname": return dict(zip(self.hostnames, self.return_values)) elif by == "rank": # noqa: RET505 From 7783bc96c0d2ef049a42a48aaa83f5c160c43046 Mon Sep 17 00:00:00 2001 From: Peter Curtin <98424367+pmcurtin@users.noreply.github.com> Date: Fri, 18 Oct 2024 13:07:24 -0400 Subject: [PATCH 25/50] Update api.rst --- docs/source/api.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/source/api.rst b/docs/source/api.rst index 87c494f1..a3f69e6a 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -6,4 +6,7 @@ API .. autofunction:: torchrunx.launch(func: Callable, ...) .. autoclass:: torchrunx.LaunchResult - :members: + LaunchResult.all(by = "hostname") + LaunchResult.all(by = "rank") + LaunchResult.values + LaunchReslt.value From 5139db2fd25b1989741b499153b4d095430fed4c Mon Sep 17 00:00:00 2001 From: Peter Curtin <98424367+pmcurtin@users.noreply.github.com> Date: Fri, 18 Oct 2024 13:12:01 -0400 Subject: [PATCH 26/50] Update api.rst --- docs/source/api.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/api.rst b/docs/source/api.rst index a3f69e6a..ba0692f3 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -6,7 +6,7 @@ API .. autofunction:: torchrunx.launch(func: Callable, ...) .. autoclass:: torchrunx.LaunchResult - LaunchResult.all(by = "hostname") - LaunchResult.all(by = "rank") - LaunchResult.values - LaunchReslt.value + all(by = "hostname") + all(by = "rank") + values + value From fedbd304f1a6bdf507adb1ad05c0cad26f68f6ab Mon Sep 17 00:00:00 2001 From: Peter Curtin <98424367+pmcurtin@users.noreply.github.com> Date: Fri, 18 Oct 2024 13:16:08 -0400 Subject: [PATCH 27/50] Update api.rst --- docs/source/api.rst | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/docs/source/api.rst b/docs/source/api.rst index ba0692f3..87c494f1 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -6,7 +6,4 @@ API .. autofunction:: torchrunx.launch(func: Callable, ...) .. 
autoclass:: torchrunx.LaunchResult - all(by = "hostname") - all(by = "rank") - values - value + :members: From e507b78c627e2dbbf338375ea7a30174887f0da3 Mon Sep 17 00:00:00 2001 From: Peter Curtin <98424367+pmcurtin@users.noreply.github.com> Date: Fri, 18 Oct 2024 13:18:09 -0400 Subject: [PATCH 28/50] Update launcher.py --- src/torchrunx/launcher.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/torchrunx/launcher.py b/src/torchrunx/launcher.py index 51144c25..c9ea86ba 100644 --- a/src/torchrunx/launcher.py +++ b/src/torchrunx/launcher.py @@ -382,19 +382,18 @@ def all(self) -> dict[str, list[Any]]: @overload def all(self, by: Literal["hostname"]) -> dict[str, list[Any]]: - """ - Get all worker return values by hostname. - """ pass @overload def all(self, by: Literal["rank"]) -> list[Any]: - """ - Get all worker return values by rank. - """ pass def all(self, by: Literal["hostname", "rank"] = "hostname") -> dict[str, list[Any]] | list[Any]: + """ + Get all worker return values by rank or hostname. + + :param by: Whether to aggregate all return values by hostname, or just output all of them in order of rank, defaults to "hostname" + """ if by == "hostname": return dict(zip(self.hostnames, self.return_values)) elif by == "rank": # noqa: RET505 From 3d71997e065b9b84afe22813ad125d269945646f Mon Sep 17 00:00:00 2001 From: Peter Curtin <98424367+pmcurtin@users.noreply.github.com> Date: Fri, 18 Oct 2024 13:25:00 -0400 Subject: [PATCH 29/50] try removing launch types --- src/torchrunx/launcher.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/src/torchrunx/launcher.py b/src/torchrunx/launcher.py index c9ea86ba..8ebb2082 100644 --- a/src/torchrunx/launcher.py +++ b/src/torchrunx/launcher.py @@ -186,7 +186,7 @@ def run( # noqa: C901, PLR0912 :type func_args: tuple[Any] | None :param func_kwargs: Any keyword arguments to be provided when calling ``func``, defaults to None :type func_kwargs: dict[str, Any] | None - :raises RuntimeError: May fail due to misconfiguration, or errors thrown by ``func`` + :raises RuntimeError | Exception: May fail due to misconfiguration, or errors thrown by ``func`` :return: A dictionary mapping worker ranks to their output :rtype: LaunchResult """ @@ -334,30 +334,18 @@ def launch( Launch a distributed PyTorch function on the specified nodes. :param func: The distributed function to call on all workers - :type func: Callable :param func_args: Any positional arguments to be provided when calling ``func``, defaults to None - :type func_args: tuple[Any] | None, optional :param func_kwargs: Any keyword arguments to be provided when calling ``func``, defaults to None - :type func_kwargs: dict[str, Any] | None, optional :param hostnames: A list of node hostnames to start workers on, or a string indicating whether to use slurm or automatically decide the hosts, defaults to "auto" - :type hostnames: list[str] | Literal["auto", "slurm"], optional :param workers_per_host: The number of workers per node. Providing an ``int`` implies all nodes should have ``workers_per_host`` workers, meanwhile providing a list causes node ``i`` to have ``worker_per_host[i]`` workers, defaults to auto - :type workers_per_host: int | list[int] | Literal["auto", "slurm"] | None, optional :param ssh_config_file: An SSH configuration file to use when connecting to nodes, defaults to None - :type ssh_config_file: str | os.PathLike | None, optional :param backend: A ``torch.distributed`` `backend string `_. 
If ``None``, doesn't initialize a process group for you, defaults to "auto" - :type backend: Literal['nccl', 'gloo', 'mpi', 'ucc', 'auto'] | None, optional :param log_handlers: A list of handlers to manage agent and worker logs, or ``"auto"`` to use an automatic basic logging scheme, defaults to "auto" - :type log_handlers: list[Handler] | Literal["auto"] | None, optional :param env_vars: A list of environmental variables to be copied from the launcher environment to workers. Allows for bash pattern matching syntax, defaults to ("PATH", "LD_LIBRARY", "LIBRARY_PATH", "PYTHON*", "CUDA*", "TORCH*", "PYTORCH*", "NCCL*") - :type env_vars: Tuple[str], optional :param env_file: An additional environment file that will be sourced prior to executing ``func``, defaults to None - :type env_file: str | os.PathLike | None, optional :param timeout: Worker process group timeout, defaults to 600 - :type timeout: int, optional - :raises RuntimeError: May fail due to misconfiguration, or errors thrown by ``func`` + :raises RuntimeError | Exception: May fail due to misconfiguration, or errors thrown by ``func`` :return: A dictionary mapping worker ranks to their output - :rtype: LaunchResult """ # noqa: E501 return Launcher( hostnames=hostnames, From 662e8999c03f6806ef7ade702887e3f677dab4b5 Mon Sep 17 00:00:00 2001 From: Peter Curtin <98424367+pmcurtin@users.noreply.github.com> Date: Fri, 18 Oct 2024 13:31:46 -0400 Subject: [PATCH 30/50] touch up launch formatting --- src/torchrunx/launcher.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/torchrunx/launcher.py b/src/torchrunx/launcher.py index 8ebb2082..201b8d16 100644 --- a/src/torchrunx/launcher.py +++ b/src/torchrunx/launcher.py @@ -334,16 +334,16 @@ def launch( Launch a distributed PyTorch function on the specified nodes. :param func: The distributed function to call on all workers - :param func_args: Any positional arguments to be provided when calling ``func``, defaults to None - :param func_kwargs: Any keyword arguments to be provided when calling ``func``, defaults to None - :param hostnames: A list of node hostnames to start workers on, or a string indicating whether to use slurm or automatically decide the hosts, defaults to "auto" - :param workers_per_host: The number of workers per node. Providing an ``int`` implies all nodes should have ``workers_per_host`` workers, meanwhile providing a list causes node ``i`` to have ``worker_per_host[i]`` workers, defaults to auto - :param ssh_config_file: An SSH configuration file to use when connecting to nodes, defaults to None - :param backend: A ``torch.distributed`` `backend string `_. If ``None``, doesn't initialize a process group for you, defaults to "auto" - :param log_handlers: A list of handlers to manage agent and worker logs, or ``"auto"`` to use an automatic basic logging scheme, defaults to "auto" - :param env_vars: A list of environmental variables to be copied from the launcher environment to workers. 
Allows for bash pattern matching syntax, defaults to ("PATH", "LD_LIBRARY", "LIBRARY_PATH", "PYTHON*", "CUDA*", "TORCH*", "PYTORCH*", "NCCL*") - :param env_file: An additional environment file that will be sourced prior to executing ``func``, defaults to None - :param timeout: Worker process group timeout, defaults to 600 + :param func_args: Any positional arguments to be provided when calling ``func``, defaults to ``None`` + :param func_kwargs: Any keyword arguments to be provided when calling ``func``, defaults to ``None`` + :param hostnames: A list of node hostnames to start workers on, or a string indicating whether to use slurm or automatically decide the hosts, defaults to ``'auto'`` + :param workers_per_host: The number of workers per node. Providing an ``int`` implies all nodes should have ``workers_per_host`` workers, meanwhile providing a list causes node ``i`` to have ``worker_per_host[i]`` workers, defaults to ``'auto'`` + :param ssh_config_file: An SSH configuration file to use when connecting to nodes, defaults to ``None`` + :param backend: A ``torch.distributed`` `backend string `_. If ``None``, doesn't initialize a process group for you, defaults to ``'auto'`` + :param log_handlers: A list of handlers to manage agent and worker logs, or ``'auto'`` to use an automatic basic logging scheme, defaults to ``'auto'`` + :param env_vars: A list of environmental variables to be copied from the launcher environment to workers. Allows for bash pattern matching syntax, defaults to ``('PATH', 'LD_LIBRARY', 'LIBRARY_PATH', 'PYTHON*', 'CUDA*', 'TORCH*', 'PYTORCH*', 'NCCL*')`` + :param env_file: An additional environment file that will be sourced prior to executing ``func``, defaults to ``None`` + :param timeout: Worker process group timeout, defaults to ``600`` :raises RuntimeError | Exception: May fail due to misconfiguration, or errors thrown by ``func`` :return: A dictionary mapping worker ranks to their output """ # noqa: E501 From 1147a71e415e86cebb81022004587406668491c0 Mon Sep 17 00:00:00 2001 From: Peter Curtin <98424367+pmcurtin@users.noreply.github.com> Date: Fri, 18 Oct 2024 13:34:26 -0400 Subject: [PATCH 31/50] Update launcher.py --- src/torchrunx/launcher.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/torchrunx/launcher.py b/src/torchrunx/launcher.py index 201b8d16..780e6184 100644 --- a/src/torchrunx/launcher.py +++ b/src/torchrunx/launcher.py @@ -181,14 +181,10 @@ def run( # noqa: C901, PLR0912 Launch a distributed PyTorch function on the specified nodes. See :mod:`torchrunx.launch` :param func: The distributed function to call on all workers - :type func: Callable :param func_args: Any positional arguments to be provided when calling ``func``, defaults to None - :type func_args: tuple[Any] | None :param func_kwargs: Any keyword arguments to be provided when calling ``func``, defaults to None - :type func_kwargs: dict[str, Any] | None :raises RuntimeError | Exception: May fail due to misconfiguration, or errors thrown by ``func`` :return: A dictionary mapping worker ranks to their output - :rtype: LaunchResult """ if not dist.is_available(): msg = "The torch.distributed package is not available." @@ -380,7 +376,7 @@ def all(self, by: Literal["hostname", "rank"] = "hostname") -> dict[str, list[An """ Get all worker return values by rank or hostname. 
- :param by: Whether to aggregate all return values by hostname, or just output all of them in order of rank, defaults to "hostname" + :param by: Whether to aggregate all return values by hostname, or just output all of them in order of rank, defaults to ``'hostname'`` """ if by == "hostname": return dict(zip(self.hostnames, self.return_values)) @@ -401,7 +397,7 @@ def values(self, hostname: str) -> list[Any]: def value(self, rank: int) -> Any: """ - Get worker return value based on global rank. + Get worker return value from global rank ``rank``. :param rank: Global worker rank to get return value from """ From bf7965a6b503a8dd6c36627beee55c8161cabee5 Mon Sep 17 00:00:00 2001 From: Peter Curtin <98424367+pmcurtin@users.noreply.github.com> Date: Fri, 18 Oct 2024 13:39:00 -0400 Subject: [PATCH 32/50] touch up example in readme --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 12d0a543..b4bdbaab 100644 --- a/README.md +++ b/README.md @@ -79,7 +79,7 @@ We could also launch multiple functions, with different GPUs: ```python def train_model(model, dataset): - trained_model = train(model, dataset) + trained_model = train(model, train_dataset) if int(os.environ["RANK"]) == 0: torch.save(learned_model, 'model.pt') @@ -96,21 +96,21 @@ def test_model(model_path, test_dataset): ```python import torchrunx as trx -model_path = trx.launch( +learned_model_path = trx.launch( func=train_model, - func_kwargs={'model': my_model, 'training_dataset': mnist_train}, + func_kwargs={'model': my_model, 'train_dataset': mnist_train}, hostnames=["localhost", "other_node"], workers_per_host=2 -)["localhost"][0] # return from rank 0 (first worker on "localhost") +).value(0) # return from rank 0 (first worker on "localhost") accuracy = trx.launch( func=test_model, - func_kwargs={'model': learned_model, 'test_dataset': mnist_test}, + func_kwargs={'model_path': learned_model_path, 'test_dataset': mnist_test}, hostnames=["localhost"], workers_per_host=1 -)["localhost"][0] +).value(0) print(f'Accuracy: {accuracy}') ``` From 2a1329ee089340cbd7e1012ba67bcaea87c23b06 Mon Sep 17 00:00:00 2001 From: Peter Curtin <98424367+pmcurtin@users.noreply.github.com> Date: Fri, 18 Oct 2024 13:42:16 -0400 Subject: [PATCH 33/50] fix first example both examples use the LaunchResult.value functionality now --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index b4bdbaab..119df786 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ Here's a simple example where we distribute `train_model` to two hosts (with 2 G ```python def train_model(model, dataset): - trained_model = train(model, dataset) + trained_model = train(model, train_dataset) if int(os.environ["RANK"]) == 0: torch.save(learned_model, 'model.pt') @@ -37,12 +37,12 @@ def train_model(model, dataset): ```python import torchrunx as trx -model_path = trx.launch( +learned_model_path = trx.launch( func=train_model, - func_kwargs={'model': my_model, 'training_dataset': mnist_train}, + func_kwargs={'model': my_model, 'train_dataset': mnist_train}, hostnames=["localhost", "other_node"], workers_per_host=2 -)["localhost"][0] # return from rank 0 (first worker on "localhost") +).value(0) # return from rank 0 (first worker on "localhost") ``` ## Why should I use this? 
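The README examples above now read results through the `LaunchResult` object rather than indexing a dictionary. Below is a rough sketch of how the accessors added to `launcher.py` in the earlier patches (`all`, `values`, `value`) fit together; the trivial worker function and single-host setup are illustrative assumptions, not part of any patch.

```python
# Illustrative only: a trivial worker function and placeholder hostnames,
# used to show the LaunchResult accessors added in launcher.py.
import os

import torchrunx as trx


def which_worker():
    # RANK / LOCAL_RANK are set in each worker's environment by torchrunx
    return f"global rank {os.environ['RANK']}, local rank {os.environ['LOCAL_RANK']}"


if __name__ == "__main__":
    result = trx.launch(
        func=which_worker,
        hostnames=["localhost"],
        workers_per_host=2,
    )

    result.all(by="hostname")   # {"localhost": [..., ...]}
    result.all(by="rank")       # [..., ...] in global-rank order
    result.values("localhost")  # return values from workers on "localhost"
    result.value(0)             # return value from the global rank-0 worker
```

Using `.value(0)`, as the fixed example above does, avoids hard-coding which hostname the rank-0 worker landed on.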
From 6768ddbb4b9c391a8f2e89b6578e33387a7983d8 Mon Sep 17 00:00:00 2001 From: apoorvkh Date: Fri, 18 Oct 2024 13:44:39 -0400 Subject: [PATCH 34/50] Update launch docs --- pixi.lock | 4 +-- src/torchrunx/__init__.py | 2 +- src/torchrunx/launcher.py | 70 ++++++++++++++------------------------- 3 files changed, 28 insertions(+), 48 deletions(-) diff --git a/pixi.lock b/pixi.lock index e67beb9f..e4fae5ca 100644 --- a/pixi.lock +++ b/pixi.lock @@ -2601,9 +2601,9 @@ packages: requires_python: '>=3.8.0' - kind: pypi name: torchrunx - version: 0.1.4 + version: 0.2.0 path: . - sha256: de986bf47e1c379e4de6b10ca352715d708bb5f9b4cfc8736e9ee592db5fe1ae + sha256: 1753f43bee54bc0da38cdd524dc501c0c2be9fbaaa7036bced9c9d03a7a8e810 requires_dist: - cloudpickle>=3.0.0 - fabric>=3.0.0 diff --git a/src/torchrunx/__init__.py b/src/torchrunx/__init__.py index e9bc8fd1..c5a7d6fd 100644 --- a/src/torchrunx/__init__.py +++ b/src/torchrunx/__init__.py @@ -1,4 +1,4 @@ -from .launcher import Launcher, launch, LaunchResult +from .launcher import Launcher, LaunchResult, launch from .logging_utils import add_filter_to_handler, file_handler, stream_handler __all__ = [ diff --git a/src/torchrunx/launcher.py b/src/torchrunx/launcher.py index 879dd011..8f2d55da 100644 --- a/src/torchrunx/launcher.py +++ b/src/torchrunx/launcher.py @@ -157,8 +157,9 @@ class Launcher: workers_per_host: int | list[int] | Literal["auto", "slurm"] = "auto" ssh_config_file: str | os.PathLike | None = None backend: Literal["nccl", "gloo", "mpi", "ucc", "auto"] | None = "auto" + timeout: int = 600 log_handlers: list[Handler] | Literal["auto"] | None = "auto" - env_vars: tuple[str] = ( # pyright: ignore [reportAssignmentType] + default_env_vars: tuple[str] = ( # pyright: ignore [reportAssignmentType] "PATH", "LD_LIBRARY", "LIBRARY_PATH", @@ -168,8 +169,8 @@ class Launcher: "PYTORCH*", "NCCL*", ) + extra_env_vars: tuple[str] = () env_file: str | os.PathLike | None = None - timeout: int = 600 def run( # noqa: C901, PLR0912 self, @@ -177,19 +178,6 @@ def run( # noqa: C901, PLR0912 func_args: tuple[Any] | None = None, func_kwargs: dict[str, Any] | None = None, ) -> LaunchResult: - """ - Launch a distributed PyTorch function on the specified nodes. See :mod:`torchrunx.launch` - - :param func: The distributed function to call on all workers - :type func: Callable - :param func_args: Any positional arguments to be provided when calling ``func``, defaults to None - :type func_args: tuple[Any] | None - :param func_kwargs: Any keyword arguments to be provided when calling ``func``, defaults to None - :type func_kwargs: dict[str, Any] | None - :raises RuntimeError: May fail due to misconfiguration, or errors thrown by ``func`` - :return: A dictionary mapping worker ranks to their output - :rtype: LaunchResult - """ if not dist.is_available(): msg = "The torch.distributed package is not available." 
raise RuntimeError(msg) @@ -235,7 +223,7 @@ def run( # noqa: C901, PLR0912 logger_port=log_receiver.port, world_size=world_size, rank=i + 1, - env_vars=self.env_vars, + env_vars=(self.default_env_vars + self.extra_env_vars), env_file=self.env_file, ), hostname=hostname, @@ -316,8 +304,9 @@ def launch( workers_per_host: int | list[int] | Literal["auto", "slurm"] = "auto", ssh_config_file: str | os.PathLike | None = None, backend: Literal["nccl", "gloo", "mpi", "ucc", "auto"] | None = "auto", + timeout: int = 600, log_handlers: list[Handler] | Literal["auto"] | None = "auto", - env_vars: tuple[str] = ( # pyright: ignore [reportArgumentType] + default_env_vars: tuple[str] = ( # pyright: ignore [reportAssignmentType] "PATH", "LD_LIBRARY", "LIBRARY_PATH", @@ -327,47 +316,38 @@ def launch( "PYTORCH*", "NCCL*", ), + extra_env_vars: tuple[str] = (), env_file: str | os.PathLike | None = None, - timeout: int = 600, ) -> LaunchResult: """ Launch a distributed PyTorch function on the specified nodes. - :param func: The distributed function to call on all workers - :type func: Callable - :param func_args: Any positional arguments to be provided when calling ``func``, defaults to None - :type func_args: tuple[Any] | None, optional - :param func_kwargs: Any keyword arguments to be provided when calling ``func``, defaults to None - :type func_kwargs: dict[str, Any] | None, optional - :param hostnames: A list of node hostnames to start workers on, or a string indicating whether to use slurm or automatically decide the hosts, defaults to "auto" - :type hostnames: list[str] | Literal["auto", "slurm"], optional - :param workers_per_host: The number of workers per node. Providing an ``int`` implies all nodes should have ``workers_per_host`` workers, meanwhile providing a list causes node ``i`` to have ``worker_per_host[i]`` workers, defaults to auto - :type workers_per_host: int | list[int] | Literal["auto", "slurm"] | None, optional - :param ssh_config_file: An SSH configuration file to use when connecting to nodes, defaults to None - :type ssh_config_file: str | os.PathLike | None, optional - :param backend: A ``torch.distributed`` `backend string `_. If ``None``, doesn't initialize a process group for you, defaults to "auto" - :type backend: Literal['nccl', 'gloo', 'mpi', 'ucc', 'auto'] | None, optional - :param log_handlers: A list of handlers to manage agent and worker logs, or ``"auto"`` to use an automatic basic logging scheme, defaults to "auto" - :type log_handlers: list[Handler] | Literal["auto"] | None, optional - :param env_vars: A list of environmental variables to be copied from the launcher environment to workers. Allows for bash pattern matching syntax, defaults to ("PATH", "LD_LIBRARY", "LIBRARY_PATH", "PYTHON*", "CUDA*", "TORCH*", "PYTORCH*", "NCCL*") - :type env_vars: Tuple[str], optional - :param env_file: An additional environment file that will be sourced prior to executing ``func``, defaults to None - :type env_file: str | os.PathLike | None, optional - :param timeout: Worker process group timeout, defaults to 600 - :type timeout: int, optional - :raises RuntimeError: May fail due to misconfiguration, or errors thrown by ``func`` - :return: A dictionary mapping worker ranks to their output - :rtype: LaunchResult + :param func: + :param func_args: + :param func_kwargs: + :param hostnames: Nodes to launch the function on. Default (`"auto"`) infers from a SLURM environment or runs on localhost. + :param workers_per_host: Number of processes to run per node. 
Can define per node with :type:`list[int]`. + :param ssh_config_file: For connecting to nodes, defaults to `~/.ssh/config` or `/etc/ssh/ssh_config`. + :param backend: A `torch.distributed.Backend `_ string. Default uses NCCL if GPUs available, else GLOO. `None` does not initialize a process group. + :param timeout: Worker process group timeout (seconds). + :param log_handlers: A list of handlers to manage agent and worker logs. Default (`"auto"`) uses an automatic basic logging scheme. + :param default_env_vars: A list of environmental variables to be copied from the launcher process to workers. Allows for bash pattern matching syntax. + :param extra_env_vars: Additional, user-specified variables to copy. + :param env_file: A file (like `.env`) with additional environment variables to copy. + :raises RuntimeError: May fail if `torch.distributed` not available or communication timeout between nodes + :raises Exception: Propagates exceptions raised in worker processes + :return: Objects returned from every worker """ # noqa: E501 return Launcher( hostnames=hostnames, workers_per_host=workers_per_host, ssh_config_file=ssh_config_file, backend=backend, + timeout=timeout, log_handlers=log_handlers, - env_vars=env_vars, + default_env_vars=default_env_vars, + extra_env_vars=extra_env_vars, env_file=env_file, - timeout=timeout, ).run(func=func, func_args=func_args, func_kwargs=func_kwargs) From 1fd6ddbbaba7abd4fd3a76afebcb651938b64386 Mon Sep 17 00:00:00 2001 From: Peter Curtin <98424367+pmcurtin@users.noreply.github.com> Date: Fri, 18 Oct 2024 13:46:02 -0400 Subject: [PATCH 35/50] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 119df786..d8d8d126 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ Why not? ## More complicated example -We could also launch multiple functions, with different GPUs: +We could also launch multiple functions, on different nodes: ```python def train_model(model, dataset): @@ -99,9 +99,9 @@ import torchrunx as trx learned_model_path = trx.launch( func=train_model, func_kwargs={'model': my_model, 'train_dataset': mnist_train}, - hostnames=["localhost", "other_node"], + hostnames=["beefy-node"], workers_per_host=2 -).value(0) # return from rank 0 (first worker on "localhost") +).value(0) # return from rank 0 (first worker on "beefy-node") From 6357e85b7af57cbc0b6e3402732cef679b23d62d Mon Sep 17 00:00:00 2001 From: Peter Curtin <98424367+pmcurtin@users.noreply.github.com> Date: Fri, 18 Oct 2024 14:00:22 -0400 Subject: [PATCH 36/50] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d8d8d126..67d43ddc 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ pip install torchrunx Here's a simple example where we distribute `train_model` to two hosts (with 2 GPUs each): ```python -def train_model(model, dataset): +def train_model(model, train_dataset): trained_model = train(model, train_dataset) if int(os.environ["RANK"]) == 0: @@ -78,7 +78,7 @@ Why not? 
We could also launch multiple functions, on different nodes: ```python -def train_model(model, dataset): +def train_model(model, train_dataset): trained_model = train(model, train_dataset) if int(os.environ["RANK"]) == 0: From fc6e3cb3aa58293d64fd54ff9fe0e432d663a17e Mon Sep 17 00:00:00 2001 From: apoorvkh Date: Fri, 18 Oct 2024 14:00:29 -0400 Subject: [PATCH 37/50] fixing quotes --- src/torchrunx/launcher.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/torchrunx/launcher.py b/src/torchrunx/launcher.py index 7c430a48..c3e8ca1d 100644 --- a/src/torchrunx/launcher.py +++ b/src/torchrunx/launcher.py @@ -330,14 +330,14 @@ def launch( :param func_kwargs: :param hostnames: Nodes to launch the function on. Default infers from a SLURM environment or runs on localhost. :param workers_per_host: Number of processes to run per node. Can define per node with :type:`list[int]`. - :param ssh_config_file: An SSH configuration file for connecting to nodes, by default loads `~/.ssh/config` or `/etc/ssh/ssh_config`. - :param backend: `Backend `_ to initialize worker process group with. Default uses NCCL (if GPUs available) or GLOO. Disabled by `None`. + :param ssh_config_file: An SSH configuration file for connecting to nodes, by default loads ``~/.ssh/config`` or ``/etc/ssh/ssh_config``. + :param backend: `Backend `_ to initialize worker process group with. Default uses NCCL (if GPUs available) or GLOO. Disabled by ``None``. :param timeout: Worker process group timeout (seconds). :param log_handlers: A list of handlers to manage agent and worker logs. Default uses an automatic basic logging scheme. :param default_env_vars: A list of environmental variables to be copied from the launcher process to workers. Allows for bash pattern matching syntax. :param extra_env_vars: Additional, user-specified variables to copy. - :param env_file: A file (like `.env`) with additional environment variables to copy. - :raises RuntimeError: May fail if `torch.distributed` not available or communication timeout between nodes + :param env_file: A file (like ``.env``) with additional environment variables to copy. + :raises RuntimeError: May fail if ``torch.distributed`` not available or communication timeout between nodes :raises Exception: Propagates exceptions raised in worker processes :return: Objects returned from every worker """ # noqa: E501 From fe57ba255c04f889da0bb25b159b6a1bfbbe7d24 Mon Sep 17 00:00:00 2001 From: apoorvkh Date: Fri, 18 Oct 2024 14:03:39 -0400 Subject: [PATCH 38/50] remove return type --- src/torchrunx/launcher.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/torchrunx/launcher.py b/src/torchrunx/launcher.py index c3e8ca1d..748f92bf 100644 --- a/src/torchrunx/launcher.py +++ b/src/torchrunx/launcher.py @@ -339,7 +339,6 @@ def launch( :param env_file: A file (like ``.env``) with additional environment variables to copy. 
:raises RuntimeError: May fail if ``torch.distributed`` not available or communication timeout between nodes :raises Exception: Propagates exceptions raised in worker processes - :return: Objects returned from every worker """ # noqa: E501 return Launcher( hostnames=hostnames, From 67827189e68eaf91ef471e397fc70d6e68fb1814 Mon Sep 17 00:00:00 2001 From: apoorvkh Date: Fri, 18 Oct 2024 14:13:31 -0400 Subject: [PATCH 39/50] moved complicated example to advanced --- README.md | 42 --------------------------------------- docs/source/advanced.rst | 43 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index 67d43ddc..febe4738 100644 --- a/README.md +++ b/README.md @@ -72,45 +72,3 @@ Features: Why not? - We don't support fault tolerance via torch elastic. Probably only useful if you are using 1000 GPUs. Maybe someone can make a PR. - -## More complicated example - -We could also launch multiple functions, on different nodes: - -```python -def train_model(model, train_dataset): - trained_model = train(model, train_dataset) - - if int(os.environ["RANK"]) == 0: - torch.save(learned_model, 'model.pt') - return 'model.pt' - - return None - -def test_model(model_path, test_dataset): - model = torch.load(model_path) - accuracy = inference(model, test_dataset) - return accuracy -``` - -```python -import torchrunx as trx - -learned_model_path = trx.launch( - func=train_model, - func_kwargs={'model': my_model, 'train_dataset': mnist_train}, - hostnames=["beefy-node"], - workers_per_host=2 -).value(0) # return from rank 0 (first worker on "beefy-node") - - - -accuracy = trx.launch( - func=test_model, - func_kwargs={'model_path': learned_model_path, 'test_dataset': mnist_test}, - hostnames=["localhost"], - workers_per_host=1 -).value(0) - -print(f'Accuracy: {accuracy}') -``` diff --git a/docs/source/advanced.rst b/docs/source/advanced.rst index 43780fe4..b33b1bc3 100644 --- a/docs/source/advanced.rst +++ b/docs/source/advanced.rst @@ -1,6 +1,49 @@ Advanced Usage ============== +Multiple functions in one script +-------------------------------- + +We could also launch multiple functions, on different nodes: + +.. code-block:: python + + def train_model(model, train_dataset): + trained_model = train(model, train_dataset) + + if int(os.environ["RANK"]) == 0: + torch.save(learned_model, 'model.pt') + return 'model.pt' + + return None + + def test_model(model_path, test_dataset): + model = torch.load(model_path) + accuracy = inference(model, test_dataset) + return accuracy + +.. 
code-block:: python + + import torchrunx as trx + + learned_model_path = trx.launch( + func=train_model, + func_kwargs={'model': my_model, 'train_dataset': mnist_train}, + hostnames=["beefy-node"], + workers_per_host=2 + ).value(0) # return from rank 0 + + accuracy = trx.launch( + func=test_model, + func_kwargs={'model_path': learned_model_path, 'test_dataset': mnist_test}, + hostnames=["localhost"], + workers_per_host=1 + ).value(0) + + print(f'Accuracy: {accuracy}') + + + Environment Detection --------------------- From f1d2ec2bfff53fc88cd08438e896d718b5f85b3b Mon Sep 17 00:00:00 2001 From: apoorvkh Date: Fri, 18 Oct 2024 14:52:14 -0400 Subject: [PATCH 40/50] test --- src/torchrunx/launcher.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/torchrunx/launcher.py b/src/torchrunx/launcher.py index 748f92bf..c0f3b7dd 100644 --- a/src/torchrunx/launcher.py +++ b/src/torchrunx/launcher.py @@ -171,9 +171,6 @@ class Launcher: ) extra_env_vars: tuple[str] = () env_file: str | os.PathLike | None = None - """ - Alias class for :mod:`torchrunx.launch` - """ def run( # noqa: C901, PLR0912 self, @@ -339,6 +336,9 @@ def launch( :param env_file: A file (like ``.env``) with additional environment variables to copy. :raises RuntimeError: May fail if ``torch.distributed`` not available or communication timeout between nodes :raises Exception: Propagates exceptions raised in worker processes + + :ivar param1: The first parameter. + :vartype param1: int """ # noqa: E501 return Launcher( hostnames=hostnames, From dee60b37f28962bf05e1bbe78bdc6a69a7a388fc Mon Sep 17 00:00:00 2001 From: apoorvkh Date: Sat, 19 Oct 2024 13:48:27 -0400 Subject: [PATCH 41/50] update readme example --- README.md | 75 ++++++++++++++++++++++----------------- src/torchrunx/agent.py | 2 +- src/torchrunx/launcher.py | 35 +++++++++--------- 3 files changed, 59 insertions(+), 53 deletions(-) diff --git a/README.md b/README.md index febe4738..63c5230a 100644 --- a/README.md +++ b/README.md @@ -19,43 +19,14 @@ pip install torchrunx **Requires:** Linux. Shared filesystem & SSH access if using multiple machines. -## Minimal example - -Here's a simple example where we distribute `train_model` to two hosts (with 2 GPUs each): - -```python -def train_model(model, train_dataset): - trained_model = train(model, train_dataset) - - if int(os.environ["RANK"]) == 0: - torch.save(learned_model, 'model.pt') - return 'model.pt' - - return None -``` - -```python -import torchrunx as trx - -learned_model_path = trx.launch( - func=train_model, - func_kwargs={'model': my_model, 'train_dataset': mnist_train}, - hostnames=["localhost", "other_node"], - workers_per_host=2 -).value(0) # return from rank 0 (first worker on "localhost") -``` - ## Why should I use this? -[`torchrun`](https://pytorch.org/docs/stable/elastic/run.html) is a hammer. `torchrunx` is a chisel. 
- Whether you have 1 GPU, 8 GPUs, or 8 machines: Convenience: - If you don't want to set up [`dist.init_process_group`](https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group) yourself -- If you want to run `python myscript.py` instead of `torchrun myscript.py` -- If you don't want to manually SSH and run `torchrun --master-ip --master-port ...` on every machine (and if you don't want to babysit these machines for hanging failures) +- If you don't want to manually SSH into every machine (and `torchrun --master-ip --master-port ...` and babysit hanging failures) Robustness: @@ -66,9 +37,47 @@ Robustness: Features: - Our launch utility is super _Pythonic_ -- If you want to run distributed PyTorch functions from Python Notebooks. + - Return objects from your distributed functions + - Run `python script.py` instead of `torchrun script.py` + - Launch functions from Python Notebooks +- Fine-grained control over logging, environment variables, exception handling, etc. - Automatic integration with SLURM -Why not? +## Minimal example + +Here's a simple example where we "train" a model on two nodes (with 2 GPUs each). You can also use `transformers.Trainer` (or similar) which handles all the multi-GPU (DDP) code for you. + +```python +import os +import torch + +def train(): + rank = int(os.environ['RANK']) + local_rank = int(os.environ['LOCAL_RANK']) + + model = torch.nn.Linear(10, 10).to(local_rank) + ddp_model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank]) + + optimizer = torch.optim.AdamW(ddp_model.parameters()) + optimizer.zero_grad() + outputs = ddp_model(torch.randn(5, 10)) + labels = torch.randn(5, 10).to(local_rank) + torch.nn.functional.mse_loss(outputs, labels).backward() + optimizer.step() -- We don't support fault tolerance via torch elastic. Probably only useful if you are using 1000 GPUs. Maybe someone can make a PR. + if rank == 0: + return model +``` + +```python +import torchrunx as trx + +if __name__ == "__main__": + trained_model = trx.launch( + func=train, + hostnames=["localhost", "other_node"], + workers_per_host=2 + ).value(rank=0) + + torch.save(trained_model.state_dict(), "model.pth") +``` diff --git a/src/torchrunx/agent.py b/src/torchrunx/agent.py index 789af155..1860e444 100644 --- a/src/torchrunx/agent.py +++ b/src/torchrunx/agent.py @@ -73,7 +73,7 @@ def entrypoint(serialized_worker_args: SerializedWorkerArgs) -> Any | WorkerExce os.environ["WORLD_SIZE"] = str(worker_args.world_size) os.environ["MASTER_ADDR"] = worker_args.main_agent_hostname os.environ["MASTER_PORT"] = str(worker_args.main_agent_port) - + if worker_args.backend is not None: backend = worker_args.backend if backend == "auto": diff --git a/src/torchrunx/launcher.py b/src/torchrunx/launcher.py index c0f3b7dd..5eba3403 100644 --- a/src/torchrunx/launcher.py +++ b/src/torchrunx/launcher.py @@ -320,25 +320,22 @@ def launch( env_file: str | os.PathLike | None = None, ) -> LaunchResult: """ - Launch a distributed PyTorch function on the specified nodes. - - :param func: - :param func_args: - :param func_kwargs: - :param hostnames: Nodes to launch the function on. Default infers from a SLURM environment or runs on localhost. - :param workers_per_host: Number of processes to run per node. Can define per node with :type:`list[int]`. - :param ssh_config_file: An SSH configuration file for connecting to nodes, by default loads ``~/.ssh/config`` or ``/etc/ssh/ssh_config``. - :param backend: `Backend `_ to initialize worker process group with. 
Default uses NCCL (if GPUs available) or GLOO. Disabled by ``None``. - :param timeout: Worker process group timeout (seconds). - :param log_handlers: A list of handlers to manage agent and worker logs. Default uses an automatic basic logging scheme. - :param default_env_vars: A list of environmental variables to be copied from the launcher process to workers. Allows for bash pattern matching syntax. - :param extra_env_vars: Additional, user-specified variables to copy. - :param env_file: A file (like ``.env``) with additional environment variables to copy. - :raises RuntimeError: May fail if ``torch.distributed`` not available or communication timeout between nodes - :raises Exception: Propagates exceptions raised in worker processes - - :ivar param1: The first parameter. - :vartype param1: int + Launch a distributed PyTorch function on the specified nodes. + + :param func: + :param func_args: + :param func_kwargs: + :param hostnames: Nodes to launch the function on. Default infers from a SLURM environment or runs on localhost. + :param workers_per_host: Number of processes to run per node. Can define per node with :type:`list[int]`. + :param ssh_config_file: An SSH configuration file for connecting to nodes, by default loads ``~/.ssh/config`` or ``/etc/ssh/ssh_config``. + :param backend: `Backend `_ to initialize worker process group with. Default uses NCCL (if GPUs available) or GLOO. Disabled by ``None``. + :param timeout: Worker process group timeout (seconds). + :param log_handlers: A list of handlers to manage agent and worker logs. Default uses an automatic basic logging scheme. + :param default_env_vars: A list of environmental variables to be copied from the launcher process to workers. Allows for bash pattern matching syntax. + :param extra_env_vars: Additional, user-specified variables to copy. + :param env_file: A file (like ``.env``) with additional environment variables to copy. + :raises RuntimeError: May fail if ``torch.distributed`` not available or communication timeout between nodes + :raises Exception: Propagates exceptions raised in worker processes """ # noqa: E501 return Launcher( hostnames=hostnames, From 3a55e793b81db277665ffaf8fa25dd5d5d9077a9 Mon Sep 17 00:00:00 2001 From: apoorvkh Date: Sat, 19 Oct 2024 13:58:56 -0400 Subject: [PATCH 42/50] update readme --- src/torchrunx/launcher.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/torchrunx/launcher.py b/src/torchrunx/launcher.py index 5eba3403..4203b4f3 100644 --- a/src/torchrunx/launcher.py +++ b/src/torchrunx/launcher.py @@ -82,7 +82,7 @@ def build_launch_command( logger_port: int, world_size: int, rank: int, - env_vars: list[str] | tuple[str], + env_vars: tuple[str, ...], env_file: str | os.PathLike | None, ) -> str: # shlex.quote prevents shell injection here (resolves S602 in execute_command) @@ -159,7 +159,7 @@ class Launcher: backend: Literal["nccl", "gloo", "mpi", "ucc", "auto"] | None = "auto" timeout: int = 600 log_handlers: list[Handler] | Literal["auto"] | None = "auto" - default_env_vars: tuple[str] = ( # pyright: ignore [reportAssignmentType] + default_env_vars: tuple[str, ...] = ( "PATH", "LD_LIBRARY", "LIBRARY_PATH", @@ -169,7 +169,7 @@ class Launcher: "PYTORCH*", "NCCL*", ) - extra_env_vars: tuple[str] = () + extra_env_vars: tuple[str, ...] 
= () env_file: str | os.PathLike | None = None def run( # noqa: C901, PLR0912 @@ -306,7 +306,7 @@ def launch( backend: Literal["nccl", "gloo", "mpi", "ucc", "auto"] | None = "auto", timeout: int = 600, log_handlers: list[Handler] | Literal["auto"] | None = "auto", - default_env_vars: tuple[str] = ( # pyright: ignore [reportAssignmentType] + default_env_vars: tuple[str, ...] = ( "PATH", "LD_LIBRARY", "LIBRARY_PATH", @@ -316,7 +316,7 @@ def launch( "PYTORCH*", "NCCL*", ), - extra_env_vars: tuple[str] = (), + extra_env_vars: tuple[str, ...] = (), env_file: str | os.PathLike | None = None, ) -> LaunchResult: """ @@ -371,7 +371,8 @@ def all(self, by: Literal["hostname", "rank"] = "hostname") -> dict[str, list[An """ Get all worker return values by rank or hostname. - :param by: Whether to aggregate all return values by hostname, or just output all of them in order of rank, defaults to ``'hostname'`` + :param by: Whether to aggregate all return values by hostname, or just output all of them \ + in order of rank, defaults to ``'hostname'`` """ if by == "hostname": return dict(zip(self.hostnames, self.return_values)) From c0ea3551d522afa80f554f0fd77412ca674e5a39 Mon Sep 17 00:00:00 2001 From: apoorvkh Date: Sat, 19 Oct 2024 14:06:33 -0400 Subject: [PATCH 43/50] update readme --- README.md | 85 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 46 insertions(+), 39 deletions(-) diff --git a/README.md b/README.md index 63c5230a..5a2ce938 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,52 @@ pip install torchrunx **Requires:** Linux. Shared filesystem & SSH access if using multiple machines. +## Minimal example + +Here's a simple example where we "train" a model on two nodes (with 2 GPUs each). You can also use `transformers.Trainer` (or similar) which handles all the multi-GPU (DDP) code for you. + +
+ Training code + + ```python + import os + import torch + + def train(): + rank = int(os.environ['RANK']) + local_rank = int(os.environ['LOCAL_RANK']) + + model = torch.nn.Linear(10, 10).to(local_rank) + ddp_model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank]) + + optimizer = torch.optim.AdamW(ddp_model.parameters()) + optimizer.zero_grad() + outputs = ddp_model(torch.randn(5, 10)) + labels = torch.randn(5, 10).to(local_rank) + torch.nn.functional.mse_loss(outputs, labels).backward() + optimizer.step() + + if rank == 0: + return model + ``` +
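+
+  The only `torchrunx`-specific part of the code above is reading the environment variables that the launcher sets for each worker process. A minimal sketch of what a worker can assume (based on this example and `agent.py`; treat the exact variable list as an assumption):
+
+  ```python
+  import os
+
+  # Set for each worker before `func` is called (see `agent.py`).
+  rank = int(os.environ["RANK"])              # global rank of this worker
+  local_rank = int(os.environ["LOCAL_RANK"])  # rank within this node (e.g. GPU index)
+  world_size = int(os.environ["WORLD_SIZE"])  # total number of workers across all nodes
+  main_addr = os.environ["MASTER_ADDR"]       # main agent hostname
+  main_port = int(os.environ["MASTER_PORT"])  # main agent port
+  ```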
+ + +```python +import torchrunx as trx + +if __name__ == "__main__": + trained_model = trx.launch( + func=train, + hostnames=["localhost", "other_node"], + workers_per_host=2 + ).value(rank=0) + + torch.save(trained_model.state_dict(), "model.pth") +``` + +## [Advanced Usage](https://torchrunx.readthedocs.io/stable/advanced.html) + ## Why should I use this? Whether you have 1 GPU, 8 GPUs, or 8 machines: @@ -42,42 +88,3 @@ Features: - Launch functions from Python Notebooks - Fine-grained control over logging, environment variables, exception handling, etc. - Automatic integration with SLURM - -## Minimal example - -Here's a simple example where we "train" a model on two nodes (with 2 GPUs each). You can also use `transformers.Trainer` (or similar) which handles all the multi-GPU (DDP) code for you. - -```python -import os -import torch - -def train(): - rank = int(os.environ['RANK']) - local_rank = int(os.environ['LOCAL_RANK']) - - model = torch.nn.Linear(10, 10).to(local_rank) - ddp_model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank]) - - optimizer = torch.optim.AdamW(ddp_model.parameters()) - optimizer.zero_grad() - outputs = ddp_model(torch.randn(5, 10)) - labels = torch.randn(5, 10).to(local_rank) - torch.nn.functional.mse_loss(outputs, labels).backward() - optimizer.step() - - if rank == 0: - return model -``` - -```python -import torchrunx as trx - -if __name__ == "__main__": - trained_model = trx.launch( - func=train, - hostnames=["localhost", "other_node"], - workers_per_host=2 - ).value(rank=0) - - torch.save(trained_model.state_dict(), "model.pth") -``` From 9332adf869012b500bf99fea56721977dbed12bd Mon Sep 17 00:00:00 2001 From: apoorvkh Date: Sat, 19 Oct 2024 14:45:26 -0400 Subject: [PATCH 44/50] readme updates --- README.md | 44 ++++++++++++++++++++++--------------------- docs/source/index.rst | 1 + 2 files changed, 24 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 5a2ce938..b1cdddb9 100644 --- a/README.md +++ b/README.md @@ -17,15 +17,17 @@ By [Apoorv Khandelwal](http://apoorvkh.com) and [Peter Curtin](https://github.co pip install torchrunx ``` -**Requires:** Linux. Shared filesystem & SSH access if using multiple machines. +**Requires:** Linux (with shared filesystem & SSH access if using multiple machines) -## Minimal example +## Demo -Here's a simple example where we "train" a model on two nodes (with 2 GPUs each). You can also use `transformers.Trainer` (or similar) which handles all the multi-GPU (DDP) code for you. +Here's a simple example where we "train" a model on two nodes (with 2 GPUs each).
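+
+`launch()` returns a `LaunchResult`, so you can pull out per-worker return values. A small sketch (assuming the `value()` and `all()` helpers shown in `launcher.py`; `train` is the function from the collapsed block below):
+
+```python
+import torchrunx as trx
+
+if __name__ == "__main__":
+    result = trx.launch(
+        func=train,
+        hostnames=["localhost", "other_node"],
+        workers_per_host=2,
+    )
+
+    model = result.value(rank=0)         # return value of the global rank-0 worker
+    by_host = result.all(by="hostname")  # {hostname: [return values, in local-rank order]}
+```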
Training code + You could also use `transformers.Trainer` (or similar) below to automatically handle all the multi-GPU / DDP code. + ```python import os import torch @@ -36,8 +38,8 @@ Here's a simple example where we "train" a model on two nodes (with 2 GPUs each) model = torch.nn.Linear(10, 10).to(local_rank) ddp_model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank]) - optimizer = torch.optim.AdamW(ddp_model.parameters()) + optimizer.zero_grad() outputs = ddp_model(torch.randn(5, 10)) labels = torch.randn(5, 10).to(local_rank) @@ -57,34 +59,34 @@ if __name__ == "__main__": trained_model = trx.launch( func=train, hostnames=["localhost", "other_node"], - workers_per_host=2 - ).value(rank=0) + workers_per_host=2 # num. GPUs + ).value(rank=0) # get returned object torch.save(trained_model.state_dict(), "model.pth") ``` -## [Advanced Usage](https://torchrunx.readthedocs.io/stable/advanced.html) +### [Full API](https://torchrunx.readthedocs.io/stable/api.html) +### [Advanced Usage](https://torchrunx.readthedocs.io/stable/advanced.html) ## Why should I use this? -Whether you have 1 GPU, 8 GPUs, or 8 machines: +Whether you have 1 GPU, 8 GPUs, or 8 machines. -Convenience: - -- If you don't want to set up [`dist.init_process_group`](https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group) yourself -- If you don't want to manually SSH into every machine (and `torchrun --master-ip --master-port ...` and babysit hanging failures) +- Our `launch()` utility is super _Pythonic_ + - Return objects from your workers + - Run `python script.py` instead of `torchrun script.py` + - Launch multi-node functions, even from Python Notebooks +- Fine-grained control over logging, environment variables, exception handling, etc. +- Automatic integration with SLURM Robustness: -- If you want to run a complex, _modular_ workflow in one script +- If you want to run a complex, _modular_ workflow in __one__ script + - don't parallelize your entire script: just the functions you want! - no worries about memory leaks or OS failures - - don't parallelize your entire script: just the functions you want -Features: +Convenience: -- Our launch utility is super _Pythonic_ - - Return objects from your distributed functions - - Run `python script.py` instead of `torchrun script.py` - - Launch functions from Python Notebooks -- Fine-grained control over logging, environment variables, exception handling, etc. -- Automatic integration with SLURM +- If you don't want to: + - set up [`dist.init_process_group`](https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group) yourself + - manually SSH into every machine and `torchrun --master-ip --master-port ...`, babysit failed processes, etc. diff --git a/docs/source/index.rst b/docs/source/index.rst index ca9640a3..55900595 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -2,6 +2,7 @@ :parser: myst_parser.sphinx_ .. toctree:: + :hidden: :maxdepth: 1 api From 82f576f6885d4aae16dbe70e23f5b14f0f0e44ad Mon Sep 17 00:00:00 2001 From: apoorvkh Date: Sat, 19 Oct 2024 15:31:53 -0400 Subject: [PATCH 45/50] test --- README.md | 6 +++--- docs/source/api.rst | 2 -- src/torchrunx/launcher.py | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index b1cdddb9..7c27c932 100644 --- a/README.md +++ b/README.md @@ -26,8 +26,6 @@ Here's a simple example where we "train" a model on two nodes (with 2 GPUs each)
Training code - You could also use `transformers.Trainer` (or similar) below to automatically handle all the multi-GPU / DDP code. - ```python import os import torch @@ -49,6 +47,8 @@ Here's a simple example where we "train" a model on two nodes (with 2 GPUs each) if rank == 0: return model ``` + + You could also use `transformers.Trainer` (or similar) to automatically handle all the multi-GPU / DDP code above.
@@ -72,7 +72,7 @@ if __name__ == "__main__": Whether you have 1 GPU, 8 GPUs, or 8 machines. -- Our `launch()` utility is super _Pythonic_ +- Our [`launch()`](https://torchrunx.readthedocs.io/stable/api.html#torchrunx.launch) utility is super _Pythonic_ - Return objects from your workers - Run `python script.py` instead of `torchrun script.py` - Launch multi-node functions, even from Python Notebooks diff --git a/docs/source/api.rst b/docs/source/api.rst index 87c494f1..608f5b34 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -1,8 +1,6 @@ API ============= -.. TODO: examples - .. autofunction:: torchrunx.launch(func: Callable, ...) .. autoclass:: torchrunx.LaunchResult diff --git a/src/torchrunx/launcher.py b/src/torchrunx/launcher.py index 4203b4f3..49167820 100644 --- a/src/torchrunx/launcher.py +++ b/src/torchrunx/launcher.py @@ -333,7 +333,7 @@ def launch( :param log_handlers: A list of handlers to manage agent and worker logs. Default uses an automatic basic logging scheme. :param default_env_vars: A list of environmental variables to be copied from the launcher process to workers. Allows for bash pattern matching syntax. :param extra_env_vars: Additional, user-specified variables to copy. - :param env_file: A file (like ``.env``) with additional environment variables to copy. + :attribute env_file: A file (like ``.env``) with additional environment variables to copy. :raises RuntimeError: May fail if ``torch.distributed`` not available or communication timeout between nodes :raises Exception: Propagates exceptions raised in worker processes """ # noqa: E501 From cb182edcf923206756fd16ac49d0c2273a55c742 Mon Sep 17 00:00:00 2001 From: apoorvkh Date: Sat, 19 Oct 2024 15:42:17 -0400 Subject: [PATCH 46/50] misc readthedocs --- docs/.readthedocs.yaml | 2 +- docs/{source => }/conf.py | 0 src/torchrunx/launcher.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename docs/{source => }/conf.py (100%) diff --git a/docs/.readthedocs.yaml b/docs/.readthedocs.yaml index b6e3f074..ad720d0a 100644 --- a/docs/.readthedocs.yaml +++ b/docs/.readthedocs.yaml @@ -6,7 +6,7 @@ build: python: "3.8" sphinx: - configuration: docs/source/conf.py + configuration: docs/conf.py python: install: diff --git a/docs/source/conf.py b/docs/conf.py similarity index 100% rename from docs/source/conf.py rename to docs/conf.py diff --git a/src/torchrunx/launcher.py b/src/torchrunx/launcher.py index 49167820..4203b4f3 100644 --- a/src/torchrunx/launcher.py +++ b/src/torchrunx/launcher.py @@ -333,7 +333,7 @@ def launch( :param log_handlers: A list of handlers to manage agent and worker logs. Default uses an automatic basic logging scheme. :param default_env_vars: A list of environmental variables to be copied from the launcher process to workers. Allows for bash pattern matching syntax. :param extra_env_vars: Additional, user-specified variables to copy. - :attribute env_file: A file (like ``.env``) with additional environment variables to copy. + :param env_file: A file (like ``.env``) with additional environment variables to copy. 
:raises RuntimeError: May fail if ``torch.distributed`` not available or communication timeout between nodes :raises Exception: Propagates exceptions raised in worker processes """ # noqa: E501 From c6b7e99c846fd28c8383589c1f323c633877b4f1 Mon Sep 17 00:00:00 2001 From: apoorvkh Date: Sat, 19 Oct 2024 15:45:28 -0400 Subject: [PATCH 47/50] docs fix --- docs/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index 6ea3a8b2..2ed08aad 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,7 +1,7 @@ import os import sys -sys.path.insert(0, os.path.abspath('../../src')) +sys.path.insert(0, os.path.abspath('../src')) # Configuration file for the Sphinx documentation builder. From 3f4e0251ad40929d485eda12ddd5cc9c27b0c030 Mon Sep 17 00:00:00 2001 From: apoorvkh Date: Sat, 19 Oct 2024 15:47:30 -0400 Subject: [PATCH 48/50] fix incorrect imports --- docs/source/advanced.rst | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/docs/source/advanced.rst b/docs/source/advanced.rst index b33b1bc3..c98c34f3 100644 --- a/docs/source/advanced.rst +++ b/docs/source/advanced.rst @@ -104,15 +104,9 @@ For example, the `python ... --help` command will then result in: Custom Logging -------------- -Logs are generated at the worker and agent level, and are specified to :mod:`torchrunx.launch` via the ``log_spec`` argument. By default, a :mod:`torchrunx.DefaultLogSpec` is instantiated, causing logs at the worker and agent levels to be logged to files under ``'./logs'``, and the rank 0 worker's output streams are streamed to the launcher ``stdout``. Logs are prefixed with a timestamp by default. Agent logs have the format ``{timestamp}-{agent hostname}.log`` and workers have the format ``{timestamp}-{agent hostname}[{worker local rank}].log``. +Logs are generated at the worker and agent level, and are specified to :mod:`torchrunx.launch` via the ``log_spec`` argument. By default, a is instantiated, causing logs at the worker and agent levels to be logged to files under ``'./logs'``, and the rank 0 worker's output streams are streamed to the launcher ``stdout``. Logs are prefixed with a timestamp by default. Agent logs have the format ``{timestamp}-{agent hostname}.log`` and workers have the format ``{timestamp}-{agent hostname}[{worker local rank}].log``. -Custom logging classes can be subclassed from the :mod:`torchrunx.LogSpec` class. Any subclass must have a ``get_map`` method returning a dictionary mapping logger names to lists of :mod:`logging.Handler` objects, in order to be passed to :mod:`torchrunx.launch`. The logger names are of the format ``{agent hostname}`` for agents and ``{agent hostname}[{worker local rank}]`` for workers. The :mod:`torchrunx.DefaultLogSpec` maps all the loggers to :mod:`logging.Filehandler` object pointing to the files mentioned in the previous paragraph. It additionally maps the global rank 0 worker to a :mod:`logging.StreamHandler`, which writes logs the launcher's ``stdout`` stream. - -.. autoclass:: torchrunx.LogSpec - :members: - -.. autoclass:: torchrunx.DefaultLogSpec - :members: +Custom logging classes can be subclassed from the class. Any subclass must have a ``get_map`` method returning a dictionary mapping logger names to lists of :mod:`logging.Handler` objects, in order to be passed to :mod:`torchrunx.launch`. The logger names are of the format ``{agent hostname}`` for agents and ``{agent hostname}[{worker local rank}]`` for workers. 
The maps all the loggers to :mod:`logging.Filehandler` object pointing to the files mentioned in the previous paragraph. It additionally maps the global rank 0 worker to a :mod:`logging.StreamHandler`, which writes logs the launcher's ``stdout`` stream. Propagating Exceptions ---------------------- From 15d0c0eba9d333a186b8bc75116add514463f5c2 Mon Sep 17 00:00:00 2001 From: apoorvkh Date: Sat, 19 Oct 2024 15:50:06 -0400 Subject: [PATCH 49/50] fix readthedocs --- docs/.readthedocs.yaml | 2 +- docs/{ => source}/conf.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename docs/{ => source}/conf.py (95%) diff --git a/docs/.readthedocs.yaml b/docs/.readthedocs.yaml index ad720d0a..b6e3f074 100644 --- a/docs/.readthedocs.yaml +++ b/docs/.readthedocs.yaml @@ -6,7 +6,7 @@ build: python: "3.8" sphinx: - configuration: docs/conf.py + configuration: docs/source/conf.py python: install: diff --git a/docs/conf.py b/docs/source/conf.py similarity index 95% rename from docs/conf.py rename to docs/source/conf.py index 2ed08aad..6ea3a8b2 100644 --- a/docs/conf.py +++ b/docs/source/conf.py @@ -1,7 +1,7 @@ import os import sys -sys.path.insert(0, os.path.abspath('../src')) +sys.path.insert(0, os.path.abspath('../../src')) # Configuration file for the Sphinx documentation builder. From ebf1a14079545071cbf8fd2e92ca8dd3909d8cb3 Mon Sep 17 00:00:00 2001 From: apoorvkh Date: Sat, 19 Oct 2024 16:04:43 -0400 Subject: [PATCH 50/50] more docs --- README.md | 6 ++++-- docs/source/advanced.rst | 36 ++++++++++-------------------------- 2 files changed, 14 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index 7c27c932..e3271e91 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,8 @@ if __name__ == "__main__": Whether you have 1 GPU, 8 GPUs, or 8 machines. +__Features:__ + - Our [`launch()`](https://torchrunx.readthedocs.io/stable/api.html#torchrunx.launch) utility is super _Pythonic_ - Return objects from your workers - Run `python script.py` instead of `torchrun script.py` @@ -79,13 +81,13 @@ Whether you have 1 GPU, 8 GPUs, or 8 machines. - Fine-grained control over logging, environment variables, exception handling, etc. - Automatic integration with SLURM -Robustness: +__Robustness:__ - If you want to run a complex, _modular_ workflow in __one__ script - don't parallelize your entire script: just the functions you want! - no worries about memory leaks or OS failures -Convenience: +__Convenience:__ - If you don't want to: - set up [`dist.init_process_group`](https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group) yourself diff --git a/docs/source/advanced.rst b/docs/source/advanced.rst index c98c34f3..21efff64 100644 --- a/docs/source/advanced.rst +++ b/docs/source/advanced.rst @@ -4,44 +4,28 @@ Advanced Usage Multiple functions in one script -------------------------------- -We could also launch multiple functions, on different nodes: - -.. code-block:: python - - def train_model(model, train_dataset): - trained_model = train(model, train_dataset) - - if int(os.environ["RANK"]) == 0: - torch.save(learned_model, 'model.pt') - return 'model.pt' - - return None - - def test_model(model_path, test_dataset): - model = torch.load(model_path) - accuracy = inference(model, test_dataset) - return accuracy +We could also launch multiple functions (e.g. train on many GPUs, test on one GPU): .. 
code-block:: python import torchrunx as trx - learned_model_path = trx.launch( - func=train_model, - func_kwargs={'model': my_model, 'train_dataset': mnist_train}, - hostnames=["beefy-node"], - workers_per_host=2 - ).value(0) # return from rank 0 + trained_model = trx.launch( + func=train, + hostnames=["node1", "node2"], + workers_per_host=8 + ).value(rank=0) accuracy = trx.launch( - func=test_model, - func_kwargs={'model_path': learned_model_path, 'test_dataset': mnist_test}, + func=test, + func_kwargs={'model': model}, hostnames=["localhost"], workers_per_host=1 - ).value(0) + ).value(rank=0) print(f'Accuracy: {accuracy}') +``trx.launch()`` is self-cleaning: all processes are terminated (and the used memory is completely released) after each invocation. Environment Detection