-import os
-from functools import wraps
-
 import random
+
 import torch
 import torch.distributed as dist
 import torch.distributed.autograd as dist_autograd
 import torch.distributed.rpc as rpc
-from torch.distributed.rpc import TensorPipeRpcBackendOptions
 import torch.multiprocessing as mp
 import torch.optim as optim
+from torch.distributed.nn import RemoteModule
 from torch.distributed.optim import DistributedOptimizer
 from torch.distributed.rpc import RRef
+from torch.distributed.rpc import TensorPipeRpcBackendOptions
 from torch.nn.parallel import DistributedDataParallel as DDP
 
 NUM_EMBEDDINGS = 100
 EMBEDDING_DIM = 16
 
 # BEGIN hybrid_model
 class HybridModel(torch.nn.Module):
     r"""
-    The model consists of a sparse part and a dense part. The dense part is an
-    nn.Linear module that is replicated across all trainers using
-    DistributedDataParallel. The sparse part is an nn.EmbeddingBag that is
-    stored on the parameter server.
-
-    The model holds a Remote Reference to the embedding table on the parameter
-    server.
+    The model consists of a sparse part and a dense part.
+    1) The dense part is an nn.Linear module that is replicated across all trainers using DistributedDataParallel.
+    2) The sparse part is a RemoteModule that holds an nn.EmbeddingBag on the parameter server.
+    This remote module holds a Remote Reference to the embedding table on the parameter server.
     """
 
-    def __init__(self, emb_rref, device):
+    def __init__(self, remote_emb_module, device):
         super(HybridModel, self).__init__()
-        self.emb_rref = emb_rref
+        self.remote_emb_module = remote_emb_module
         self.fc = DDP(torch.nn.Linear(16, 8).cuda(device), device_ids=[device])
         self.device = device
 
     def forward(self, indices, offsets):
-        emb_lookup = self.emb_rref.rpc_sync().forward(indices, offsets)
+        emb_lookup = self.remote_emb_module.forward(indices, offsets)
         return self.fc(emb_lookup.cuda(self.device))
 # END hybrid_model
 
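For context on the change above: a RemoteModule's generated forward issues an RPC to the worker that owns the module (the "ps" worker here) and returns the result, so the explicit emb_rref.rpc_sync().forward(...) call is no longer needed. Below is a minimal smoke-test sketch of the model, assuming RPC, the trainer's process group, and remote_emb_module have already been set up as in run_worker further down; the device and tensor shapes are illustrative only.

# Hypothetical usage sketch, not part of the change above.
model = HybridModel(remote_emb_module, device=0)
indices = torch.randint(0, NUM_EMBEDDINGS, (32,))  # 32 lookups in total
offsets = torch.arange(0, 32, 4)                   # 8 bags of 4 indices each
out = model(indices, offsets)                      # EmbeddingBag on "ps", then nn.Linear on cuda:0
print(out.shape)                                   # torch.Size([8, 8])
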
 # BEGIN setup_trainer
-def _retrieve_embedding_parameters(emb_rref):
-    param_rrefs = []
-    for param in emb_rref.local_value().parameters():
-        param_rrefs.append(RRef(param))
-    return param_rrefs
-
-
-def _run_trainer(emb_rref, rank):
+def _run_trainer(remote_emb_module, rank):
     r"""
     Each trainer runs a forward pass which involves an embedding lookup on the
     parameter server and running nn.Linear locally. During the backward pass,
@@ -57,16 +46,18 @@ def _run_trainer(emb_rref, rank):
     """
 
     # Setup the model.
-    model = HybridModel(emb_rref, rank)
+    model = HybridModel(remote_emb_module, rank)
 
     # Retrieve all model parameters as rrefs for DistributedOptimizer.
 
     # Retrieve parameters for embedding table.
-    model_parameter_rrefs = rpc.rpc_sync(
-        "ps", _retrieve_embedding_parameters, args=(emb_rref,))
+    model_parameter_rrefs = model.remote_emb_module.remote_parameters()
 
-    # model.parameters() only includes local parameters.
-    for param in model.parameters():
+    # model.fc.parameters() only includes local parameters.
+    # NOTE: Cannot call model.parameters() here,
+    # because it would recurse into remote_emb_module, and RemoteModule
+    # supports remote_parameters() but not parameters().
+    for param in model.fc.parameters():
         model_parameter_rrefs.append(RRef(param))
 
     # Setup distributed optimizer
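The RRef list built above (remote embedding parameters plus the local fc parameters) is what the distributed optimizer consumes in the lines the diff skips over here. A rough sketch of that construction, with the learning rate as an assumed placeholder:

    # Sketch only: DistributedOptimizer creates an optim.SGD instance on every
    # worker that owns one of the passed parameter RRefs (the ps for the
    # embedding table, this trainer for the fc layer) and steps them over RPC.
    opt = DistributedOptimizer(
        optim.SGD,
        model_parameter_rrefs,
        lr=0.05,  # assumed value; the real lr lives in the elided code
    )
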
@@ -115,43 +106,43 @@ def get_next_batch(rank):
                 # Not necessary to zero grads as each iteration creates a different
                 # distributed autograd context which hosts different grads
         print("Training done for epoch {}".format(epoch))
-# END run_trainer
-
+# END run_trainer
 
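The per-iteration distributed autograd context mentioned in the comment above is why no explicit zero_grad() is needed: gradients live in the context, not in param.grad. The elided body of the training loop follows the standard pattern, roughly as below; names such as criterion, target, indices and offsets come from the elided code and are assumptions here.

# Rough shape of one training step (sketch, not the verbatim elided code):
with dist_autograd.context() as context_id:
    output = model(indices, offsets)
    loss = criterion(output, target)
    # Backward pass across the RPC boundary; grads accumulate in this context.
    dist_autograd.backward(context_id, [loss])
    # Apply the grads on whichever workers own the parameters.
    opt.step(context_id)
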
 # BEGIN run_worker
 def run_worker(rank, world_size):
     r"""
     A wrapper function that initializes RPC, calls the function, and shuts down
     RPC.
     """
-    os.environ['MASTER_ADDR'] = 'localhost'
-    os.environ['MASTER_PORT'] = '29500'
-
 
+    # We need to use different port numbers in TCP init_method for init_rpc and
+    # init_process_group to avoid port conflicts.
     rpc_backend_options = TensorPipeRpcBackendOptions()
-    rpc_backend_options.init_method = 'tcp://localhost:29501'
+    rpc_backend_options.init_method = "tcp://localhost:29501"
 
     # Rank 2 is master, 3 is ps and 0 and 1 are trainers.
     if rank == 2:
         rpc.init_rpc(
-                "master",
-                rank=rank,
-                world_size=world_size,
-                rpc_backend_options=rpc_backend_options)
-
-        # Build the embedding table on the ps.
-        emb_rref = rpc.remote(
-                "ps",
-                torch.nn.EmbeddingBag,
-                args=(NUM_EMBEDDINGS, EMBEDDING_DIM),
-                kwargs={"mode": "sum"})
+            "master",
+            rank=rank,
+            world_size=world_size,
+            rpc_backend_options=rpc_backend_options,
+        )
+
+        remote_emb_module = RemoteModule(
+            "ps",
+            torch.nn.EmbeddingBag,
+            args=(NUM_EMBEDDINGS, EMBEDDING_DIM),
+            kwargs={"mode": "sum"},
+        )
 
         # Run the training loop on trainers.
         futs = []
         for trainer_rank in [0, 1]:
             trainer_name = "trainer{}".format(trainer_rank)
             fut = rpc.rpc_async(
-                trainer_name, _run_trainer, args=(emb_rref, rank))
+                trainer_name, _run_trainer, args=(remote_emb_module, trainer_rank)
+            )
             futs.append(fut)
 
         # Wait for all training to finish.
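The context the diff skips after this line presumably just blocks on the futures collected above, along the lines of:

        # Sketch of the elided wait: block until both trainers finish training.
        for fut in futs:
            fut.wait()
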
@@ -160,32 +151,35 @@ def run_worker(rank, world_size):
     elif rank <= 1:
         # Initialize process group for Distributed DataParallel on trainers.
         dist.init_process_group(
-                backend="gloo", rank=rank, world_size=2)
+            backend="gloo", rank=rank, world_size=2, init_method="tcp://localhost:29500"
+        )
 
         # Initialize RPC.
         trainer_name = "trainer{}".format(rank)
         rpc.init_rpc(
-                trainer_name,
-                rank=rank,
-                world_size=world_size,
-                rpc_backend_options=rpc_backend_options)
+            trainer_name,
+            rank=rank,
+            world_size=world_size,
+            rpc_backend_options=rpc_backend_options,
+        )
 
         # Trainer just waits for RPCs from master.
     else:
         rpc.init_rpc(
-                "ps",
-                rank=rank,
-                world_size=world_size,
-                rpc_backend_options=rpc_backend_options)
+            "ps",
+            rank=rank,
+            world_size=world_size,
+            rpc_backend_options=rpc_backend_options,
+        )
         # The parameter server does nothing here; it just serves RPC requests.
         pass
 
     # block until all rpcs finish
     rpc.shutdown()
 
 
-if __name__ == "__main__":
+if __name__ == "__main__":
     # 2 trainers, 1 parameter server, 1 master.
     world_size = 4
-    mp.spawn(run_worker, args=(world_size, ), nprocs=world_size, join=True)
+    mp.spawn(run_worker, args=(world_size,), nprocs=world_size, join=True)
 # END run_worker
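A note on running the example: mp.spawn launches all four roles (two trainers, one parameter server, one master) from a single entry point, the trainers place their dense nn.Linear replica on cuda:0 and cuda:1 respectively, and the gloo process group used by DDP (port 29500) is kept separate from the TensorPipe RPC endpoint (port 29501), per the comment in run_worker. Assuming the file is saved as main.py, running "python main.py" on a single host with at least two GPUs should be sufficient.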