@@ -35,13 +35,13 @@ At a high level FSDP works as follows:
3535
3636*In forward path*
3737
38- * Run allgather to collect all shards from all ranks to recover the full parameter in this FSDP unit
38+ * Run all_gather to collect all shards from all ranks to recover the full parameter in this FSDP unit
3939* Run forward computation
4040* Discard parameter shards it has just collected
4141
4242*In backward path*
4343
44- * Run allgather to collect all shards from all ranks to recover the full parameter in this FSDP unit
44+ * Run all_gather to collect all shards from all ranks to recover the full parameter in this FSDP unit
4545* Run backward computation
4646* Run reduce_scatter to sync gradients
4747* Discard parameters.
@@ -155,7 +155,7 @@ We add the following code snippets to a python script “FSDP_mnist.py”.
155155 ddp_loss[0 ] += loss.item()
156156 ddp_loss[1 ] += len (data)
157157
158- dist.reduce (ddp_loss, 0 , op = dist.ReduceOp.SUM )
158+ dist.all_reduce (ddp_loss, op = dist.ReduceOp.SUM )
159159 if rank == 0 :
160160 print (' Train Epoch: {} \t Loss: {:.6f } ' .format(epoch, ddp_loss[0 ] / ddp_loss[1 ]))
161161
@@ -176,7 +176,7 @@ We add the following code snippets to a python script “FSDP_mnist.py”.
176176 ddp_loss[1 ] += pred.eq(target.view_as(pred)).sum().item()
177177 ddp_loss[2 ] += len (data)
178178
179- dist.reduce (ddp_loss, 0 , op = dist.ReduceOp.SUM )
179+ dist.all_reduce (ddp_loss, op = dist.ReduceOp.SUM )
180180
181181 if rank == 0 :
182182 test_loss = ddp_loss[0 ] / ddp_loss[2 ]
0 commit comments