
Commit 40d27fa

Add explanation of optim.zero_grad
1 parent 628c651 commit 40d27fa

2 files changed: 12 additions & 2 deletions

File tree

beginner_source/blitz/neural_networks_tutorial.py

Lines changed: 8 additions & 0 deletions
@@ -253,3 +253,11 @@ def num_flat_features(self, x):
 loss = criterion(output, target)
 loss.backward()
 optimizer.step()    # Does the update
+
+
+###############################################################
+# .. Note::
+#
+#   Observe how gradient buffers had to be manually set to zero using
+#   ``optimizer.zero_grad()``. This is because gradients are accumulated,
+#   as explained in the `Backprop`_ section.
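
A minimal standalone sketch (not part of this commit) of the accumulation behaviour the note describes: a second call to ``.backward()`` adds into the existing ``.grad`` buffers, and ``optimizer.zero_grad()`` resets them. The model and tensor shapes are arbitrary, chosen only for illustration, and a reasonably current PyTorch is assumed.

    import torch
    import torch.nn as nn
    import torch.optim as optim

    # Tiny throwaway model; the shapes are illustrative only.
    model = nn.Linear(4, 2)
    optimizer = optim.SGD(model.parameters(), lr=0.01)
    criterion = nn.MSELoss()

    inp = torch.randn(1, 4)
    target = torch.randn(1, 2)

    # First backward pass fills the .grad buffers.
    loss = criterion(model(inp), target)
    loss.backward()
    first = model.weight.grad.clone()

    # A second backward pass on the same data *adds* to those buffers.
    loss = criterion(model(inp), target)
    loss.backward()
    print(torch.allclose(model.weight.grad, 2 * first))  # True: accumulated, not overwritten

    # zero_grad() clears the buffers before the next iteration.
    optimizer.zero_grad()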

beginner_source/examples_nn/two_layer_net_optim.py

Lines changed: 4 additions & 2 deletions
@@ -47,8 +47,10 @@
     print(t, loss.data[0])

     # Before the backward pass, use the optimizer object to zero all of the
-    # gradients for the variables it will update (which are the learnable weights
-    # of the model)
+    # gradients for the variables it will update (which are the learnable
+    # weights of the model). This is because, by default, gradients are
+    # accumulated in buffers (i.e., not overwritten) whenever .backward()
+    # is called. Check out the docs of torch.autograd.backward for more details.
     optimizer.zero_grad()

     # Backward pass: compute gradient of the loss with respect to model
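
A tensor-level sketch of the same point for ``torch.autograd`` (again not part of the commit; assumes a current PyTorch): each ``.backward()`` call sums its result into ``x.grad``, so the buffer must be zeroed by hand when accumulation is not wanted.

    import torch

    x = torch.ones(3, requires_grad=True)

    y = (x * x).sum()   # dy/dx = 2x
    y.backward()
    print(x.grad)       # tensor([2., 2., 2.])

    # Backward on a fresh graph accumulates into the same buffer.
    y = (x * x).sum()
    y.backward()
    print(x.grad)       # tensor([4., 4., 4.])  -- summed, not overwritten

    # Reset manually, which is what optimizer.zero_grad() does for its parameters.
    x.grad.zero_()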
