import torch

# The autograd package provides automatic differentiation
# for all operations on Tensors.

# requires_grad=True -> track all operations on the tensor.
x = torch.randn(3, requires_grad=True)
y = x + 2

# y was created as the result of an operation, so it has a grad_fn attribute.
# grad_fn references the Function that created the Tensor.
print(x)  # created by the user -> grad_fn is None
print(y)
print(y.grad_fn)
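# Added note: since y was produced by an addition, its grad_fn is typically an
# AddBackward0 object (the exact repr may vary by PyTorch version), while
# x.grad_fn stays None.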

# Do more operations on y
z = y * y * 3
print(z)
z = z.mean()
print(z)

# Let's compute the gradients with backpropagation.
# When we finish our computation we can call .backward() and have all the
# gradients computed automatically.
# The gradient for this tensor will be accumulated into its .grad attribute.
# It is the partial derivative of the function w.r.t. the tensor.

z.backward()
print(x.grad)  # dz/dx
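
# Added sanity check (not in the original): with y = x + 2 and z = mean(3 * y^2)
# over 3 elements, the analytical gradient is dz/dx_i = 2 * (x_i + 2),
# so x.grad should match it:
print(torch.allclose(x.grad, 2 * (x + 2)))  # expected: True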

# Generally speaking, torch.autograd is an engine for computing vector-Jacobian
# products. It computes partial derivatives while applying the chain rule.

# -------------
# Model with non-scalar output:
# If a Tensor is non-scalar (it has more than one element), backward() needs an
# extra argument: a gradient tensor of matching shape (the vector in the
# vector-Jacobian product).

x = torch.randn(3, requires_grad=True)

y = x * 2
for _ in range(10):
    y = y * 2

print(y)
print(y.shape)

v = torch.tensor([0.1, 1.0, 0.0001], dtype=torch.float32)
y.backward(v)
print(x.grad)
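
# Added sanity check (not in the original): here y = 2^11 * x = 2048 * x, so the
# Jacobian is diagonal with entries 2048 and the vector-Jacobian product is
# simply 2048 * v:
print(torch.allclose(x.grad, 2048 * v))  # expected: True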

# -------------
# Stop a tensor from tracking history:
# For example, during the training loop, when we update the weights, this
# update operation should not be part of the gradient computation. Options:
# - x.requires_grad_(False)
# - x.detach()
# - wrap the code in 'with torch.no_grad():'

# .requires_grad_(...) changes an existing Tensor's requires_grad flag in-place.
a = torch.randn(2, 2)
print(a.requires_grad)
b = (a * 3) / (a - 1)
print(b.grad_fn)  # None, because a does not require grad
a.requires_grad_(True)
print(a.requires_grad)
b = (a * a).sum()
print(b.grad_fn)

# .detach(): get a new Tensor with the same content but no gradient tracking.
a = torch.randn(2, 2, requires_grad=True)
print(a.requires_grad)
b = a.detach()
print(b.requires_grad)

# wrap in 'with torch.no_grad():'
a = torch.randn(2, 2, requires_grad=True)
print(a.requires_grad)
with torch.no_grad():
    print((a ** 2).requires_grad)  # False inside the no_grad block
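
# Added note (not in the original): tracking is only disabled inside the block;
# outside it, operations on a are tracked again:
print((a ** 2).requires_grad)  # expected: True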

# -------------
# backward() accumulates the gradient for this tensor into its .grad attribute.
# !!! We need to be careful during optimization !!!
# Use .zero_() to empty the gradients before a new optimization step!
weights = torch.ones(4, requires_grad=True)

for epoch in range(3):
    # just a dummy example
    model_output = (weights * 3).sum()
    model_output.backward()

    print(weights.grad)

    # optimize the model, i.e. adjust the weights...
    with torch.no_grad():
        weights -= 0.1 * weights.grad

    # This is important! Without it the gradients would accumulate
    # (3, 6, 9, ...) and distort the updates and the final weights.
    weights.grad.zero_()

print(weights)
print(model_output)

# An optimizer offers the same functionality via its zero_grad() method:
# optimizer = torch.optim.SGD([weights], lr=0.1)
# During training:
# optimizer.step()
# optimizer.zero_grad()
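
# Minimal sketch (added, not in the original) of the same dummy loop rewritten
# with an optimizer; SGD with lr=0.1 mirrors the manual update above:
weights = torch.ones(4, requires_grad=True)
optimizer = torch.optim.SGD([weights], lr=0.1)

for epoch in range(3):
    model_output = (weights * 3).sum()
    model_output.backward()

    optimizer.step()       # applies the update: weights -= lr * weights.grad
    optimizer.zero_grad()  # clears the accumulated gradients before the next backward()

print(weights)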