Make main_amp.py more profiling-friendly

definitelynotmcarilli · definitelynotmcarilli · commit f29b3f8d3859 · 2019-06-21T15:03:38.000-07:00
diff --git a/examples/imagenet/README.md b/examples/imagenet/README.md
@@ -173,3 +173,9 @@ Running with the `--deterministic` flag should produce bitwise identical outputs
 regardless of what other options are used (see [Pytorch docs on reproducibility](https://pytorch.org/docs/stable/notes/randomness.html)).
 Since `--deterministic` disables `torch.backends.cudnn.benchmark`, `--deterministic` may
 cause a modest performance decrease.
+
+## Profiling
+
+If you're curious how the network actually looks on the CPU and GPU timelines (for example, how good is the overall utilization?
+Is the prefetcher really overlapping data transfers?), try profiling `main_amp.py`.
+[Detailed instructions can be found here](https://gist.github.com/mcarilli/213a4e698e4a0ae2234ddee56f4f3f95).
diff --git a/examples/imagenet/main_amp.py b/examples/imagenet/main_amp.py
@@ -60,7 +60,7 @@
 parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                     help='use pre-trained model')
 
-parser.add_argument('--prof', dest='prof', action='store_true',
+parser.add_argument('--prof', default=-1, type=int,
                     help='Only run 10 iterations for profiling.')
 parser.add_argument('--deterministic', action='store_true')
 
@@ -236,8 +236,7 @@ def resume():
 
         # train for one epoch
         train(train_loader, model, criterion, optimizer, epoch)
-        if args.prof:
-            break
+
         # evaluate on validation set
         prec1 = validate(val_loader, model, criterion)
 
@@ -323,33 +322,34 @@ def train(train_loader, model, criterion, optimizer, epoch):
     i = 0
     while input is not None:
         i += 1
+        if args.prof >= 0 and i == args.prof:
+            print("Profiling begun at iteration {}".format(i))
+            torch.cuda.cudart().cudaProfilerStart()
 
-        adjust_learning_rate(optimizer, epoch, i, len(train_loader))
+        if args.prof >= 0: torch.cuda.nvtx.range_push("Body of iteration {}".format(i))
 
-        if args.prof:
-            if i > 10:
-                break
+        adjust_learning_rate(optimizer, epoch, i, len(train_loader))
 
         # compute output
-        if args.prof: torch.cuda.nvtx.range_push("forward")
+        if args.prof >= 0: torch.cuda.nvtx.range_push("forward")
         output = model(input)
-        if args.prof: torch.cuda.nvtx.range_pop()
+        if args.prof >= 0: torch.cuda.nvtx.range_pop()
         loss = criterion(output, target)
 
         # compute gradient and do SGD step
         optimizer.zero_grad()
 
-        if args.prof: torch.cuda.nvtx.range_push("backward")
+        if args.prof >= 0: torch.cuda.nvtx.range_push("backward")
         with amp.scale_loss(loss, optimizer) as scaled_loss:
             scaled_loss.backward()
-        if args.prof: torch.cuda.nvtx.range_pop()
+        if args.prof >= 0: torch.cuda.nvtx.range_pop()
 
         # for param in model.parameters():
         #     print(param.data.double().sum().item(), param.grad.data.double().sum().item())
 
-        if args.prof: torch.cuda.nvtx.range_push("step")
+        if args.prof >= 0: torch.cuda.nvtx.range_push("optimizer.step()")
         optimizer.step()
-        if args.prof: torch.cuda.nvtx.range_pop()
+        if args.prof >= 0: torch.cuda.nvtx.range_pop()
 
         if i%args.print_freq == 0:
             # Every print_freq iterations, check the loss, accuracy, and speed.
@@ -388,8 +388,17 @@ def train(train_loader, model, criterion, optimizer, epoch):
                        args.world_size*args.batch_size/batch_time.avg,
                        batch_time=batch_time,
                        loss=losses, top1=top1, top5=top5))
-
+        if args.prof >= 0: torch.cuda.nvtx.range_push("prefetcher.next()")
         input, target = prefetcher.next()
+        if args.prof >= 0: torch.cuda.nvtx.range_pop()
+
+        # Pop range "Body of iteration {}".format(i)
+        if args.prof >= 0: torch.cuda.nvtx.range_pop()
+
+        if args.prof >= 0 and i == args.prof + 10:
+            print("Profiling ended at iteration {}".format(i))
+            torch.cuda.cudart().cudaProfilerStop()
+            quit()
 
 
 def validate(val_loader, model, criterion):