|
60 | 60 | parser.add_argument('--pretrained', dest='pretrained', action='store_true', |
61 | 61 | help='use pre-trained model') |
62 | 62 |
|
63 | | -parser.add_argument('--prof', dest='prof', action='store_true', |
| 63 | +parser.add_argument('--prof', default=-1, type=int, |
64 | 64 | help='Only run 10 iterations for profiling.') |
65 | 65 | parser.add_argument('--deterministic', action='store_true') |
66 | 66 |
|
@@ -236,8 +236,7 @@ def resume(): |
236 | 236 |
|
237 | 237 | # train for one epoch |
238 | 238 | train(train_loader, model, criterion, optimizer, epoch) |
239 | | - if args.prof: |
240 | | - break |
| 239 | + |
241 | 240 | # evaluate on validation set |
242 | 241 | prec1 = validate(val_loader, model, criterion) |
243 | 242 |
|
@@ -323,33 +322,34 @@ def train(train_loader, model, criterion, optimizer, epoch): |
323 | 322 | i = 0 |
324 | 323 | while input is not None: |
325 | 324 | i += 1 |
| 325 | + if args.prof >= 0 and i == args.prof: |
| 326 | + print("Profiling begun at iteration {}".format(i)) |
| 327 | + torch.cuda.cudart().cudaProfilerStart() |
326 | 328 |
|
327 | | - adjust_learning_rate(optimizer, epoch, i, len(train_loader)) |
| 329 | + if args.prof >= 0: torch.cuda.nvtx.range_push("Body of iteration {}".format(i)) |
328 | 330 |
|
329 | | - if args.prof: |
330 | | - if i > 10: |
331 | | - break |
| 331 | + adjust_learning_rate(optimizer, epoch, i, len(train_loader)) |
332 | 332 |
|
333 | 333 | # compute output |
334 | | - if args.prof: torch.cuda.nvtx.range_push("forward") |
| 334 | + if args.prof >= 0: torch.cuda.nvtx.range_push("forward") |
335 | 335 | output = model(input) |
336 | | - if args.prof: torch.cuda.nvtx.range_pop() |
| 336 | + if args.prof >= 0: torch.cuda.nvtx.range_pop() |
337 | 337 | loss = criterion(output, target) |
338 | 338 |
|
339 | 339 | # compute gradient and do SGD step |
340 | 340 | optimizer.zero_grad() |
341 | 341 |
|
342 | | - if args.prof: torch.cuda.nvtx.range_push("backward") |
| 342 | + if args.prof >= 0: torch.cuda.nvtx.range_push("backward") |
343 | 343 | with amp.scale_loss(loss, optimizer) as scaled_loss: |
344 | 344 | scaled_loss.backward() |
345 | | - if args.prof: torch.cuda.nvtx.range_pop() |
| 345 | + if args.prof >= 0: torch.cuda.nvtx.range_pop() |
346 | 346 |
|
347 | 347 | # for param in model.parameters(): |
348 | 348 | # print(param.data.double().sum().item(), param.grad.data.double().sum().item()) |
349 | 349 |
|
350 | | - if args.prof: torch.cuda.nvtx.range_push("step") |
| 350 | + if args.prof >= 0: torch.cuda.nvtx.range_push("optimizer.step()") |
351 | 351 | optimizer.step() |
352 | | - if args.prof: torch.cuda.nvtx.range_pop() |
| 352 | + if args.prof >= 0: torch.cuda.nvtx.range_pop() |
353 | 353 |
|
354 | 354 | if i%args.print_freq == 0: |
355 | 355 | # Every print_freq iterations, check the loss, accuracy, and speed. |
@@ -388,8 +388,17 @@ def train(train_loader, model, criterion, optimizer, epoch): |
388 | 388 | args.world_size*args.batch_size/batch_time.avg, |
389 | 389 | batch_time=batch_time, |
390 | 390 | loss=losses, top1=top1, top5=top5)) |
391 | | - |
| 391 | + if args.prof >= 0: torch.cuda.nvtx.range_push("prefetcher.next()") |
392 | 392 | input, target = prefetcher.next() |
| 393 | + if args.prof >= 0: torch.cuda.nvtx.range_pop() |
| 394 | + |
| 395 | + # Pop range "Body of iteration {}".format(i) |
| 396 | + if args.prof >= 0: torch.cuda.nvtx.range_pop() |
| 397 | + |
| 398 | + if args.prof >= 0 and i == args.prof + 10: |
| 399 | + print("Profiling ended at iteration {}".format(i)) |
| 400 | + torch.cuda.cudart().cudaProfilerStop() |
| 401 | + quit() |
393 | 402 |
|
394 | 403 |
|
395 | 404 | def validate(val_loader, model, criterion): |
|
0 commit comments