|
1 | 1 | """ |
2 | 2 | PyTorch Profiler |
3 | 3 | ==================================== |
| 4 | +**Author:** `Shivam Raikundalia <https://github.com/sraikund16>`_ |
| 5 | +""" |
| 6 | + |
| 7 | +###################################################################### |
| 8 | +""" |
4 | 9 | This recipe explains how to use PyTorch profiler and measure the time and |
5 | 10 | memory consumption of the model's operators. |
6 | 11 |
|
|
12 | 17 | In this recipe, we will use a simple Resnet model to demonstrate how to |
13 | 18 | use profiler to analyze model performance. |
14 | 19 |
|
| 20 | +Prerequisites |
| 21 | +--------------- |
| 22 | +- ``torch >= 1.9`` |
| 23 | +
|
15 | 24 | Setup |
16 | 25 | ----- |
17 | 26 | To install ``torch`` and ``torchvision``, use the following command:
|
20 | 29 |
|
21 | 30 | pip install torch torchvision |
22 | 31 |
|
23 | | -
|
24 | 32 | """ |
25 | 33 |
|
26 | | - |
27 | 34 | ###################################################################### |
28 | 35 | # Steps |
29 | 36 | # ----- |
|
45 | 52 |
|
46 | 53 | import torch |
47 | 54 | import torchvision.models as models |
48 | | -from torch.profiler import profile, record_function, ProfilerActivity |
| 55 | +from torch.profiler import profile, ProfilerActivity, record_function |
49 | 56 |
|
50 | 57 |
|
51 | 58 | ###################################################################### |
|
# To get a finer granularity of results and include operator input shapes, pass ``group_by_input_shape=True``
# (note: this requires running the profiler with ``record_shapes=True``):

# Grouping by input shape reports a separate row per (operator, input shape)
# combination instead of one aggregated row per operator.
print(
    prof.key_averages(group_by_input_shape=True).table(
        sort_by="cpu_time_total", row_limit=10
    )
)
139 | 150 |
|
140 | 151 | ######################################################################################## |
141 | 152 | # The output might look like this (omitting some columns): |
|
# Users can switch between CPU, CUDA, and XPU. Prefer CUDA, then XPU; the CPU
# activity is always collected alongside the accelerator's.
activities = [ProfilerActivity.CPU]
if torch.cuda.is_available():
    device = "cuda"
    activities += [ProfilerActivity.CUDA]
elif torch.xpu.is_available():
    device = "xpu"
    activities += [ProfilerActivity.XPU]
else:
    print(
        "Neither CUDA nor XPU devices are available to demonstrate profiling on acceleration devices"
    )
    import sys

    # Exit cleanly: the rest of this section requires an accelerator device.
    sys.exit(0)
179 | 193 |
|
180 | 194 | sort_by_keyword = device + "_time_total" |
|
model = models.resnet18()
# Batch of 5 RGB images at 224x224 — the input size ResNet-18 expects.
inputs = torch.randn(5, 3, 224, 224)

# ``profile_memory=True`` tracks tensor memory allocated/released during the
# run; ``record_shapes=True`` additionally records operator input shapes.
with profile(
    activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True
) as prof:
    model(inputs)

# Rank operators by the memory usage attributed to the operator itself
# (the ``self_``-prefixed column), not to its callees.
print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=10))
|
# Users can switch between CPU, CUDA, and XPU. Prefer CUDA, then XPU; the CPU
# activity is always collected alongside the accelerator's.
activities = [ProfilerActivity.CPU]
if torch.cuda.is_available():
    device = "cuda"
    activities += [ProfilerActivity.CUDA]
elif torch.xpu.is_available():
    device = "xpu"
    activities += [ProfilerActivity.XPU]
else:
    print(
        "Neither CUDA nor XPU devices are available to demonstrate profiling on acceleration devices"
    )
    import sys

    # Exit cleanly: the rest of this section requires an accelerator device.
    sys.exit(0)
324 | 342 |
|
325 | 343 | model = models.resnet18().to(device) |
|
# ``with_stack=True`` records Python source (file:line) information for each op.
with profile(
    activities=activities,
    with_stack=True,
    # NOTE(review): verbose experimental config appears to be required for the
    # stack/source info to show up in the output — TODO confirm; it relies on a
    # private ``torch._C._profiler`` API that may change between releases.
    experimental_config=torch._C._profiler._ExperimentalConfig(verbose=True),
) as prof:
    model(inputs)
352 | 371 |
|
|
401 | 420 |
|
from torch.profiler import schedule

# Per the keyword names: ignore the first 10 steps entirely, then cycle twice
# through: 5 idle (wait) steps, 1 warmup step, and 3 actively-recorded steps.
my_schedule = schedule(skip_first=10, wait=5, warmup=1, active=3, repeat=2)
410 | 424 |
|
411 | 425 | ###################################################################### |
412 | 426 | # Profiler assumes that the long-running job is composed of steps, numbered |
|
444 | 458 |
|
445 | 459 | sort_by_keyword = "self_" + device + "_time_total" |
446 | 460 |
|
| 461 | + |
def trace_handler(p):
    """Print a summary table for the finished cycle and export a Chrome trace.

    Intended to be passed as ``on_trace_ready``; ``p`` is the active profiler,
    whose ``step_num`` distinguishes the exported trace files.
    """
    summary = p.key_averages().table(sort_by=sort_by_keyword, row_limit=10)
    print(summary)
    p.export_chrome_trace("/tmp/trace_" + str(p.step_num) + ".json")
451 | 466 |
|
| 467 | + |
with profile(
    activities=activities,
    # 1 idle step, 1 warmup step, then 2 recorded steps per cycle.
    schedule=torch.profiler.schedule(wait=1, warmup=1, active=2),
    # Invoked with the profiler each time a recording cycle completes.
    on_trace_ready=trace_handler,
) as p:
    for idx in range(8):
        model(inputs)
|
0 commit comments