NVIDIA
diff --git a/‎README.md‎
Lines changed: 5 additions & 0 deletions b/‎README.md‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎apex/__init__.py‎
Lines changed: 2 additions & 0 deletions b/‎apex/__init__.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎apex/pyprof/FAQs.md‎
Lines changed: 21 additions & 0 deletions b/‎apex/pyprof/FAQs.md‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎apex/pyprof/README.md‎
Lines changed: 252 additions & 0 deletions b/‎apex/pyprof/README.md‎
Lines changed: 252 additions & 0 deletions
diff --git a/‎apex/pyprof/__init__.py‎
Lines changed: 3 additions & 0 deletions b/‎apex/pyprof/__init__.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎apex/pyprof/examples/.gitignore‎
Lines changed: 4 additions & 0 deletions b/‎apex/pyprof/examples/.gitignore‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎apex/pyprof/examples/apex/README.md‎
Lines changed: 1 addition & 0 deletions b/‎apex/pyprof/examples/apex/README.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎apex/pyprof/examples/apex/fused_adam.py‎
Lines changed: 20 additions & 0 deletions b/‎apex/pyprof/examples/apex/fused_adam.py‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎apex/pyprof/examples/apex/fused_layer_norm.py‎
Lines changed: 28 additions & 0 deletions b/‎apex/pyprof/examples/apex/fused_layer_norm.py‎
Lines changed: 28 additions & 0 deletions
diff --git a/‎apex/pyprof/examples/apex/test.sh‎
Lines changed: 30 additions & 0 deletions b/‎apex/pyprof/examples/apex/test.sh‎
Lines changed: 30 additions & 0 deletions
@@ -94,6 +94,11 @@ A Python-only build omits:
 - Fused kernels that improve the performance of `apex.parallel.DistributedDataParallel` and `apex.amp`.
 `DistributedDataParallel`, `amp`, and `SyncBatchNorm` will still be usable, but they may be slower.
 
+To enable PyProf support, you need to install the packages required by PyProf. To do so, add the "--pyprof" option at installation time:
+```
+$ pip install -v --no-cache-dir --global-option="--pyprof" --global-option="--cpp_ext" --global-option="--cuda_ext" ./
+```
+
 ### Windows support
 Windows support is experimental, and Linux is recommended.  `pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .` may work if you were able to build Pytorch from source
 on your system.  `pip install -v --no-cache-dir .` (without CUDA/C++ extensions) is more likely to work.  If you installed Pytorch in a Conda environment, make sure to install Apex in that same environment.
@@ -1,5 +1,6 @@
 # May help avoid undefined symbol errors https://pytorch.org/cppdocs/notes/faq.html#undefined-symbol-errors-from-pytorch-aten
 import torch
+import warnings
 
 from . import parallel
 from . import amp
@@ -14,3 +15,4 @@
 # load time) the error message is timely and visible.
 from . import optimizers
 from . import normalization
+from . import pyprof
@@ -0,0 +1,21 @@
+1. How do I intercept the Adam optimizer in APEX?
+
+	```python
+	from apex import pyprof
+	import fused_adam_cuda
+	pyprof.nvtx.wrap(fused_adam_cuda, 'adam')
+	```
+
+2. If you are using JIT and/or AMP, the correct initialization sequence is
+	1. Let any JIT compilation finish.
+	2. Initialize pyprof with `pyprof.nvtx.init()`.
+	3. Initialize AMP.
+
+3. How do I profile with `torch.distributed.launch`?
+
+	```sh
+	nvprof -f -o net%p.sql \
+		--profile-from-start off \
+		--profile-child-processes \
+		python -m torch.distributed.launch net.py
+	```
@@ -0,0 +1,3 @@
+import warnings
+
+from . import nvtx
@@ -0,0 +1,4 @@
+__pycache__
+*.sql
+*.dict
+*.csv
@@ -0,0 +1 @@
+This directory has examples of how to use `pyprof` with APEX extensions e.g. `fused_adam_cuda` and `fused_layer_norm_cuda`.
@@ -0,0 +1,20 @@
+import torch
+import fused_adam_cuda  # imported directly so the extension module itself can be handed to pyprof.nvtx.wrap below
+from apex.optimizers import FusedAdam, FP16_Optimizer
+from apex import pyprof
+
+pyprof.nvtx.init()  # initialize pyprof before the model and optimizer are created
+pyprof.nvtx.wrap(fused_adam_cuda, 'adam')  # intercept the extension's 'adam' function by name
+
+model = torch.nn.Linear(10, 20).cuda().half()  # half-precision linear layer (10 -> 20) on the GPU
+criterion = torch.nn.CrossEntropyLoss().cuda()
+optimizer = FusedAdam(model.parameters())
+optimizer = FP16_Optimizer(optimizer)  # rebinds `optimizer` to the FP16 wrapper around the FusedAdam instance
+
+x = torch.ones(32, 10).cuda().half()  # batch of 32 inputs with 10 features, matching the model's dtype
+target = torch.empty(32, dtype=torch.long).random_(20).cuda()  # 32 random class indices in [0, 20)
+y = model(x)
+loss = criterion(y, target)
+optimizer.zero_grad()
+loss.backward()
+optimizer.step()  # single optimizer step; presumably this is what reaches the wrapped fused_adam_cuda.adam - confirm
@@ -0,0 +1,28 @@
+import torch
+import fused_layer_norm_cuda  # imported directly so the extension module itself can be handed to pyprof.nvtx.wrap below
+from apex.normalization import FusedLayerNorm
+from apex import pyprof
+
+pyprof.nvtx.init()  # initialize pyprof before any FusedLayerNorm module is built
+pyprof.nvtx.wrap(fused_layer_norm_cuda, 'forward')  # each wrap call names one function of the extension module
+pyprof.nvtx.wrap(fused_layer_norm_cuda, 'backward')  # NOTE(review): this script never runs a backward pass, so the backward wraps are not exercised here
+pyprof.nvtx.wrap(fused_layer_norm_cuda, 'forward_affine')
+pyprof.nvtx.wrap(fused_layer_norm_cuda, 'backward_affine')
+
+input = torch.randn(20, 5, 10, 10).cuda()  # NOTE(review): shadows the builtin `input`; harmless in this short script
+
+# With Learnable Parameters
+m = FusedLayerNorm(input.size()[1:]).cuda()  # normalized shape is every dimension but the first: (5, 10, 10)
+output = m(input)
+
+# Without Learnable Parameters
+m = FusedLayerNorm(input.size()[1:], elementwise_affine=False).cuda()
+output = m(input)
+
+# Normalize over last two dimensions
+m = FusedLayerNorm([10, 10]).cuda()
+output = m(input)
+
+# Normalize over last dimension of size 10
+m = FusedLayerNorm(10).cuda()
+output = m(input)  # `output` is overwritten by each case; only the calls themselves matter to this example
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+set -e  # abort the whole run on the first failing command
+
+SCRIPT=`realpath $0`  # absolute path of this script; NOTE(review): unquoted, so a path containing whitespace would break
+SCRIPTPATH=`dirname $SCRIPT`
+PYPROF="$SCRIPTPATH/../.."  # two directory levels above this script
+
+parse="python $PYPROF/parse/parse.py"  # kept as a single string; expanded unquoted below so it splits into command + argument
+prof="python $PYPROF/prof/prof.py"
+
+for f in *.py  # globs the current working directory, not $SCRIPTPATH - run this from the examples directory
+do
+	base=`basename $f .py`  # file name without the .py suffix
+	sql=$base.sql
+	dict=$base.dict
+
+	#NVprof
+	echo "nvprof -fo $sql python $f"
+	nvprof -fo $sql python $f  # run the example under nvprof, writing its output to $sql
+
+	#Parse
+	echo $parse $sql
+	$parse $sql > $dict  # parse.py's stdout is captured as the .dict file
+
+	#Prof
+	echo $prof $dict
+	$prof -w 130 $dict
+	\rm $sql $dict  # leading backslash bypasses any alias for rm; removes this example's intermediate files
+done
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+import warnings`
	`2`	`+`
	`3`	`+from . import nvtx`
-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +__pycache__
 +*.sql
 +*.dict
 +*.csv
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+This directory has examples of how to use `pyprof` with APEX extensions e.g. `fused_adam_cuda` and `fused_layer_norm_cuda`.