Commit 270bf22

Merge branch 'master' into 1.9-RC-TEST
2 parents 43e1026 + 3945dd8

6 files changed

Lines changed: 68 additions & 5 deletions


beginner_source/basics/buildmodel_tutorial.py

Lines changed: 1 addition & 1 deletion
@@ -97,7 +97,7 @@ def forward(self, x):
 # Model Layers
 # -------------------------
 #
-# Lets break down the layers in the FashionMNIST model. To illustrate it, we
+# Let's break down the layers in the FashionMNIST model. To illustrate it, we
 # will take a sample minibatch of 3 images of size 28x28 and see what happens to it as
 # we pass it through the network.
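For context, a minimal sketch of the walkthrough this passage introduces, using only the shapes stated in the tutorial text (a sample minibatch of 3 images of size 28x28):

import torch
from torch import nn

# A sample minibatch of 3 images of size 28x28, as in the edited passage.
input_image = torch.rand(3, 28, 28)

# nn.Flatten turns each 28x28 image into a contiguous vector of 784 values
# before it is passed on to the model's linear layers.
flatten = nn.Flatten()
flat_image = flatten(input_image)
print(flat_image.size())  # torch.Size([3, 784])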

beginner_source/basics/data_tutorial.py

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@
 # -------------------
 #
 # Here is an example of how to load the `Fashion-MNIST <https://research.zalando.com/project/fashion_mnist/fashion_mnist/>`_ dataset from TorchVision.
-# Fashion-MNIST is a dataset of Zalando’s article images consisting of of 60,000 training examples and 10,000 test examples.
+# Fashion-MNIST is a dataset of Zalando’s article images consisting of 60,000 training examples and 10,000 test examples.
 # Each example comprises a 28×28 grayscale image and an associated label from one of 10 classes.
 #
 # We load the `FashionMNIST Dataset <https://pytorch.org/vision/stable/datasets.html#fashion-mnist>`_ with the following parameters:
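The loading call the passage refers to looks like the following sketch; the root directory "data" is an arbitrary choice here, and train=False would select the 10,000-example test split instead:

import torchvision
from torchvision.transforms import ToTensor

# Download the 60,000-example training split described above.
training_data = torchvision.datasets.FashionMNIST(
    root="data",           # directory where the data is stored
    train=True,            # select the training split
    download=True,         # fetch the data if it is not present at root
    transform=ToTensor(),  # convert PIL images to tensors scaled to [0, 1]
)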

beginner_source/dcgan_faces_tutorial.py

Lines changed: 1 addition & 1 deletion
@@ -52,7 +52,7 @@
 # with the discriminator. Let :math:`x` be data representing an image.
 # :math:`D(x)` is the discriminator network which outputs the (scalar)
 # probability that :math:`x` came from training data rather than the
-# generator. Here, since we are dealing with images the input to
+# generator. Here, since we are dealing with images, the input to
 # :math:`D(x)` is an image of CHW size 3x64x64. Intuitively, :math:`D(x)`
 # should be HIGH when :math:`x` comes from training data and LOW when
 # :math:`x` comes from the generator. :math:`D(x)` can also be thought of
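To make the shapes in this passage concrete, here is a hypothetical stand-in for D, not the tutorial's actual convolutional discriminator, that maps a 3x64x64 image to one scalar probability:

import torch
from torch import nn

# Hypothetical stand-in for D(x): flattens a CHW 3x64x64 image and maps it
# to a single probability. The tutorial uses a stack of conv layers instead.
D = nn.Sequential(nn.Flatten(), nn.Linear(3 * 64 * 64, 1), nn.Sigmoid())

x = torch.rand(1, 3, 64, 64)  # a batch containing one CHW image
print(D(x).shape)             # torch.Size([1, 1]): one probability per image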

beginner_source/text_sentiment_ngrams_tutorial.py

Lines changed: 1 addition & 1 deletion
@@ -101,7 +101,7 @@
 #
 # Before sending to the model, ``collate_fn`` function works on a batch of samples generated from ``DataLoader``. The input to ``collate_fn`` is a batch of data with the batch size in ``DataLoader``, and ``collate_fn`` processes them according to the data processing pipelines declared previously. Pay attention here and make sure that ``collate_fn`` is declared as a top level def. This ensures that the function is available in each worker.
 #
-# In this example, the text entries in the original data batch input are packed into a list and concatenated as a single tensor for the input of ``nn.EmbeddingBag``. The offset is a tensor of delimiters to represent the beginning index of the individual sequence in the text tensor. Label is a tensor saving the labels of indidividual text entries.
+# In this example, the text entries in the original data batch input are packed into a list and concatenated as a single tensor for the input of ``nn.EmbeddingBag``. The offset is a tensor of delimiters to represent the beginning index of the individual sequence in the text tensor. Label is a tensor saving the labels of individual text entries.
 
 
 from torch.utils.data import DataLoader
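A sketch of the ``collate_fn`` behavior described above, assuming each sample is already a (label, token_ids) pair; the tutorial builds the token ids with a text pipeline, omitted here:

import torch

def collate_batch(batch):
    # A top-level def, so DataLoader workers can pickle it.
    label_list, text_list, offsets = [], [], [0]
    for label, token_ids in batch:
        label_list.append(label)
        processed = torch.tensor(token_ids, dtype=torch.int64)
        text_list.append(processed)
        offsets.append(processed.size(0))
    labels = torch.tensor(label_list, dtype=torch.int64)
    # Cumulative lengths give the beginning index of each sequence.
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    # One flat tensor of token ids, as nn.EmbeddingBag expects.
    text = torch.cat(text_list)
    return labels, text, offsets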

intermediate_source/seq2seq_translation_tutorial.py

Lines changed: 1 addition & 1 deletion
@@ -139,7 +139,7 @@
 # the networks later. To keep track of all this we will use a helper class
 # called ``Lang`` which has word → index (``word2index``) and index → word
 # (``index2word``) dictionaries, as well as a count of each word
-# ``word2count`` to use to later replace rare words.
+# ``word2count`` which will be used to replace rare words later.
 #
 
 SOS_token = 0
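A sketch of the ``Lang`` helper with the three attributes named in the passage, simplified from the tutorial's version:

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}   # word -> index
        self.word2count = {}   # word -> count, to replace rare words later
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2       # count SOS and EOS tokens

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1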

recipes_source/recipes/tuning_guide.py

Lines changed: 63 additions & 0 deletions
@@ -178,6 +178,69 @@ def fused_gelu(x):
 # `torch.autograd.gradgradcheck <https://pytorch.org/docs/stable/autograd.html#torch.autograd.gradgradcheck>`_
 #
 
+###############################################################################
+# CPU specific optimizations
+# --------------------------
+
+###############################################################################
+# Utilize Non-Uniform Memory Access (NUMA) Controls
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# NUMA, or non-uniform memory access, is a memory layout design used in data center machines to take advantage of memory locality in multi-socket machines with multiple memory controllers and blocks. Generally speaking, all deep learning workloads, training or inference, get better performance without accessing hardware resources across NUMA nodes. Thus, inference can be run with multiple instances, each running on one socket, to raise throughput. For training tasks on a single node, distributed training is recommended so that each training process runs on one socket.
+#
+# In general, the following command executes a PyTorch script on cores of the Nth node only, and avoids cross-socket memory access to reduce memory access overhead.
+
+# numactl --cpunodebind=N --membind=N python <pytorch_script>
+
+###############################################################################
+# More detailed descriptions can be found `here <https://software.intel.com/content/www/us/en/develop/articles/how-to-get-better-performance-on-pytorchcaffe2-with-intel-acceleration.html>`_.
+
+###############################################################################
+# Utilize OpenMP
+# ~~~~~~~~~~~~~~
+# OpenMP is utilized to bring better performance for parallel computation tasks.
+# OMP_NUM_THREADS is the easiest switch that can be used to accelerate computations. It determines the number of threads used for OpenMP computations.
+# The CPU affinity setting controls how workloads are distributed over multiple cores. It affects communication overhead, cache line invalidation overhead, and page thrashing, so a proper CPU affinity setting brings performance benefits. GOMP_CPU_AFFINITY or KMP_AFFINITY determines how to bind OpenMP* threads to physical processing units. Detailed information can be found `here <https://software.intel.com/content/www/us/en/develop/articles/how-to-get-better-performance-on-pytorchcaffe2-with-intel-acceleration.html>`_.
+
+###############################################################################
+# With the following command, PyTorch runs the task on N OpenMP threads.
+
+# export OMP_NUM_THREADS=N
+
+###############################################################################
+# Typically, the following environment variables are used to set CPU affinity with the GNU OpenMP implementation. OMP_PROC_BIND specifies whether threads may be moved between processors. Setting it to CLOSE keeps OpenMP threads close to the primary thread in contiguous place partitions. OMP_SCHEDULE determines how OpenMP threads are scheduled. GOMP_CPU_AFFINITY binds threads to specific CPUs.
+
+# export OMP_SCHEDULE=STATIC
+# export OMP_PROC_BIND=CLOSE
+# export GOMP_CPU_AFFINITY="N-M"
+
+###############################################################################
+# Intel OpenMP Runtime Library (libiomp)
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# By default, PyTorch uses GNU OpenMP (GNU libgomp) for parallel computation. On Intel platforms, the Intel OpenMP Runtime Library (libiomp) provides OpenMP API specification support. It sometimes brings more performance benefits compared to libgomp. Utilizing the environment variable LD_PRELOAD can switch the OpenMP library to libiomp:
+
+# export LD_PRELOAD=<path>/libiomp5.so:$LD_PRELOAD
+
+###############################################################################
+# Similar to CPU affinity settings in GNU OpenMP, environment variables are provided in libiomp to control CPU affinity settings.
+# KMP_AFFINITY binds OpenMP threads to physical processing units. KMP_BLOCKTIME sets the time, in milliseconds, that a thread should wait, after completing the execution of a parallel region, before sleeping. In most cases, setting KMP_BLOCKTIME to 1 or 0 yields good performance.
+# The following commands show common settings with the Intel OpenMP Runtime Library.
+
+# export KMP_AFFINITY=granularity=fine,compact,1,0
+# export KMP_BLOCKTIME=1
+
+###############################################################################
+# Switch Memory allocator
+# ~~~~~~~~~~~~~~~~~~~~~~~
+# For deep learning workloads, Jemalloc or TCMalloc can get better performance than the default malloc function by reusing memory as much as possible. `Jemalloc <https://github.com/jemalloc/jemalloc>`_ is a general purpose malloc implementation that emphasizes fragmentation avoidance and scalable concurrency support. `TCMalloc <https://google.github.io/tcmalloc/overview.html>`_ also features a couple of optimizations to speed up program executions. One of them is holding memory in caches to speed up access of commonly-used objects. Holding such caches even after deallocation also helps avoid costly system calls if such memory is later re-allocated.
+# Use the environment variable LD_PRELOAD to take advantage of one of them.
+
+# export LD_PRELOAD=<jemalloc.so/tcmalloc.so>:$LD_PRELOAD
+
+###############################################################################
+# Train a model on CPU with PyTorch DistributedDataParallel (DDP) functionality
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# For small-scale models or memory-bound models, such as DLRM, training on CPU is also a good choice. On a machine with multiple sockets, distributed training brings highly efficient hardware resource usage to accelerate the training process. `Torch-ccl <https://github.com/intel/torch-ccl>`_, optimized with Intel(R) oneCCL (collective communications library) for efficient distributed deep learning training implementing collectives such as allreduce, allgather, and alltoall, implements the PyTorch C10D ProcessGroup API and can be dynamically loaded as an external ProcessGroup. On top of the optimizations implemented in the PyTorch DDP module, torch-ccl accelerates communication operations. Besides the optimizations made to communication kernels, torch-ccl also features simultaneous computation-communication functionality.
+
 ###############################################################################
 # GPU specific optimizations
 # --------------------------
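The OMP_NUM_THREADS switch described in the OpenMP section above also has an in-process counterpart in PyTorch's thread APIs; a minimal sketch (the count 4 is an arbitrary example):

import torch

print(torch.get_num_threads())  # current intra-op parallelism thread count
torch.set_num_threads(4)        # set it before running heavy computation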

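And a minimal sketch of the CPU DDP setup the last added subsection describes, under stated assumptions: the stock "gloo" backend stands in for torch-ccl, and the script is launched with torchrun, which sets RANK, WORLD_SIZE, and MASTER_ADDR for the default env:// rendezvous:

import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

dist.init_process_group(backend="gloo")  # CPU-capable process group backend
model = torch.nn.Linear(16, 1)           # placeholder model
ddp_model = DDP(model)                   # no device_ids argument on CPU

Launch one process per socket, for example with torchrun --nproc_per_node=2, combined with the numactl and affinity settings shown above.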