apache · DickJC123 · Sep 23, 2021 · Jul 9, 2020 · Aug 31, 2021 · Aug 18, 2020
@@ -3153,6 +3153,29 @@ MXNET_DLL int MXEnginePushSyncND(EngineSyncFunc sync_func, void* func_param,
 MXNET_DLL int MXCheckDynamicShapeOp(SymbolHandle sym_handle,
                                     bool* has_dynamic_shape);
 
+/*!
+  * \brief Push a new NVTX range. Requires building with CUDA and NVTX.
+  * \param name Name of the range.
+  * \param color Color used to display the range in the visual profiling tools.
+  *              Encoded as 256*256*R + 256*G + B.
+  */
+MXNET_DLL int MXNVTXRangePush(const char * name, mx_uint color);
+
+/*!
+  * \brief Pop (end) the most recently pushed NVTX range. Requires building with CUDA and NVTX.
+  */
+MXNET_DLL int MXNVTXRangePop();
+
+/*!
+  * \brief Start a CUDA profiling session. Requires building with CUDA and NVTX.
+  */
+MXNET_DLL int MXCUDAProfilerStart();
+
+/*!
+  * \brief End the CUDA profiling session. Requires building with CUDA and NVTX.
+  */
+MXNET_DLL int MXCUDAProfilerStop();
+
 #ifdef __cplusplus
 }
 #endif  // __cplusplus

@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Provides python interface to CUDA-related functions of the MXNet library"""
+
+from ..base import _LIB, check_call
+from . import nvtx
+
+def cuda_profiler_start():
+    """Start the CUDA profiler through the ``MXCUDAProfilerStart`` C API.
+
+    The status returned by the library call is handed to ``check_call``.
+    """
+    status = _LIB.MXCUDAProfilerStart()
+    check_call(status)
+
+def cuda_profiler_stop():
+    """Stop the CUDA profiler through the ``MXCUDAProfilerStop`` C API.
+
+    The status returned by the library call is handed to ``check_call``.
+    """
+    status = _LIB.MXCUDAProfilerStop()
+    check_call(status)
@@ -0,0 +1,53 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Utilities for NVTX usage in MXNet"""
+
+from ..base import _LIB, mx_uint, c_str, check_call
+
+# Palette of colors
+RED = 0xFF0000
+GREEN = 0x00FF00
+BLUE = 0x0000FF
+YELLOW = 0xB58900
+ORANGE = 0xCB4B16
+RED1 = 0xDC322F
+MAGENTA = 0xD33682
+VIOLET = 0x6C71C4
+BLUE1 = 0x268BD2
+CYAN = 0x2AA198
+GREEN1 = 0x859900
+
+def range_push(name, color=ORANGE):
+    """Push a new named NVTX range through the ``MXNVTXRangePush`` C API.
+
+    Parameters
+    ----------
+    name
+        Name of the range; converted with ``c_str`` before the call.
+    color
+        Color of the range, wrapped in ``mx_uint``. Defaults to ``ORANGE``.
+    """
+    push_fn = _LIB.MXNVTXRangePush
+    status = push_fn(c_str(name), mx_uint(color))
+    check_call(status)
+
+def range_pop():
+    """Pop an NVTX range through the ``MXNVTXRangePop`` C API.
+
+    The status returned by the library call is handed to ``check_call``.
+    """
+    status = _LIB.MXNVTXRangePop()
+    check_call(status)
+
+class range:
+    """Context manager that wraps the enclosed block in a named NVTX range.
+
+    ``range_push(name, color)`` is called on entry and ``range_pop()`` on exit.
+    Note that, inside this module, the class name shadows the built-in ``range``.
+
+    Parameters
+    ----------
+    name
+        Name of the range, forwarded to ``range_push``.
+    color
+        Color of the range, forwarded to ``range_push``. Defaults to ``ORANGE``.
+    """
+    def __init__(self, name, color=ORANGE):
+        self.name = name
+        self.color = color
+
+    def __enter__(self):
+        range_push(self.name, self.color)
+        # Return the instance so that ``with range(...) as r`` binds the manager
+        # rather than None.
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        # Nothing is returned, so an exception raised in the body keeps propagating.
+        range_pop()
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
@@ -78,6 +78,11 @@
 #include <x86intrin.h>
 #endif
 
+#if MXNET_USE_CUDA
+#include <cuda_profiler_api.h>
+#endif
+#include "../common/cuda/nvtx.h"
+
 using namespace mxnet;
 
 // Internal function to get the information
@@ -3939,3 +3944,43 @@ int MXShallowCopyNDArray(NDArrayHandle src_handle, NDArrayHandle* out) {
   *out               = ret;
   API_END_HANDLE_ERROR(delete ret);
 }
+
+// Pushes a new NVTX range with the given name and color.
+// Only functional when built with both CUDA and NVTX; otherwise logs a fatal error.
+int MXNVTXRangePush(const char * name, mx_uint color) {
+  API_BEGIN();
+#if MXNET_USE_CUDA && MXNET_USE_NVTX
+  // gpuRangeStart() takes a const std::string&; constructing one from a null
+  // pointer is undefined behavior, so reject it explicitly.
+  CHECK(name != nullptr) << "MXNVTXRangePush: name must not be null.";
+  mxnet::common::cuda::nvtx::gpuRangeStart(color, name);
+#else
+  LOG(FATAL) << "Compile with USE_CUDA=1 and USE_NVTX=1 to have NVTX support.";
+#endif
+  API_END();
+}
+
+// Pops an NVTX range via nvtx::gpuRangeStop().
+// Builds without CUDA or without NVTX log a fatal error instead.
+int MXNVTXRangePop() {
+  API_BEGIN();
+#if !(MXNET_USE_CUDA && MXNET_USE_NVTX)
+  LOG(FATAL) << "Compile with USE_CUDA=1 and USE_NVTX=1 to have NVTX support.";
+#else
+  mxnet::common::cuda::nvtx::gpuRangeStop();
+#endif
+  API_END();
+}
+
+// Starts a CUDA profiling session.
+// Only functional when built with both CUDA and NVTX; otherwise logs a fatal error.
+int MXCUDAProfilerStart() {
+  API_BEGIN();
+#if MXNET_USE_CUDA && MXNET_USE_NVTX
+  // cudaProfilerStart() reports failure through its return code; surface it
+  // instead of silently dropping it.
+  const cudaError_t err = cudaProfilerStart();
+  CHECK_EQ(err, cudaSuccess) << "cudaProfilerStart failed with CUDA error code "
+                             << static_cast<int>(err);
+#else
+  LOG(FATAL) << "Compile with USE_CUDA=1 and USE_NVTX=1 to have CUDA profiler support.";
+#endif
+  API_END();
+}
+
+// Stops the CUDA profiling session.
+// Only functional when built with both CUDA and NVTX; otherwise logs a fatal error.
+int MXCUDAProfilerStop() {
+  API_BEGIN();
+#if MXNET_USE_CUDA && MXNET_USE_NVTX
+  // cudaProfilerStop() reports failure through its return code; surface it
+  // instead of silently dropping it.
+  const cudaError_t err = cudaProfilerStop();
+  CHECK_EQ(err, cudaSuccess) << "cudaProfilerStop failed with CUDA error code "
+                             << static_cast<int>(err);
+#else
+  LOG(FATAL) << "Compile with USE_CUDA=1 and USE_NVTX=1 to have CUDA Profiler support.";
+#endif
+  API_END();
+}
diff --git a/src/common/cuda/nvtx.h b/src/common/cuda/nvtx.h
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef MXNET_COMMON_CUDA_NVTX_H_
+#define MXNET_COMMON_CUDA_NVTX_H_
+
+#if MXNET_USE_CUDA && MXNET_USE_NVTX
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <nvToolsExtCuda.h>
+#include <vector>
+#include <string>
+#include <cstring>
+
+namespace mxnet {
+namespace common {
+namespace cuda {
+
+// A named NVTX range that is opened with start() and closed with stop().
+// Only the pointer to the name is stored (no copy is made), so the string it
+// points to must still be valid whenever start() is called.
+class NVTXDuration {
+ public:
+  explicit NVTXDuration(const char *name) noexcept
+      : name_(name) {}
+
+  // Opens the range and remembers the id returned by nvtxRangeStartA().
+  void start() {
+    range_id_ = nvtxRangeStartA(name_);
+  }
+
+  // Closes the range identified by the id stored by the last start().
+  void stop() {
+    nvtxRangeEnd(range_id_);
+  }
+
+ private:
+  nvtxRangeId_t range_id_ = 0;
+  const char *name_;
+};
+
+// Utility class for NVTX: static helpers to push/pop colored ranges and to
+// derive a palette color from a range name.
+class nvtx {
+ public:
+  // Palette of colors (make sure to add new colors to the vector in nameToColor()).
+  static const uint32_t kRed     = 0xFF0000;
+  static const uint32_t kGreen   = 0x00FF00;
+  static const uint32_t kBlue    = 0x0000FF;
+  static const uint32_t kYellow  = 0xB58900;
+  static const uint32_t kOrange  = 0xCB4B16;
+  static const uint32_t kRed1    = 0xDC322F;
+  static const uint32_t kMagenta = 0xD33682;
+  static const uint32_t kViolet  = 0x6C71C4;
+  static const uint32_t kBlue1   = 0x268BD2;
+  static const uint32_t kCyan    = 0x2AA198;
+  static const uint32_t kGreen1  = 0x859900;
+
+  // Pushes a new NVTX range named `range_name`, drawn with color `rgb`
+  // (the alpha byte is forced to 0xff).
+  static void gpuRangeStart(const uint32_t rgb, const std::string& range_name) {
+    // Value-initialize so that the fields not assigned below (category, payload,
+    // reserved) are zero rather than indeterminate when handed to NVTX.
+    nvtxEventAttributes_t att = {};
+    att.version = NVTX_VERSION;
+    att.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+    att.colorType = NVTX_COLOR_ARGB;
+    att.color = rgb | 0xff000000;
+    att.messageType = NVTX_MESSAGE_TYPE_ASCII;
+    att.message.ascii = range_name.c_str();
+    nvtxRangePushEx(&att);
+  }
+
+  // Utility to map a range name prefix to a random color based on its hash.
+  // Only the first `prefix_len` characters of `range_name` are hashed, so names
+  // sharing that prefix get the same palette color.
+  static uint32_t nameToColor(const std::string& range_name, int prefix_len) {
+    // The table is never modified after construction.
+    static const std::vector<uint32_t> colors{kRed, kGreen, kBlue, kYellow, kOrange, kRed1,
+                                              kMagenta, kViolet, kBlue1, kCyan, kGreen1};
+    const std::string prefix(range_name, 0, prefix_len);
+    const std::hash<std::string> hash_fn;
+    return colors[hash_fn(prefix) % colors.size()];
+  }
+
+  // Utility to map a range name to a random color based on its hash
+  static uint32_t nameToColor(const std::string& range_name) {
+    return nameToColor(range_name, range_name.size());
+  }
+
+  // Pops an NVTX range via nvtxRangePop().
+  static void gpuRangeStop() {
+    nvtxRangePop();
+  }
+};
+
+}  // namespace cuda
+}  // namespace common
+}  // namespace mxnet
+
+#endif  // MXNET_USE_CUDA && MXNET_USE_NVTX
+#endif  // MXNET_COMMON_CUDA_NVTX_H_
diff --git a/src/engine/threaded_engine_perdevice.cc b/src/engine/threaded_engine_perdevice.cc
@@ -35,6 +35,7 @@
 #include "./thread_pool.h"
 #include "../common/lazy_alloc_array.h"
 #include "../common/utils.h"
+#include "../common/cuda/nvtx.h"
 
 namespace mxnet {
 namespace engine {
@@ -275,7 +276,19 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
     OpenMP::Get()->on_start_worker_thread(false);
 
     while (task_queue->Pop(&opr_block)) {
+#if MXNET_USE_NVTX
+      auto nvtx_name = opr_block->opr->opr_name != "" ? opr_block->opr->opr_name : "Op";
+      auto end_pos = nvtx_name.find('{');
+      auto name_prefix_len = end_pos != std::string::npos
+                             ? end_pos
+                             : nvtx_name.size();
+      auto color = common::cuda::nvtx::nameToColor(nvtx_name, name_prefix_len);
+      common::cuda::nvtx::gpuRangeStart(color, nvtx_name);
+#endif
       this->ExecuteOprBlock(run_ctx, opr_block);
+#if MXNET_USE_NVTX
+      common::cuda::nvtx::gpuRangeStop();
+#endif
     }
 #else
     ready_event->signal();

diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h
@@ -1282,7 +1282,7 @@ inline void CreateEngineOpSeg(const nnvm::IndexedGraph& idx,
                               std::vector<EngineOprSeg>* opr_segs) {
   size_t seg_start = start_nid;
   std::vector<std::shared_ptr<exec::OpExecutor> > seg_execs;
-  std::string opr_names;
+  std::string opr_names = "[";
   for (size_t nid = start_nid; nid < end_nid; ++nid) {
     const auto& node = idx[nid];
     if (node.source->is_variable())
@@ -1302,6 +1302,8 @@ inline void CreateEngineOpSeg(const nnvm::IndexedGraph& idx,
       auto& seg = (*opr_segs)[seg_start];
       if (seg_execs.size()) {
         seg = EngineOprSeg{false, nid};
+        opr_names.pop_back();
+        opr_names += "]";
         seg.opr.reset(CreateEngineOp(default_ctx, seg_execs, opr_names.c_str()));
       } else {
         seg = EngineOprSeg{true, nid, nullptr};
@@ -1312,9 +1314,18 @@ inline void CreateEngineOpSeg(const nnvm::IndexedGraph& idx,
     }
 
     seg_execs.push_back(exec);
-    if (opr_names.size())
-      opr_names += ",";
+
+    const auto& inode = idx[nid];
     opr_names += op_name;
+    opr_names += "{name=" + inode.source->attrs.name + ";";
+    const std::unordered_map<std::string, std::string> &dict = inode.source->attrs.dict;
+    auto num_dict_entries = dict.size();
+    for (auto &k : dict) {
+      opr_names += k.first + "=" + k.second;
+      if (--num_dict_entries != 0)
+        opr_names += ";";
+    }
+    opr_names += "},";
 
     auto& seg = (*opr_segs)[nid];
     if (!valid) {
@@ -1324,6 +1335,8 @@ inline void CreateEngineOpSeg(const nnvm::IndexedGraph& idx,
       seg_start = nid + 1;
     } else if (is_async) {
       seg = EngineOprSeg{false, nid + 1};
+      opr_names.pop_back();
+      opr_names += "]";
       seg.opr.reset(CreateEngineOp(default_ctx, seg_execs, opr_names.c_str()));
       seg_execs.clear();
       opr_names.clear();
@@ -1335,6 +1348,8 @@ inline void CreateEngineOpSeg(const nnvm::IndexedGraph& idx,
     auto& seg = (*opr_segs)[seg_start];
     if (seg_execs.size()) {
       seg = EngineOprSeg{false, end_nid};
+      opr_names.pop_back();
+      opr_names += "]";
       seg.opr.reset(CreateEngineOp(default_ctx, seg_execs, opr_names.c_str()));
     } else {
       seg = EngineOprSeg{true, end_nid, nullptr};

diff --git a/src/profiler/nvtx.cc b/src/profiler/nvtx.cc