@@ -58,11 +58,18 @@ class Executor {
58
58
/**
59
59
@brief constructs the executor with @c N worker threads
60
60
61
+
62
+ @param N number of workers (default std::thread::hardware_concurrency)
63
+ @param wix worker interface class to alter worker (thread) behaviors
64
+
61
65
The constructor spawns @c N worker threads to run tasks in a
62
66
work-stealing loop. The number of workers must be greater than zero
63
67
or an exception will be thrown.
64
68
By default, the number of worker threads is equal to the maximum
65
69
hardware concurrency returned by std::thread::hardware_concurrency.
70
+
71
+ Users can alter the worker behavior, such as changing thread affinity,
72
+ by deriving a class from tf::WorkerInterface.
66
73
*/
67
74
explicit Executor (
68
75
size_t N = std::thread::hardware_concurrency(),
@@ -1167,31 +1174,41 @@ inline size_t Executor::num_observers() const noexcept {
1167
1174
1168
1175
// Procedure: _schedule
1169
1176
inline void Executor::_schedule (Worker& worker, Node* node) {
1177
+
1178
+ // We need to fetch p before the release such that the read
1179
+ // operation is synchronized properly with other thread to
1180
+ // void data race.
1181
+ auto p = static_cast <unsigned >(node->_priority );
1170
1182
1171
1183
node->_state .fetch_or (Node::READY, std::memory_order_release);
1172
1184
1173
1185
// caller is a worker to this pool
1174
1186
if (worker._executor == this ) {
1175
- worker._wsq .push (node);
1187
+ worker._wsq .push (node, p );
1176
1188
return ;
1177
1189
}
1178
1190
1179
1191
{
1180
1192
std::lock_guard<std::mutex> lock (_wsq_mutex);
1181
- _wsq.push (node);
1193
+ _wsq.push (node, p );
1182
1194
}
1183
1195
1184
1196
_notifier.notify (false );
1185
1197
}
1186
1198
1187
1199
// Procedure: _schedule
1188
1200
inline void Executor::_schedule (Node* node) {
1201
+
1202
+ // We need to fetch p before the release such that the read
1203
+ // operation is synchronized properly with other thread to
1204
+ // void data race.
1205
+ auto p = static_cast <unsigned >(node->_priority );
1189
1206
1190
1207
node->_state .fetch_or (Node::READY, std::memory_order_release);
1191
1208
1192
1209
{
1193
1210
std::lock_guard<std::mutex> lock (_wsq_mutex);
1194
- _wsq.push (node);
1211
+ _wsq.push (node, p );
1195
1212
}
1196
1213
1197
1214
_notifier.notify (false );
@@ -1208,22 +1225,24 @@ inline void Executor::_schedule(Worker& worker, const SmallVector<Node*>& nodes)
1208
1225
return ;
1209
1226
}
1210
1227
1211
- // make the node ready
1212
- for (size_t i=0 ; i<num_nodes; ++i) {
1213
- nodes[i]->_state .fetch_or (Node::READY, std::memory_order_release);
1214
- }
1215
-
1228
+ // We need to fetch p before the release such that the read
1229
+ // operation is synchronized properly with other threads to
1230
+ // avoid a data race.
1216
1231
if (worker._executor == this ) {
1217
1232
for (size_t i=0 ; i<num_nodes; ++i) {
1218
- worker._wsq .push (nodes[i]);
1233
+ auto p = static_cast <unsigned >(nodes[i]->_priority );
1234
+ nodes[i]->_state .fetch_or (Node::READY, std::memory_order_release);
1235
+ worker._wsq .push (nodes[i], p);
1219
1236
}
1220
1237
return ;
1221
1238
}
1222
1239
1223
1240
{
1224
1241
std::lock_guard<std::mutex> lock (_wsq_mutex);
1225
1242
for (size_t k=0 ; k<num_nodes; ++k) {
1226
- _wsq.push (nodes[k]);
1243
+ auto p = static_cast <unsigned >(nodes[k]->_priority );
1244
+ nodes[k]->_state .fetch_or (Node::READY, std::memory_order_release);
1245
+ _wsq.push (nodes[k], p);
1227
1246
}
1228
1247
}
1229
1248
@@ -1240,15 +1259,15 @@ inline void Executor::_schedule(const SmallVector<Node*>& nodes) {
1240
1259
return ;
1241
1260
}
1242
1261
1243
- // make the node ready
1244
- for (size_t i=0 ; i<num_nodes; ++i) {
1245
- nodes[i]->_state .fetch_or (Node::READY, std::memory_order_release);
1246
- }
1247
-
1262
+ // We need to fetch p before the release such that the read
1263
+ // operation is synchronized properly with other threads to
1264
+ // avoid a data race.
1248
1265
{
1249
1266
std::lock_guard<std::mutex> lock (_wsq_mutex);
1250
1267
for (size_t k=0 ; k<num_nodes; ++k) {
1251
- _wsq.push (nodes[k]);
1268
+ auto p = static_cast <unsigned >(nodes[k]->_priority );
1269
+ nodes[k]->_state .fetch_or (Node::READY, std::memory_order_release);
1270
+ _wsq.push (nodes[k], p);
1252
1271
}
1253
1272
}
1254
1273
@@ -1376,7 +1395,9 @@ inline void Executor::_invoke(Worker& worker, Node* node) {
1376
1395
auto & j = (node->_parent ) ? node->_parent ->_join_counter :
1377
1396
node->_topology ->_join_counter ;
1378
1397
1398
+ // Here, we want to cache the latest successor with the highest priority
1379
1399
Node* cache {nullptr };
1400
+ TaskPriority max_p {TaskPriority::MAX};
1380
1401
1381
1402
// At this point, the node storage might be destructed (to be verified)
1382
1403
// case 1: non-condition task
@@ -1391,10 +1412,16 @@ inline void Executor::_invoke(Worker& worker, Node* node) {
1391
1412
// zeroing the join counter for invariant
1392
1413
s->_join_counter .store (0 , std::memory_order_relaxed);
1393
1414
j.fetch_add (1 );
1394
- if (cache) {
1395
- _schedule (worker, cache);
1415
+ if (s->_priority <= max_p) {
1416
+ if (cache) {
1417
+ _schedule (worker, cache);
1418
+ }
1419
+ cache = s;
1420
+ max_p = s->_priority ;
1421
+ }
1422
+ else {
1423
+ _schedule (worker, s);
1396
1424
}
1397
- cache = s;
1398
1425
}
1399
1426
}
1400
1427
}
@@ -1403,12 +1430,18 @@ inline void Executor::_invoke(Worker& worker, Node* node) {
1403
1430
// non-condition task
1404
1431
default : {
1405
1432
for (size_t i=0 ; i<node->_successors .size (); ++i) {
1406
- if (--( node->_successors [i]->_join_counter ) == 0 ) {
1433
+ if (auto s = node->_successors [i]; --(s ->_join_counter ) == 0 ) {
1407
1434
j.fetch_add (1 );
1408
- if (cache) {
1409
- _schedule (worker, cache);
1435
+ if (s->_priority <= max_p) {
1436
+ if (cache) {
1437
+ _schedule (worker, cache);
1438
+ }
1439
+ cache = s;
1440
+ max_p = s->_priority ;
1441
+ }
1442
+ else {
1443
+ _schedule (worker, s);
1410
1444
}
1411
- cache = node->_successors [i];
1412
1445
}
1413
1446
}
1414
1447
}
0 commit comments