@@ -58,11 +58,18 @@ class Executor {
58
58
/**
59
59
@brief constructs the executor with @c N worker threads
60
60
61
+
62
+ @param N number of workers (default std::thread::hardware_concurrency)
63
+ @param wix worker interface class to alter worker (thread) behaviors
64
+
61
65
The constructor spawns @c N worker threads to run tasks in a
62
66
work-stealing loop. The number of workers must be greater than zero
63
67
or an exception will be thrown.
64
68
By default, the number of worker threads is equal to the maximum
65
69
hardware concurrency returned by std::thread::hardware_concurrency.
70
+
71
+ Users can alter the worker behavior, such as changing thread affinity,
72
+ by deriving a class from tf::WorkerInterface.
66
73
*/
67
74
explicit Executor (
68
75
size_t N = std::thread::hardware_concurrency(),
@@ -1167,31 +1174,41 @@ inline size_t Executor::num_observers() const noexcept {
1167
1174
1168
1175
// Procedure: _schedule
1169
1176
inline void Executor::_schedule (Worker& worker, Node* node) {
1177
+
1178
+ // We need to fetch p before the release such that the read
1179
+ // operation is synchronized properly with other thread to
1180
+ // void data race.
1181
+ auto p = static_cast <unsigned >(node->_priority );
1170
1182
1171
1183
node->_state .fetch_or (Node::READY, std::memory_order_release);
1172
1184
1173
1185
// caller is a worker to this pool
1174
1186
if (worker._executor == this ) {
1175
- worker._wsq .push (node);
1187
+ worker._wsq .push (node, p );
1176
1188
return ;
1177
1189
}
1178
1190
1179
1191
{
1180
1192
std::lock_guard<std::mutex> lock (_wsq_mutex);
1181
- _wsq.push (node);
1193
+ _wsq.push (node, p );
1182
1194
}
1183
1195
1184
1196
_notifier.notify (false );
1185
1197
}
1186
1198
1187
1199
// Procedure: _schedule
1188
1200
inline void Executor::_schedule (Node* node) {
1201
+
1202
+ // We need to fetch p before the release such that the read
1203
+ // operation is synchronized properly with other thread to
1204
+ // void data race.
1205
+ auto p = static_cast <unsigned >(node->_priority );
1189
1206
1190
1207
node->_state .fetch_or (Node::READY, std::memory_order_release);
1191
1208
1192
1209
{
1193
1210
std::lock_guard<std::mutex> lock (_wsq_mutex);
1194
- _wsq.push (node);
1211
+ _wsq.push (node, p );
1195
1212
}
1196
1213
1197
1214
_notifier.notify (false );
@@ -1208,22 +1225,24 @@ inline void Executor::_schedule(Worker& worker, const SmallVector<Node*>& nodes)
1208
1225
return ;
1209
1226
}
1210
1227
1211
- // make the node ready
1212
- for (size_t i=0 ; i<num_nodes; ++i) {
1213
- nodes[i]->_state .fetch_or (Node::READY, std::memory_order_release);
1214
- }
1215
-
1228
+ // We need to fetch p before the release such that the read
1229
+ // operation is synchronized properly with other threads to
1230
+ // avoid a data race.
1216
1231
if (worker._executor == this ) {
1217
1232
for (size_t i=0 ; i<num_nodes; ++i) {
1218
- worker._wsq .push (nodes[i]);
1233
+ auto p = static_cast <unsigned >(nodes[i]->_priority );
1234
+ nodes[i]->_state .fetch_or (Node::READY, std::memory_order_release);
1235
+ worker._wsq .push (nodes[i], p);
1219
1236
}
1220
1237
return ;
1221
1238
}
1222
1239
1223
1240
{
1224
1241
std::lock_guard<std::mutex> lock (_wsq_mutex);
1225
1242
for (size_t k=0 ; k<num_nodes; ++k) {
1226
- _wsq.push (nodes[k]);
1243
+ auto p = static_cast <unsigned >(nodes[k]->_priority );
1244
+ nodes[k]->_state .fetch_or (Node::READY, std::memory_order_release);
1245
+ _wsq.push (nodes[k], p);
1227
1246
}
1228
1247
}
1229
1248
@@ -1240,15 +1259,15 @@ inline void Executor::_schedule(const SmallVector<Node*>& nodes) {
1240
1259
return ;
1241
1260
}
1242
1261
1243
- // make the node ready
1244
- for (size_t i=0 ; i<num_nodes; ++i) {
1245
- nodes[i]->_state .fetch_or (Node::READY, std::memory_order_release);
1246
- }
1247
-
1262
+ // We need to fetch p before the release such that the read
1263
+ // operation is synchronized properly with other threads to
1264
+ // avoid a data race.
1248
1265
{
1249
1266
std::lock_guard<std::mutex> lock (_wsq_mutex);
1250
1267
for (size_t k=0 ; k<num_nodes; ++k) {
1251
- _wsq.push (nodes[k]);
1268
+ auto p = static_cast <unsigned >(nodes[k]->_priority );
1269
+ nodes[k]->_state .fetch_or (Node::READY, std::memory_order_release);
1270
+ _wsq.push (nodes[k], p);
1252
1271
}
1253
1272
}
1254
1273
@@ -1376,7 +1395,9 @@ inline void Executor::_invoke(Worker& worker, Node* node) {
1376
1395
auto & j = (node->_parent ) ? node->_parent ->_join_counter :
1377
1396
node->_topology ->_join_counter ;
1378
1397
1398
+ // Here, we want to cache the latest successor with the highest priority
1379
1399
Node* cache {nullptr };
1400
+ TaskPriority max_p {TaskPriority::MAX};
1380
1401
1381
1402
// At this point, the node storage might be destructed (to be verified)
1382
1403
// case 1: non-condition task
@@ -1391,10 +1412,16 @@ inline void Executor::_invoke(Worker& worker, Node* node) {
1391
1412
// zeroing the join counter for invariant
1392
1413
s->_join_counter .store (0 , std::memory_order_relaxed);
1393
1414
j.fetch_add (1 );
1394
- if (cache) {
1395
- _schedule (worker, cache);
1415
+ if (s->_priority <= max_p) {
1416
+ if (cache) {
1417
+ _schedule (worker, cache);
1418
+ }
1419
+ cache = s;
1420
+ max_p = s->_priority ;
1421
+ }
1422
+ else {
1423
+ _schedule (worker, s);
1396
1424
}
1397
- cache = s;
1398
1425
}
1399
1426
}
1400
1427
}
@@ -1403,12 +1430,18 @@ inline void Executor::_invoke(Worker& worker, Node* node) {
1403
1430
// non-condition task
1404
1431
default : {
1405
1432
for (size_t i=0 ; i<node->_successors .size (); ++i) {
1406
- if (--( node->_successors [i]->_join_counter ) == 0 ) {
1433
+ if (auto s = node->_successors [i]; --(s ->_join_counter ) == 0 ) {
1407
1434
j.fetch_add (1 );
1408
- if (cache) {
1409
- _schedule (worker, cache);
1435
+ if (s->_priority <= max_p) {
1436
+ if (cache) {
1437
+ _schedule (worker, cache);
1438
+ }
1439
+ cache = s;
1440
+ max_p = s->_priority ;
1441
+ }
1442
+ else {
1443
+ _schedule (worker, s);
1410
1444
}
1411
- cache = node->_successors [i];
1412
1445
}
1413
1446
}
1414
1447
}
0 commit comments