Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit d88434e

Browse files
committed
MN scheduling limited to groups
1 parent b08f0b5 commit d88434e

16 files changed

Lines changed: 157 additions & 42 deletions

File tree

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@ instead of removed.
2626
If the limit is reached, the task is marked as failed.
2727
The limit can be configured by `--crash-limit` in submit.
2828

29+
* Groups of workers are introduced. A multi-node task is now started only on workers from the same group.
30+
By default, workers are grouped by PBS/Slurm allocations, but it can be configured manually.
31+
2932
## Changes
3033

3134
### Resource management

crates/pyhq/src/cluster/worker.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ impl RunningWorker {
4242
}]),
4343
listen_address: Default::default(),
4444
hostname: get_hostname(None),
45+
group: "default".to_string(),
4546
work_dir,
4647
log_dir,
4748
heartbeat_interval: Duration::from_secs(10),

crates/tako/src/internal/scheduler/state.rs

Lines changed: 28 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -333,16 +333,40 @@ impl SchedulerState {
333333
// }
334334

335335
fn try_start_multinode_tasks(&mut self, core: &mut Core) {
336+
let mut selected_workers = Vec::new();
336337
loop {
337338
// "while let" not used because of lifetime problems
338-
let (mn_queue, task_map, worker_map) = core.multi_node_queue_split_mut();
339+
let (mn_queue, task_map, worker_map, worker_groups) = core.multi_node_queue_split_mut();
339340
if let Some((task_id, _)) = mn_queue.queue.peek() {
340341
let task_id = *task_id;
341342
let task = task_map.get_task_mut(task_id);
342343
let n_nodes = task.configuration.resources.n_nodes() as usize;
343344
assert!(n_nodes > 0);
344345

345-
if worker_map.len() < n_nodes {
346+
let mut found = false;
347+
let mut big_enough = false;
348+
'outer: for group in worker_groups.values() {
349+
if group.size() < n_nodes {
350+
continue;
351+
}
352+
big_enough = true;
353+
selected_workers.clear();
354+
for worker_id in group.worker_ids() {
355+
let worker = worker_map.get_worker(worker_id);
356+
if worker.is_free() {
357+
selected_workers.push(worker_id);
358+
}
359+
if selected_workers.len() == n_nodes {
360+
found = true;
361+
break 'outer;
362+
}
363+
}
364+
}
365+
if found {
366+
mn_queue.queue.pop();
367+
self.assign_multinode(worker_map, task, std::mem::take(&mut selected_workers));
368+
continue;
369+
} else if !big_enough {
346370
log::debug!(
347371
"Multi-node task {} put into sleep. (n_nodes={}, workers={})",
348372
task_id,
@@ -352,23 +376,6 @@ impl SchedulerState {
352376
mn_queue.queue.pop();
353377
core.add_sleeping_mn_task(task_id);
354378
continue;
355-
}
356-
357-
let mut selected_workers = Vec::new();
358-
let mut found = false;
359-
for worker in worker_map.values() {
360-
if worker.is_free() {
361-
selected_workers.push(worker.id);
362-
}
363-
if selected_workers.len() == n_nodes {
364-
found = true;
365-
break;
366-
}
367-
}
368-
if found {
369-
mn_queue.queue.pop();
370-
self.assign_multinode(worker_map, task, selected_workers);
371-
continue;
372379
} else {
373380
return;
374381
}
@@ -391,13 +398,13 @@ impl SchedulerState {
391398
compute_b_level_metric(core.task_map_mut())
392399
});
393400

394-
let (multi_node_queue, task_map, _) = core.multi_node_queue_split_mut();
401+
let (multi_node_queue, task_map, _, _) = core.multi_node_queue_split_mut();
395402
multi_node_queue.recompute_priorities(task_map);
396403
}
397404

398405
let multi_node_ready_tasks = core.take_multi_node_ready_to_assign();
399406
if !multi_node_ready_tasks.is_empty() {
400-
let (multi_node_queue, task_map, _) = core.multi_node_queue_split_mut();
407+
let (multi_node_queue, task_map, _, _) = core.multi_node_queue_split_mut();
401408
for task_id in multi_node_ready_tasks {
402409
if let Some(task) = task_map.find_task(task_id) {
403410
multi_node_queue.add_task(task)

crates/tako/src/internal/server/core.rs

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,10 @@ impl Core {
8282
WorkerId::new(self.worker_id_counter)
8383
}
8484

85+
pub fn worker_groups(&self) -> &Map<String, WorkerGroup> {
86+
&self.worker_groups
87+
}
88+
8589
#[inline]
8690
pub fn is_used_task_id(&self, task_id: TaskId) -> bool {
8791
task_id <= self.maximal_task_id
@@ -93,11 +97,17 @@ impl Core {
9397

9498
pub(crate) fn multi_node_queue_split_mut(
9599
&mut self,
96-
) -> (&mut MultiNodeQueue, &mut TaskMap, &mut WorkerMap) {
100+
) -> (
101+
&mut MultiNodeQueue,
102+
&mut TaskMap,
103+
&mut WorkerMap,
104+
&Map<String, WorkerGroup>,
105+
) {
97106
(
98107
&mut self.multi_node_queue,
99108
&mut self.tasks,
100109
&mut self.workers,
110+
&self.worker_groups,
101111
)
102112
}
103113

@@ -180,13 +190,13 @@ impl Core {
180190

181191
let worker_id = worker.id;
182192
if let Some(g) = self.worker_groups.get_mut(&worker.configuration.group) {
183-
g.worker_ids.insert(worker_id);
193+
g.new_worker(worker_id);
184194
} else {
185195
let mut worker_ids = Set::new();
186196
worker_ids.insert(worker_id);
187197
self.worker_groups.insert(
188198
worker.configuration.group.clone(),
189-
WorkerGroup { worker_ids },
199+
WorkerGroup::new(worker_ids),
190200
);
191201
}
192202
self.workers.insert(worker_id, worker);
@@ -198,8 +208,8 @@ impl Core {
198208
.worker_groups
199209
.get_mut(&worker.configuration.group)
200210
.unwrap();
201-
assert!(group.worker_ids.remove(&worker_id));
202-
if group.worker_ids.is_empty() {
211+
group.remove_worker(worker_id);
212+
if group.is_empty() {
203213
self.worker_groups.remove(&worker.configuration.group);
204214
}
205215
self.workers.remove(&worker_id).unwrap()
Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,31 @@
11
use crate::{Set, WorkerId};
22

33
pub struct WorkerGroup {
4-
pub worker_ids: Set<WorkerId>,
4+
worker_ids: Set<WorkerId>,
55
}
66

77
impl WorkerGroup {
8+
pub fn new(worker_ids: Set<WorkerId>) -> Self {
9+
WorkerGroup { worker_ids }
10+
}
11+
12+
pub fn worker_ids(&self) -> impl Iterator<Item = WorkerId> + '_ {
13+
self.worker_ids.iter().copied()
14+
}
15+
816
pub fn new_worker(&mut self, worker_id: WorkerId) {
917
assert!(self.worker_ids.insert(worker_id));
1018
}
1119

1220
pub fn remove_worker(&mut self, worker_id: WorkerId) {
1321
assert!(self.worker_ids.remove(&worker_id));
1422
}
23+
24+
pub fn size(&self) -> usize {
25+
self.worker_ids.len()
26+
}
27+
28+
pub fn is_empty(&self) -> bool {
29+
self.worker_ids.is_empty()
30+
}
1531
}

crates/tako/src/internal/tests/test_reactor.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1145,7 +1145,7 @@ fn test_worker_groups() {
11451145
create_test_workers(&mut core, &[1, 1]);
11461146
let g = core.worker_group("default").unwrap();
11471147
assert_eq!(
1148-
sorted_vec(g.worker_ids.iter().copied().collect()),
1148+
sorted_vec(g.worker_ids().collect()),
11491149
vec![WorkerId::new(100), WorkerId::new(101)]
11501150
);
11511151
let mut comm = create_test_comm();
@@ -1157,7 +1157,7 @@ fn test_worker_groups() {
11571157
);
11581158
let g = core.worker_group("default").unwrap();
11591159
assert_eq!(
1160-
sorted_vec(g.worker_ids.iter().copied().collect()),
1160+
sorted_vec(g.worker_ids().collect()),
11611161
vec![WorkerId::new(100)]
11621162
);
11631163
let mut comm = create_test_comm();

crates/tako/src/internal/tests/test_scheduler_mn.rs

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,12 @@ use crate::internal::server::core::Core;
55
use crate::internal::server::task::Task;
66
use crate::internal::tests::utils::env::{create_test_comm, TestComm};
77
use crate::internal::tests::utils::schedule::{
8-
create_test_scheduler, create_test_worker, create_test_workers, finish_on_worker,
9-
submit_test_tasks,
8+
create_test_scheduler, create_test_worker, create_test_worker_config, create_test_workers,
9+
finish_on_worker, new_test_worker, submit_test_tasks,
1010
};
1111
use crate::internal::tests::utils::sorted_vec;
1212
use crate::internal::tests::utils::task::TaskBuilder;
13+
use crate::resources::{ResourceDescriptor, ResourceMap};
1314
use crate::{Priority, TaskId, WorkerId};
1415

1516
/*fn get_mn_placement(task: &Task) -> Vec<WorkerId> {
@@ -300,3 +301,37 @@ fn test_mn_sleep_wakeup_at_once() {
300301
assert!(core.task_map().get_task(1.into()).is_waiting());
301302
assert!(core.task_map().get_task(2.into()).is_mn_running());
302303
}
304+
305+
#[test]
306+
fn test_mn_schedule_on_groups() {
307+
let mut core = Core::default();
308+
309+
let worker_id = WorkerId::new(100);
310+
let mut wcfg1 = create_test_worker_config(worker_id, ResourceDescriptor::simple(1));
311+
wcfg1.group = "group1".to_string();
312+
new_test_worker(
313+
&mut core,
314+
worker_id,
315+
wcfg1,
316+
ResourceMap::from_vec(vec!["cpus".to_string()]),
317+
);
318+
319+
let worker_id = WorkerId::new(101);
320+
let mut wcfg2 = create_test_worker_config(worker_id, ResourceDescriptor::simple(1));
321+
wcfg2.group = "group2".to_string();
322+
new_test_worker(
323+
&mut core,
324+
worker_id,
325+
wcfg2,
326+
ResourceMap::from_vec(vec!["cpus".to_string()]),
327+
);
328+
329+
let mut comm = create_test_comm();
330+
let task1 = TaskBuilder::new(1).n_nodes(2).build();
331+
submit_test_tasks(&mut core, vec![task1]);
332+
333+
let mut scheduler = create_test_scheduler();
334+
scheduler.run_scheduling(&mut core, &mut comm);
335+
core.sanity_check();
336+
assert!(core.task_map().get_task(1.into()).is_waiting());
337+
}

crates/tako/src/internal/tests/utils/schedule.rs

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,12 @@ use crate::worker::{ServerLostPolicy, WorkerConfiguration};
1414
use crate::{TaskId, WorkerId};
1515
use std::time::{Duration, Instant};
1616

17-
pub fn create_test_worker(core: &mut Core, worker_id: WorkerId, cpus: u64) {
18-
let wcfg = WorkerConfiguration {
19-
resources: ResourceDescriptor::simple(cpus),
17+
pub fn create_test_worker_config(
18+
worker_id: WorkerId,
19+
resources: ResourceDescriptor,
20+
) -> WorkerConfiguration {
21+
WorkerConfiguration {
22+
resources,
2023
listen_address: format!("1.1.1.{}:123", worker_id),
2124
hostname: format!("test{}", worker_id),
2225
group: "default".to_string(),
@@ -28,14 +31,27 @@ pub fn create_test_worker(core: &mut Core, worker_id: WorkerId, cpus: u64) {
2831
time_limit: None,
2932
on_server_lost: ServerLostPolicy::Stop,
3033
extra: Default::default(),
31-
};
34+
}
35+
}
3236

33-
let worker = Worker::new(
37+
pub fn new_test_worker(
38+
core: &mut Core,
39+
worker_id: WorkerId,
40+
configuration: WorkerConfiguration,
41+
resource_map: ResourceMap,
42+
) {
43+
let worker = Worker::new(worker_id, configuration, resource_map);
44+
on_new_worker(core, &mut TestComm::default(), worker);
45+
}
46+
47+
pub fn create_test_worker(core: &mut Core, worker_id: WorkerId, cpus: u64) {
48+
let wcfg = create_test_worker_config(worker_id, ResourceDescriptor::simple(cpus));
49+
new_test_worker(
50+
core,
3451
worker_id,
3552
wcfg,
3653
ResourceMap::from_vec(vec!["cpus".to_string()]),
3754
);
38-
on_new_worker(core, &mut TestComm::default(), worker);
3955
}
4056

4157
pub fn create_test_workers(core: &mut Core, cpus: &[u64]) {

docs/deployment/worker.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,3 +159,7 @@ If you also want to include workers that are offline (i.e. that have crashed or
159159
```bash
160160
$ hq worker info <worker-id>
161161
```
162+
163+
### Worker groups
164+
165+
Each worker is a member of exactly one group. Groups are used for scheduling multi-node tasks. See more [here](../jobs/multinode.md#groups).

docs/jobs/multinode.md

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,20 @@ Hostnames of all assigned nodes can be found in file which path is in
2828
environment variable ``HQ_NODE_FILE``. Each line is one host name.
2929
The first line is always the root node.
3030

31-
3231
Note: Multi-node tasks always enable the task directory (``--task-dir``).
3332

33+
## Groups
34+
35+
A multi-node task is started only on workers that belong to the same group.
36+
By default, workers are grouped by PBS/Slurm allocations and workers outside any allocation
37+
are put in the "default" group.
38+
39+
The group of a worker can be specified when the worker is started; it may be any string. Example:
40+
41+
```commandline
42+
$ hq worker start --group my_group
43+
```
44+
3445
## Running MPI tasks
3546

3647
A script that starts an MPI program in a multi-node task may look as follows:

0 commit comments

Comments
 (0)