Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit fa00d12

Browse files
committed
Refactor worker timeout handling
1 parent 55101f4 commit fa00d12

11 files changed

Lines changed: 221 additions & 108 deletions

File tree

crates/hyperqueue/src/client/output/cli.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,12 @@ impl Output for CliOutput {
227227
reason: LostWorkerReason::Stopped,
228228
..
229229
}) => "STOPPED".cell().foreground_color(Some(Color::Magenta)),
230+
Some(WorkerExitInfo {
231+
reason: LostWorkerReason::TimeLimitReached,
232+
..
233+
}) => "TIME LIMIT REACHED"
234+
.cell()
235+
.foreground_color(Some(Color::Cyan)),
230236
},
231237
worker.configuration.hostname.cell(),
232238
worker.configuration.resources.summary(false).cell(),

crates/hyperqueue/src/client/output/quiet.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,10 @@ impl Output for Quiet {
4242
reason: LostWorkerReason::IdleTimeout,
4343
..
4444
}) => "IDLE TIMEOUT",
45+
Some(WorkerExitInfo {
46+
reason: LostWorkerReason::TimeLimitReached,
47+
..
48+
}) => "TIME LIMIT REACHED",
4549
Some(WorkerExitInfo {
4650
reason: LostWorkerReason::Stopped,
4751
..

crates/hyperqueue/src/server/state.rs

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
use std::collections::BTreeMap;
22

33
use tako::messages::gateway::{
4-
CancelTasks, FromGatewayMessage, LostWorkerMessage, LostWorkerReason, NewWorkerMessage,
5-
TaskFailedMessage, TaskState, TaskUpdate, ToGatewayMessage,
4+
CancelTasks, FromGatewayMessage, LostWorkerMessage, NewWorkerMessage, TaskFailedMessage,
5+
TaskState, TaskUpdate, ToGatewayMessage,
66
};
77

88
use crate::server::autoalloc::AutoAllocState;
@@ -219,12 +219,7 @@ impl State {
219219
pub fn process_worker_lost(&mut self, msg: LostWorkerMessage) {
220220
log::debug!("Worker lost id={}", msg.worker_id);
221221
let worker = self.workers.get_mut(&msg.worker_id).unwrap();
222-
worker.set_offline_state(match msg.reason {
223-
LostWorkerReason::Stopped => LostWorkerReason::Stopped,
224-
LostWorkerReason::ConnectionLost => LostWorkerReason::ConnectionLost,
225-
LostWorkerReason::HeartbeatLost => LostWorkerReason::HeartbeatLost,
226-
LostWorkerReason::IdleTimeout => LostWorkerReason::IdleTimeout,
227-
});
222+
worker.set_offline_state(msg.reason.clone());
228223
for task_id in msg.running_tasks {
229224
let job = self.get_job_mut_by_tako_task_id(task_id).unwrap();
230225
job.set_waiting_state(task_id);

crates/tako/src/messages/common.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,17 @@ pub struct WorkerConfiguration {
9696
pub extra: Map<String, String>,
9797
}
9898

99+
/// Fills in `configuration.idle_timeout` with the server-wide default when the worker did not set one itself.
100+
/// It is called by both the server and the worker so that both sides end up with the same configuration values without a repeated configuration exchange.
101+
pub fn sync_worker_configuration(
102+
configuration: &mut WorkerConfiguration,
103+
server_idle_timeout: Option<Duration>,
104+
) {
105+
if configuration.idle_timeout.is_none() {
106+
configuration.idle_timeout = server_idle_timeout;
107+
}
108+
}
109+
99110
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
100111
pub struct CpuStats {
101112
pub cpu_per_core_percent_usage: Vec<f32>,

crates/tako/src/messages/gateway.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,7 @@ pub enum LostWorkerReason {
198198
ConnectionLost,
199199
HeartbeatLost,
200200
IdleTimeout,
201+
TimeLimitReached,
201202
}
202203

203204
#[derive(Serialize, Deserialize, Debug)]

crates/tako/src/messages/worker.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ pub struct WorkerRegistrationResponse {
2626
pub worker_id: WorkerId,
2727
pub worker_addresses: Map<WorkerId, String>,
2828
pub resource_names: Vec<String>,
29+
pub server_idle_timeout: Option<Duration>,
2930
}
3031

3132
#[derive(Serialize, Deserialize, Debug)]
@@ -141,6 +142,12 @@ pub struct WorkerOverview {
141142
pub hw_state: Option<WorkerHwStateMessage>,
142143
}
143144

145+
#[derive(Serialize, Deserialize, Debug)]
146+
pub enum WorkerStopReason {
147+
IdleTimeout,
148+
TimeLimitReached,
149+
}
150+
144151
#[derive(Serialize, Deserialize, Debug)]
145152
//#[serde(tag = "op")]
146153
pub enum FromWorkerMessage {
@@ -151,4 +158,5 @@ pub enum FromWorkerMessage {
151158
StealResponse(StealResponseMsg),
152159
Overview(WorkerOverview),
153160
Heartbeat,
161+
Stop(WorkerStopReason),
154162
}

crates/tako/src/server/rpc.rs

Lines changed: 23 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,11 @@ use tokio::time::timeout;
1010
use tokio_util::codec::{Framed, LengthDelimitedCodec};
1111

1212
use crate::common::error::DsError;
13+
use crate::messages::common::sync_worker_configuration;
1314
use crate::messages::gateway::LostWorkerReason;
1415
use crate::messages::worker::{
1516
ConnectionRegistration, FromWorkerMessage, RegisterWorker, WorkerRegistrationResponse,
17+
WorkerStopReason,
1618
};
1719
use crate::server::comm::{Comm, CommSenderRef};
1820
use crate::server::core::CoreRef;
@@ -95,7 +97,7 @@ pub async fn worker_authentication(
9597

9698
let message_data = timeout(Duration::from_secs(15), reader.next())
9799
.await
98-
.map_err(|_| "Worker registration did not arrived")?
100+
.map_err(|_| "Worker registration did not arrive")?
99101
.ok_or_else(|| {
100102
DsError::from("The remote side closed connection without worker registration")
101103
})??;
@@ -133,10 +135,8 @@ async fn worker_rpc_loop(
133135
assert!(heartbeat_interval.as_millis() > 150);
134136

135137
let mut configuration = msg.configuration;
136-
// Update idle_timeout configuration from server default
137-
if configuration.idle_timeout.is_none() {
138-
configuration.idle_timeout = *core_ref.get().idle_timeout();
139-
}
138+
sync_worker_configuration(&mut configuration, *core_ref.get().idle_timeout());
139+
140140
let idle_timeout = configuration.idle_timeout;
141141
let (queue_sender, queue_receiver) = tokio::sync::mpsc::unbounded_channel::<Bytes>();
142142

@@ -158,6 +158,7 @@ async fn worker_rpc_loop(
158158
worker_id,
159159
worker_addresses: core_ref.get().get_worker_addresses(),
160160
resource_names: core_ref.get().create_resource_map().into_vec(),
161+
server_idle_timeout: *core_ref.get().idle_timeout(),
161162
};
162163
queue_sender
163164
.send(serialize(&message).unwrap().into())
@@ -179,22 +180,10 @@ async fn worker_rpc_loop(
179180
loop {
180181
interval.tick().await;
181182
let now = Instant::now();
182-
let mut core = core_ref.get_mut();
183-
let mut worker = core.get_worker_mut_by_id_or_panic(worker_id);
183+
let core = core_ref.get();
184+
let worker = core.get_worker_by_id_or_panic(worker_id);
184185
let elapsed = now - worker.last_heartbeat;
185186

186-
if let Some(timeout) = worker.configuration.idle_timeout {
187-
if worker.tasks().is_empty() {
188-
let elapsed = now - worker.last_occupied;
189-
if elapsed > timeout {
190-
log::debug!("Idle timeout, worker={}", worker.id);
191-
break LostWorkerReason::IdleTimeout;
192-
}
193-
} else {
194-
worker.last_occupied = now;
195-
}
196-
}
197-
198187
if elapsed > heartbeat_interval * 2 {
199188
log::debug!("Heartbeat not arrived, worker={}", worker.id);
200189
break LostWorkerReason::HeartbeatLost;
@@ -203,9 +192,16 @@ async fn worker_rpc_loop(
203192
};
204193

205194
let reason = tokio::select! {
206-
e = worker_receive_loop(core_ref.clone(), comm_ref.clone(), worker_id, connection.receiver, connection.opener) => {
207-
log::debug!("Receive loop terminated ({:?}), worker={}", e, worker_id);
208-
LostWorkerReason::ConnectionLost
195+
result = worker_receive_loop(core_ref.clone(), comm_ref.clone(), worker_id, connection.receiver, connection.opener) => {
196+
log::debug!("Receive loop terminated ({result:?}), worker={worker_id}");
197+
if let Ok(Some(reason)) = result {
198+
match reason {
199+
WorkerStopReason::IdleTimeout => LostWorkerReason::IdleTimeout,
200+
WorkerStopReason::TimeLimitReached => LostWorkerReason::TimeLimitReached
201+
}
202+
} else {
203+
LostWorkerReason::ConnectionLost
204+
}
209205
}
210206
e = snd_loop => {
211207
log::debug!("Sending loop terminated: {:?}, worker={}", e, worker_id);
@@ -245,7 +241,7 @@ pub async fn worker_receive_loop<
245241
worker_id: WorkerId,
246242
mut receiver: Reader,
247243
mut opener: Option<StreamOpener>,
248-
) -> crate::Result<()> {
244+
) -> crate::Result<Option<WorkerStopReason>> {
249245
while let Some(message) = receiver.next().await {
250246
let message: FromWorkerMessage = open_message(&mut opener, &message?)?;
251247
let mut core = core_ref.get_mut();
@@ -275,7 +271,10 @@ pub async fn worker_receive_loop<
275271
FromWorkerMessage::Overview(overview) => {
276272
comm.send_client_worker_overview(overview);
277273
}
274+
FromWorkerMessage::Stop(reason) => {
275+
return Ok(Some(reason));
276+
}
278277
}
279278
}
280-
Ok(())
279+
Ok(None)
281280
}

crates/tako/src/server/worker.rs

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,13 +37,11 @@ pub struct Worker {
3737

3838
// COLD DATA move it into a box (?)
3939
pub last_heartbeat: std::time::Instant,
40-
pub last_occupied: std::time::Instant,
4140
pub configuration: WorkerConfiguration,
4241
}
4342

4443
impl fmt::Debug for Worker {
4544
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
46-
//let task_ids : Vec<_> = self.tasks.iter().map(|r| r.get().id.to_string()).collect();
4745
f.debug_struct("Worker")
4846
.field("id", &self.id)
4947
.field("resources", &self.configuration.resources)
@@ -171,7 +169,6 @@ impl Worker {
171169
tasks: Default::default(),
172170
flags: WorkerFlags::empty(),
173171
last_heartbeat: now,
174-
last_occupied: now,
175172
}
176173
}
177174
}

0 commit comments

Comments
 (0)