Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit d1cfd64

Browse files
committed
Handle transition from waiting to failed state in HQ
This can happen if the task failed to start (e.g. if stdout/stderr file path preparation has failed).
1 parent e7aa4fb commit d1cfd64

7 files changed

Lines changed: 81 additions & 23 deletions

File tree

crates/hyperqueue/src/client/output/cli.rs

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ use crate::common::env::is_hq_env;
1111
use crate::common::format::{human_duration, human_mem_amount, human_size};
1212
use crate::common::manager::info::GetManagerInfo;
1313
use crate::server::autoalloc::{Allocation, AllocationState};
14-
use crate::server::job::{JobTaskCounters, JobTaskInfo, JobTaskState, StartedTaskData};
14+
use crate::server::job::{JobTaskCounters, JobTaskInfo, JobTaskState};
1515
use crate::stream::reader::logfile::Summary;
1616
use crate::transfer::messages::{
1717
AutoAllocListResponse, JobDescription, JobDetail, JobInfo, PinMode, QueueData, QueueState,
@@ -176,14 +176,20 @@ impl CliOutput {
176176
.iter()
177177
.filter_map(|t: &JobTaskInfo| match &t.state {
178178
JobTaskState::Failed {
179-
started_data: StartedTaskData { worker_ids, .. },
179+
started_data,
180180
error,
181181
..
182-
} => Some(vec![
183-
t.task_id.cell(),
184-
format_workers(worker_ids, worker_map).cell(),
185-
error.to_owned().cell().foreground_color(Some(Color::Red)),
186-
]),
182+
} => {
183+
let worker_ids = started_data
184+
.as_ref()
185+
.map(|data| data.worker_ids.as_slice())
186+
.unwrap_or(&[]);
187+
Some(vec![
188+
t.task_id.cell(),
189+
format_workers(worker_ids, worker_map).cell(),
190+
error.to_owned().cell().foreground_color(Some(Color::Red)),
191+
])
192+
}
187193
_ => None,
188194
})
189195
.take(SHOWN_TASKS)
@@ -1262,14 +1268,17 @@ fn get_task_time(state: &JobTaskState) -> (Option<DateTime<Utc>>, Option<DateTim
12621268
..
12631269
}
12641270
| JobTaskState::Failed {
1265-
started_data,
1271+
started_data: Some(started_data),
12661272
end_date,
12671273
..
12681274
} => (Some(started_data.start_date), Some(*end_date)),
12691275
JobTaskState::Canceled {
12701276
started_data: None,
12711277
cancelled_date: _,
12721278
}
1279+
| JobTaskState::Failed {
1280+
started_data: None, ..
1281+
}
12731282
| JobTaskState::Waiting => (None, None),
12741283
}
12751284
}

crates/hyperqueue/src/client/output/common.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,10 @@ pub fn resolve_task_paths(job: &JobDetail, server_uid: &str) -> TaskToPathsMap {
4141
}
4242
| JobTaskState::Running { started_data, .. }
4343
| JobTaskState::Finished { started_data, .. }
44-
| JobTaskState::Failed { started_data, .. } => {
44+
| JobTaskState::Failed {
45+
started_data: Some(started_data),
46+
..
47+
} => {
4548
let ctx = CompletePlaceholderCtx {
4649
job_id: job.info.id,
4750
task_id: task.task_id,

crates/hyperqueue/src/client/output/json.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -342,7 +342,9 @@ fn format_tasks(tasks: Vec<JobTaskInfo>, map: TaskToPathsMap) -> serde_json::Val
342342
end_date,
343343
error,
344344
} => {
345-
fill_task_started_data(&mut data, started_data);
345+
if let Some(started_data) = started_data {
346+
fill_task_started_data(&mut data, started_data);
347+
}
346348
data["finished_at"] = format_datetime(end_date);
347349
data["error"] = error.into();
348350
}

crates/hyperqueue/src/server/job.rs

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ pub enum JobTaskState {
3838
end_date: DateTime<Utc>,
3939
},
4040
Failed {
41-
started_data: StartedTaskData,
41+
started_data: Option<StartedTaskData>,
4242
end_date: DateTime<Utc>,
4343
error: String,
4444
},
@@ -52,9 +52,9 @@ impl JobTaskState {
5252
pub fn started_data(&self) -> Option<&StartedTaskData> {
5353
match self {
5454
JobTaskState::Running { started_data, .. }
55-
| JobTaskState::Finished { started_data, .. }
56-
| JobTaskState::Failed { started_data, .. } => Some(started_data),
57-
JobTaskState::Canceled { started_data, .. } => started_data.as_ref(),
55+
| JobTaskState::Finished { started_data, .. } => Some(started_data),
56+
JobTaskState::Failed { started_data, .. }
57+
| JobTaskState::Canceled { started_data, .. } => started_data.as_ref(),
5858
_ => None,
5959
}
6060
}
@@ -334,20 +334,29 @@ impl Job {
334334
}
335335

336336
pub fn set_failed_state(&mut self, tako_task_id: TakoTaskId, error: String, backend: &Backend) {
337-
let (_, state) = self.get_task_state_mut(tako_task_id);
337+
let (task_id, state) = self.get_task_state_mut(tako_task_id);
338338
let now = Utc::now();
339339
match state {
340340
JobTaskState::Running { started_data } => {
341341
*state = JobTaskState::Failed {
342342
error,
343-
started_data: started_data.clone(),
343+
started_data: Some(started_data.clone()),
344344
end_date: now,
345345
};
346+
346347
self.counters.n_running_tasks -= 1;
347-
self.counters.n_failed_tasks += 1;
348348
}
349-
_ => panic!("Invalid worker state, expected Running, got {state:?}"),
349+
JobTaskState::Waiting => {
350+
*state = JobTaskState::Failed {
351+
error,
352+
started_data: None,
353+
end_date: now,
354+
}
355+
}
356+
_ => panic!("Invalid task {task_id} state, expected Running or Waiting, got {state:?}"),
350357
}
358+
self.counters.n_failed_tasks += 1;
359+
351360
self.check_termination(backend, now);
352361
}
353362

crates/hyperqueue/src/worker/start.rs

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,13 @@ impl TaskLauncher for HqTaskLauncher {
9494
pin_program(&mut program, launch_ctx.allocation(), pin_mode, &launch_ctx)?;
9595

9696
let task_dir = if task_dir {
97-
let task_dir = TempDir::new_in(&launch_ctx.worker_configuration().work_dir, "t")?;
97+
let task_dir = TempDir::new_in(&launch_ctx.worker_configuration().work_dir, "t")
98+
.map_err(|error| {
99+
format!(
100+
"Cannot create task_dir in worker's workdir at {}: {error:?}",
101+
launch_ctx.worker_configuration().work_dir.display()
102+
)
103+
})?;
98104
program.env.insert(
99105
HQ_TASK_DIR.into(),
100106
task_dir.path().to_string_lossy().to_string().into(),
@@ -108,7 +114,12 @@ impl TaskLauncher for HqTaskLauncher {
108114
);
109115
if !launch_ctx.node_list().is_empty() {
110116
let filename = task_dir.path().join("hq-nodelist");
111-
write_node_file(&launch_ctx, &filename)?;
117+
write_node_file(&launch_ctx, &filename).map_err(|error| {
118+
format!(
119+
"Cannot write node file at {}: {error:?}",
120+
filename.display()
121+
)
122+
})?;
112123
program.env.insert(
113124
HQ_NODE_FILE.into(),
114125
filename.to_string_lossy().to_string().into(),
@@ -140,8 +151,18 @@ impl TaskLauncher for HqTaskLauncher {
140151
let paths = ResolvablePaths::from_program_def(&mut program);
141152
fill_placeholders_in_paths(paths, ctx);
142153

143-
create_directory_if_needed(&program.stdout)?;
144-
create_directory_if_needed(&program.stderr)?;
154+
create_directory_if_needed(&program.stdout).map_err(|error| {
155+
format!(
156+
"Cannot create stdout directory at {:?}: {error:?}",
157+
program.stdout
158+
)
159+
})?;
160+
create_directory_if_needed(&program.stderr).map_err(|error| {
161+
format!(
162+
"Cannot create stderr directory at {:?}: {error:?}",
163+
program.stderr
164+
)
165+
})?;
145166

146167
(program, job_id, task_id, launch_ctx.instance_id(), task_dir)
147168
};

tests/conftest.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,11 +164,13 @@ def start_worker(
164164
wait_for_start=True,
165165
on_server_lost="stop",
166166
server_dir=None,
167+
work_dir: Optional[str] = None,
167168
) -> subprocess.Popen:
168169
self.id_counter += 1
169170
worker_id = self.id_counter
170171
worker_env = self.make_default_env()
171-
work_dir = f"workdir{worker_id}"
172+
if work_dir is None:
173+
work_dir = f"workdir{worker_id}"
172174

173175
if env:
174176
worker_env.update(env)

tests/test_job.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1431,6 +1431,18 @@ def stop_worker(worker_process):
14311431
check_child_process_exited(hq_env, stop_worker)
14321432

14331433

1434+
def test_fail_to_start_issue629(hq_env: HqEnv, tmpdir):
1435+
"""
1436+
Regression test for https://github.com/It4innovations/hyperqueue/issues/629.
1437+
By using an invalid stdout path, we should cause task spawning to fail, which should be handled gracefully.
1438+
"""
1439+
hq_env.start_server()
1440+
hq_env.start_worker()
1441+
1442+
hq_env.command(["submit", "--stdout=/dev/null/foo.txt", "ls"])
1443+
wait_for_job_state(hq_env, 1, "FAILED")
1444+
1445+
14341446
def check_child_process_exited(hq_env: HqEnv, stop_fn: Callable[[subprocess.Popen], None]):
14351447
"""
14361448
Creates a task that spawns a child, and then calls `stop_fn`, which should kill either the task

0 commit comments

Comments
 (0)