Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion docker/docker_jit_monitor/src/github_api.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use std::process::{Command, Output};
use std::process::{Command, ExitCode, Output};

use log::{debug, warn};
use serde::Deserialize;
Expand Down Expand Up @@ -97,6 +97,13 @@ pub(crate) fn spawn_runner(config: RunnerConfig) -> Result<DockerContainer, Spaw
.arg(encoded_jit_config);

let runner = cmd.spawn().map_err(SpawnRunnerError::SpawnDockerError)?;

// The above command will not give an error if the docker command exists
// but the command exited with failure.
std::thread::sleep(Duration::from_millis(100));
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

looks like this might need to be ~2 seconds. time reports around 1.1s before docker run non_existing_local_image --rm returns.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think perhaps we should move the retries.reset() to the place where we check if the container exited. Exit code 125 means docker run failed. Can we get the actual runtime from the exit status? If yes maybe besides checking the exit code we could also do a heuristic (shorter than 5 seconds probably means something went wrong)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, then I need to rethink this a bit.

if runner.try_wait() == Ok(Some(ExitCode::FAILURE)) {
return Err(SpawnRunnerError::SpawnDockerExit);
}
Ok(DockerContainer {
name: config.name,
process: runner,
Expand Down
52 changes: 48 additions & 4 deletions docker/docker_jit_monitor/src/main.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use std::{
collections::HashMap,
process::{Child, Command},
string::FromUtf8Error,
sync::atomic::{AtomicU32, AtomicU64, Ordering},
Expand All @@ -15,6 +16,10 @@ use crate::github_api::{get_idle_runners, spawn_runner};

mod github_api;

/// Maximum number of failed spawn attempts tolerated per container type
/// (since the last successful spawn) before the monitor gives up and exits.
///
/// With the exponential back-off in `Retries::wait_time`, 10 tries equate to roughly
/// 500 ms * \sum_{i=0}^{10} 2^i = 500 ms * (2^{11} - 1) = 1,023,500 ms ≈ 1024 s ≈ 17 min.
const MAX_SPAWN_RETRIES: u32 = 10;
/// Base sleep time of the main loop in milliseconds.
/// `Retries::wait_time` multiplies it by `2^retries` while spawning keeps failing.
const BASE_LOOP_SLEEP: u64 = 500;
Comment thread
Narfinger marked this conversation as resolved.
/// Process-wide atomic 64-bit counter, starting at 0.
/// NOTE(review): presumably used to give each spawned runner a unique id/name — confirm at the use site.
static RUNNER_ID: AtomicU64 = AtomicU64::new(0);
/// Process-wide atomic value, starting at 0, that the main loop reads on every iteration.
/// NOTE(review): presumably set to a non-zero value to request shutdown — confirm where it is written.
static EXITING: AtomicU32 = AtomicU32::new(0);

Expand Down Expand Up @@ -133,6 +138,8 @@ enum SpawnRunnerError {
EncodedJitConfigNotFound,
#[error("Failed to spawn docker with IoError: `{0:?}`")]
SpawnDockerError(std::io::Error),
#[error("Docker command returned failure error code")]
SpawnDockerExit,
#[error("Couldn't find any hdc devices")]
NoHdcDeviceFound,
#[error("Failed to list USB devices")]
Expand All @@ -143,7 +150,7 @@ enum SpawnRunnerError {
#[cfg(target_os = "linux")]
const OS_TAG: &str = "Linux";

#[derive(Clone, Debug, PartialEq)]
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
enum ContainerType {
Builder,
Runner,
Expand Down Expand Up @@ -187,7 +194,7 @@ impl Iterator for ContainerTypeIterator {
None
}
};
self.current.clone()
self.current
}
}

Expand All @@ -207,6 +214,38 @@ struct DockerContainer {
container_type: ContainerType,
}

#[derive(Debug, Default)]
/// Stores the number of failed spawn attempts per container type.
///
/// The counters are incremented by `inc_and_check`, zeroed by `reset`, and the
/// largest one determines the loop sleep time returned by `wait_time`.
/// A freshly `Default`-constructed value has no entries, i.e. no recorded failures.
struct Retries {
/// Failure counter keyed by container type; a missing key means zero failures.
t: HashMap<ContainerType, u32>,
}

impl Retries {
    /// Records one more failed spawn attempt for container type `t`.
    ///
    /// Once the counter for `t` exceeds `MAX_SPAWN_RETRIES`, a message is
    /// printed to stdout and the whole process terminates with exit code `-1`;
    /// in that case this function never returns.
    fn inc_and_check(&mut self, t: ContainerType) {
        let value = self.t.entry(t).or_insert(0);
        *value += 1;
        if *value > MAX_SPAWN_RETRIES {
            println!(
                "We had {value} many times to spawn a runner/builder ({t:?}). It is not happening."
            );
            std::process::exit(-1);
        }
    }

    /// Resets the counter for `t` after a runner has been spawned successfully.
    fn reset(&mut self, t: ContainerType) {
        // `insert` overwrites an existing count (or creates the entry), which is
        // all that is needed here; no entry handle is required afterwards.
        self.t.insert(t, 0);
    }

    /// Returns how long the main loop should currently sleep.
    ///
    /// The result is `BASE_LOOP_SLEEP` milliseconds multiplied by `2^m`, where
    /// `m` is the highest failure count over all container types (0 when no
    /// failure has been recorded). The arithmetic saturates instead of
    /// overflowing, so a very large retry limit cannot panic or wrap around.
    fn wait_time(&self) -> Duration {
        let worst = self.t.values().copied().max().unwrap_or(0);
        let factor = 2_u64.saturating_pow(worst);
        Duration::from_millis(BASE_LOOP_SLEEP.saturating_mul(factor))
    }
}

fn main() -> anyhow::Result<()> {
env_logger::init();
info!("Starting monitor for selfhosted docker-based github runners!");
Expand All @@ -232,6 +271,7 @@ fn main() -> anyhow::Result<()> {
let mut running_containers: Vec<DockerContainer> = vec![];
// Todo: implement something to reserve devices for the duration of the docker run child process.

let mut retries = Retries::default();
loop {
let exiting = EXITING.load(Ordering::Relaxed);
for container_type in ContainerType::iter() {
Expand All @@ -254,13 +294,17 @@ fn main() -> anyhow::Result<()> {
};

match spawn_runner(config) {
Ok(container) => running_containers.push(container),
Ok(container) => {
retries.reset(container_type);
running_containers.push(container)
}
Err(SpawnRunnerError::GhApiError(_, message))
if message.contains("gh: Already exists") =>
{
info!("Runner name already taken - Will retry with new name later")
}
Err(e) => {
retries.inc_and_check(container_type);
error!("Failed to spawn JIT runner: {e:?}");
}
}
Expand Down Expand Up @@ -319,7 +363,7 @@ fn main() -> anyhow::Result<()> {
}

running_containers = still_running;
thread::sleep(Duration::from_millis(500));
thread::sleep(retries.wait_time());
}

info!("Exiting....");
Expand Down