From 92098db0183f2cefe30aeec5d54413d6420c4b15 Mon Sep 17 00:00:00 2001 From: Peter Hunt Date: Thu, 25 Mar 2021 09:28:28 -0400 Subject: [PATCH] crio wipe: only completely wipe storage after a reboot We created the CleanShutdownFile to work around issues where nodes didn't have time to sync changes to disk before rebooting. This caused an unclean reboot. However, for this to be relevant, the node had to actually reboot. If it didn't, we could fail to remove the storage because the containers are still running. In this case, we shouldn't wipe anything. Signed-off-by: Peter Hunt --- cmd/crio/wipe.go | 20 ++++++++++++-------- test/crio-wipe.bats | 29 +++++++++++++++-------------- test/helpers.bash | 5 +++-- 3 files changed, 30 insertions(+), 24 deletions(-) diff --git a/cmd/crio/wipe.go b/cmd/crio/wipe.go index e283459227d..cf1d2d4e5d6 100644 --- a/cmd/crio/wipe.go +++ b/cmd/crio/wipe.go @@ -37,9 +37,18 @@ func crioWipe(c *cli.Context) error { return err } - // first, check whether crio has shutdown with time to sync - // if not, we should clear the storage directory - if config.CleanShutdownFile != "" { + // First, check if the node was rebooted. + // We know this happened because the VersionFile (which lives in a tmpfs) + // will not be there. + shouldWipeContainers, err := version.ShouldCrioWipe(config.VersionFile) + if err != nil { + logrus.Infof("checking whether cri-o should wipe containers: %v", err) + } + + // Then, check whether crio has shut down with time to sync. + // Note: this is only needed if the node rebooted. + // If there wasn't time to sync, we should clear the storage directory + if shouldWipeContainers && config.CleanShutdownFile != "" { if _, err := os.Stat(config.CleanShutdownFile); err != nil { logrus.Infof("file %s not found. Wiping storage directory %s because of suspected dirty shutdown", config.CleanShutdownFile, store.GraphRoot()) // If we do not do this, we may leak other resources that are not directly in the graphroot. 
@@ -60,16 +69,11 @@ func crioWipe(c *cli.Context) error { } shouldWipeImages := true - shouldWipeContainers := true // First, check if we need to upgrade at all if !c.IsSet("force") { // there are two locations we check before wiping: // one in a temporary directory. This is to check whether the node has rebooted. // if so, we should remove containers - shouldWipeContainers, err = version.ShouldCrioWipe(config.VersionFile) - if err != nil { - logrus.Infof("%v: triggering wipe of containers", err.Error()) - } // another is needed in a persistent directory. This is to check whether we've upgraded // if we've upgraded, we should wipe images shouldWipeImages, err = version.ShouldCrioWipe(config.VersionFilePersist) diff --git a/test/crio-wipe.bats b/test/crio-wipe.bats index 6eb436d2760..624cae8dfe3 100644 --- a/test/crio-wipe.bats +++ b/test/crio-wipe.bats @@ -117,25 +117,12 @@ function start_crio_with_stopped_pod() { run_podman_with_args ps -a | grep test } -@test "don't clear everything when not asked to check shutdown" { - start_crio_with_stopped_pod - stop_crio_no_clean - - rm "$CONTAINER_CLEAN_SHUTDOWN_FILE" - - CONTAINER_CLEAN_SHUTDOWN_FILE="" run_crio_wipe - - start_crio_no_setup - - test_crio_did_not_wipe_containers - test_crio_did_not_wipe_images -} - @test "do clear everything when shutdown file not found" { start_crio_with_stopped_pod stop_crio_no_clean rm "$CONTAINER_CLEAN_SHUTDOWN_FILE" + rm "$CONTAINER_VERSION_FILE" run_crio_wipe @@ -158,6 +145,7 @@ function start_crio_with_stopped_pod() { run_podman_with_args stop -a rm "$CONTAINER_CLEAN_SHUTDOWN_FILE" + rm "$CONTAINER_VERSION_FILE" run_crio_wipe @@ -177,9 +165,22 @@ function start_crio_with_stopped_pod() { run_podman_with_args run --name test -d quay.io/crio/busybox:latest top rm "$CONTAINER_CLEAN_SHUTDOWN_FILE" + rm "$CONTAINER_VERSION_FILE" run "$CRIO_BINARY_PATH" --config "$CRIO_CONFIG" wipe echo "$status" echo "$output" [ "$status" -ne 0 ] } + +@test "don't clear containers on a forced restart of 
crio" { + start_crio_with_stopped_pod + stop_crio_no_clean "-9" || true + + run_crio_wipe + + start_crio_no_setup + + test_crio_did_not_wipe_containers + test_crio_did_not_wipe_images +} diff --git a/test/helpers.bash b/test/helpers.bash index e24686f1b49..1d1f16a3b8f 100644 --- a/test/helpers.bash +++ b/test/helpers.bash @@ -362,8 +362,9 @@ function cleanup_pods() { } function stop_crio_no_clean() { + local signal="$1" if [ -n "${CRIO_PID+x}" ]; then - kill "$CRIO_PID" >/dev/null 2>&1 + kill "$signal" "$CRIO_PID" >/dev/null 2>&1 || true wait "$CRIO_PID" unset CRIO_PID fi @@ -371,7 +372,7 @@ function stop_crio_no_clean() { # Stop crio. function stop_crio() { - stop_crio_no_clean + stop_crio_no_clean "" cleanup_network_conf }