diff --git a/cmd/crio/main.go b/cmd/crio/main.go index 97a425735fd..32c5fbbc5bd 100644 --- a/cmd/crio/main.go +++ b/cmd/crio/main.go @@ -149,11 +149,12 @@ func main() { app.Commands = criocli.DefaultCommands app.Commands = append(app.Commands, []*cli.Command{ + criocli.CheckCommand, criocli.ConfigCommand, criocli.PublishCommand, + criocli.StatusCommand, criocli.VersionCommand, criocli.WipeCommand, - criocli.StatusCommand, }...) app.Before = func(c *cli.Context) (err error) { diff --git a/completions/bash/crio b/completions/bash/crio index 04346c44ed5..fe37275a5fb 100755 --- a/completions/bash/crio +++ b/completions/bash/crio @@ -7,10 +7,11 @@ completion man markdown md +check config +status version wipe -status help h --absent-mount-sources-to-reject diff --git a/completions/fish/crio.fish b/completions/fish/crio.fish index fa962f9c69b..c6772c37e8d 100644 --- a/completions/fish/crio.fish +++ b/completions/fish/crio.fish @@ -2,7 +2,7 @@ function __fish_crio_no_subcommand --description 'Test if there has been any subcommand yet' for i in (commandline -opc) - if contains -- $i complete completion help h man markdown md config version wipe status config c containers container cs s info i help h + if contains -- $i complete completion help h man markdown md check config status config c containers container cs s info i version wipe help h return 1 end end @@ -182,6 +182,31 @@ complete -c crio -n '__fish_seen_subcommand_from man' -f -l help -s h -d 'show h complete -r -c crio -n '__fish_crio_no_subcommand' -a 'man' -d 'Generate the man page documentation.' complete -c crio -n '__fish_seen_subcommand_from markdown md' -f -l help -s h -d 'show help' complete -r -c crio -n '__fish_crio_no_subcommand' -a 'markdown md' -d 'Generate the markdown documentation.' +complete -c crio -n '__fish_seen_subcommand_from check' -f -l help -s h -d 'show help' +complete -r -c crio -n '__fish_crio_no_subcommand' -a 'check' -d 'Check CRI-O storage directory for errors. 
+ +This command can also repair damaged containers, images and layers. + +By default, the data integrity of the storage directory is verified, +which can be an I/O and CPU-intensive operation. The --quick option +can be used to reduce the number of checks run. + +When using the --repair option, especially with the --force option, +CRI-O and any currently running containers should be stopped if +possible to ensure no concurrent access to the storage directory +occurs. + +The --wipe option can be used to automatically attempt to remove +containers and images on a repair failure. This option, combined +with the --force option, can be used to entirely remove the storage +directory content in case of irrecoverable errors. This should be +used as a last resort, and similarly to the --repair option, it\'s +best if CRI-O and any currently running containers are stopped.' +complete -c crio -n '__fish_seen_subcommand_from check' -f -l age -s a -r -d 'Maximum allowed age for unreferenced layers' +complete -c crio -n '__fish_seen_subcommand_from check' -f -l force -s f -d 'Remove damaged containers' +complete -c crio -n '__fish_seen_subcommand_from check' -f -l repair -s r -d 'Remove damaged images and layers' +complete -c crio -n '__fish_seen_subcommand_from check' -f -l quick -s q -d 'Perform only quick checks' +complete -c crio -n '__fish_seen_subcommand_from check' -f -l wipe -s w -d 'Wipe storage directory on repair failure' complete -c crio -n '__fish_seen_subcommand_from config' -f -l help -s h -d 'show help' complete -r -c crio -n '__fish_crio_no_subcommand' -a 'config' -d 'Outputs a commented version of the configuration file that could be used by CRI-O. This allows you to save you current configuration setup and then load @@ -202,13 +227,6 @@ complete -c crio -n '__fish_seen_subcommand_from config' -f -l migrate-defaults defaults between versions. To save a custom configuration change, it should be in a drop-in configuration file instead. 
Possible values: "1.17"' -complete -c crio -n '__fish_seen_subcommand_from version' -f -l help -s h -d 'show help' -complete -r -c crio -n '__fish_crio_no_subcommand' -a 'version' -d 'display detailed version information' -complete -c crio -n '__fish_seen_subcommand_from version' -f -l json -s j -d 'print JSON instead of text' -complete -c crio -n '__fish_seen_subcommand_from version' -f -l verbose -s v -d 'print verbose information (for example all golang dependencies)' -complete -c crio -n '__fish_seen_subcommand_from wipe' -f -l help -s h -d 'show help' -complete -r -c crio -n '__fish_crio_no_subcommand' -a 'wipe' -d 'wipe CRI-O\'s container and image storage' -complete -c crio -n '__fish_seen_subcommand_from wipe' -f -l force -s f -d 'force wipe by skipping the version check' complete -c crio -n '__fish_seen_subcommand_from status' -f -l help -s h -d 'show help' complete -r -c crio -n '__fish_crio_no_subcommand' -a 'status' -d 'Display status information' complete -c crio -n '__fish_seen_subcommand_from status' -l socket -s s -r -d 'absolute path to the unix socket' @@ -219,5 +237,12 @@ complete -r -c crio -n '__fish_seen_subcommand_from status' -a 'containers conta complete -c crio -n '__fish_seen_subcommand_from containers container cs s' -f -l id -s i -r -d 'the container ID' complete -c crio -n '__fish_seen_subcommand_from info i' -f -l help -s h -d 'show help' complete -r -c crio -n '__fish_seen_subcommand_from status' -a 'info i' -d 'Retrieve generic information about CRI-O, such as the cgroup and storage driver.' 
+complete -c crio -n '__fish_seen_subcommand_from version' -f -l help -s h -d 'show help' +complete -r -c crio -n '__fish_crio_no_subcommand' -a 'version' -d 'display detailed version information' +complete -c crio -n '__fish_seen_subcommand_from version' -f -l json -s j -d 'print JSON instead of text' +complete -c crio -n '__fish_seen_subcommand_from version' -f -l verbose -s v -d 'print verbose information (for example all golang dependencies)' +complete -c crio -n '__fish_seen_subcommand_from wipe' -f -l help -s h -d 'show help' +complete -r -c crio -n '__fish_crio_no_subcommand' -a 'wipe' -d 'wipe CRI-O\'s container and image storage' +complete -c crio -n '__fish_seen_subcommand_from wipe' -f -l force -s f -d 'force wipe by skipping the version check' complete -c crio -n '__fish_seen_subcommand_from help h' -f -l help -s h -d 'show help' complete -r -c crio -n '__fish_crio_no_subcommand' -a 'help h' -d 'Shows a list of commands or help for one command' diff --git a/completions/zsh/_crio b/completions/zsh/_crio index c841f871ec6..623a8b5f67f 100644 --- a/completions/zsh/_crio +++ b/completions/zsh/_crio @@ -7,12 +7,31 @@ _cli_zsh_autocomplete() { 'man:Generate the man page documentation.' 'markdown:Generate the markdown documentation.' 'md:Generate the markdown documentation.' + "check:Check CRI-O storage directory for errors. + +This command can also repair damaged containers, images and layers. + +By default, the data integrity of the storage directory is verified, +which can be an I/O and CPU-intensive operation. The --quick option +can be used to reduce the number of checks run. + +When using the --repair option, especially with the --force option, +CRI-O and any currently running containers should be stopped if +possible to ensure no concurrent access to the storage directory +occurs. + +The --wipe option can be used to automatically attempt to remove +containers and images on a repair failure. 
This option, combined +with the --force option, can be used to entirely remove the storage +directory content in case of irrecoverable errors. This should be +used as a last resort, and similarly to the --repair option, it's +best if CRI-O and any currently running containers are stopped." 'config:Outputs a commented version of the configuration file that could be used by CRI-O. This allows you to save you current configuration setup and then load it later with **--config**. Global options will modify the output.' + 'status:Display status information' 'version:display detailed version information' "wipe:wipe CRI-O's container and image storage" - 'status:Display status information' 'help:Shows a list of commands or help for one command' 'h:Shows a list of commands or help for one command' ) diff --git a/contrib/test/ci/integration.yml b/contrib/test/ci/integration.yml index 6cb9b5838a9..df0abaa9f59 100644 --- a/contrib/test/ci/integration.yml +++ b/contrib/test/ci/integration.yml @@ -58,6 +58,7 @@ + ['config.bats'] | product(kata_skip_config_tests) \ + ['config_migrate.bats'] | product(kata_skip_config_migrate_tests) \ + ['reload_config.bats'] | product(kata_skip_reload_config) \ + + ['crio-check.bats'] | product(kata_skip_crio_check_tests) \ + ['crio-wipe.bats'] | product(kata_skip_crio_wipe_tests) \ + ['ctr.bats'] | product(kata_skip_ctr_tests) \ + ['devices.bats'] | product(kata_skip_devices_tests) \ diff --git a/contrib/test/ci/vars.yml b/contrib/test/ci/vars.yml index b0756676ffb..17faf06a299 100644 --- a/contrib/test/ci/vars.yml +++ b/contrib/test/ci/vars.yml @@ -112,12 +112,15 @@ kata_skip_config_migrate_tests: - 'test "config migrate should succeed with 1.17 config"' kata_skip_reload_config: - 'test "reload config should remove pinned images when an empty list is provided"' +kata_skip_crio_check_tests: + - 'test "storage directory check should wipe everything on repair errors"' kata_skip_crio_wipe_tests: - 'test "clear neither when remove persist"' - "test 
\"don't clear containers on a forced restart of crio\"" - "test \"don't clear containers if clean shutdown supported file not present\"" - "test \"internal_wipe don't clear containers on a forced restart of crio\"" - 'test "internal_wipe eventually cleans network on forced restart of crio if network is slow to come up"' + - 'test "recover from badly corrupted storage directory"' kata_skip_ctr_tests: - 'test "ctr logging"' - 'test "ctr journald logging"' diff --git a/docs/crio.8.md b/docs/crio.8.md index ad7ef8b280e..eec6126fe1e 100644 --- a/docs/crio.8.md +++ b/docs/crio.8.md @@ -462,6 +462,38 @@ Generate the markdown documentation. Shows a list of commands or help for one command +## check + +Check CRI-O storage directory for errors. + +This command can also repair damaged containers, images and layers. + +By default, the data integrity of the storage directory is verified, +which can be an I/O and CPU-intensive operation. The --quick option +can be used to reduce the number of checks run. + +When using the --repair option, especially with the --force option, +CRI-O and any currently running containers should be stopped if +possible to ensure no concurrent access to the storage directory +occurs. + +The --wipe option can be used to automatically attempt to remove +containers and images on a repair failure. This option, combined +with the --force option, can be used to entirely remove the storage +directory content in case of irrecoverable errors. This should be +used as a last resort, and similarly to the --repair option, it's +best if CRI-O and any currently running containers are stopped. 
+ +**--age, -a**="": Maximum allowed age for unreferenced layers (default: "24h") + +**--force, -f**: Remove damaged containers + +**--quick, -q**: Perform only quick checks + +**--repair, -r**: Remove damaged images and layers + +**--wipe, -w**: Wipe storage directory on repair failure + ## config Outputs a commented version of the configuration file that could be used @@ -486,20 +518,6 @@ it later with **--config**. Global options will modify the output. be in a drop-in configuration file instead. Possible values: "1.17" (default: "1.17") -## version - -display detailed version information - -**--json, -j**: print JSON instead of text - -**--verbose, -v**: print verbose information (for example all golang dependencies) - -## wipe - -wipe CRI-O's container and image storage - -**--force, -f**: force wipe by skipping the version check - ## status Display status information @@ -520,6 +538,20 @@ Display detailed information about the provided container ID. Retrieve generic information about CRI-O, such as the cgroup and storage driver. 
+## version + +display detailed version information + +**--json, -j**: print JSON instead of text + +**--verbose, -v**: print verbose information (for example all golang dependencies) + +## wipe + +wipe CRI-O's container and image storage + +**--force, -f**: force wipe by skipping the version check + ## help, h Shows a list of commands or help for one command diff --git a/internal/criocli/check.go b/internal/criocli/check.go new file mode 100644 index 00000000000..390eb343fa4 --- /dev/null +++ b/internal/criocli/check.go @@ -0,0 +1,195 @@ +package criocli + +import ( + "fmt" + + "github.com/containers/storage" + "github.com/sirupsen/logrus" + "github.com/urfave/cli/v2" + + "github.com/cri-o/cri-o/internal/lib" + "github.com/cri-o/cri-o/utils" +) + +type checkErrors map[string][]error + +var CheckCommand = &cli.Command{ + Name: "check", + Usage: usageText, + Action: crioCheck, + Flags: []cli.Flag{ + &cli.StringFlag{ + Name: "age", + Aliases: []string{"a"}, + Value: "24h", + Usage: "Maximum allowed age for unreferenced layers", + }, + &cli.BoolFlag{ + Name: "force", + Aliases: []string{"f"}, + Usage: "Remove damaged containers", + }, + &cli.BoolFlag{ + Name: "repair", + Aliases: []string{"r"}, + Usage: "Remove damaged images and layers", + }, + &cli.BoolFlag{ + Name: "quick", + Aliases: []string{"q"}, + Usage: "Perform only quick checks", + }, + &cli.BoolFlag{ + Name: "wipe", + Aliases: []string{"w"}, + Usage: "Wipe storage directory on repair failure", + }, + }, +} + +func crioCheck(c *cli.Context) error { + config, err := GetConfigFromContext(c) + if err != nil { + return fmt.Errorf("unable to load configuration: %w", err) + } + + store, err := config.GetStore() + if err != nil { + return fmt.Errorf("unable to open storage: %w", err) + } + defer func() { + if _, err := store.Shutdown(true); err != nil { + logrus.Errorf("Unable to shutdown storage: %v", err) + } + }() + + graphRoot := store.GraphRoot() + logrus.Infof("Checking storage directory %s for errors", 
graphRoot) + + checkOptions := storage.CheckEverything() + if c.Bool("quick") { + // This is not the same as the "quick" check that CRI-O performs during its start-up + // following an unclean shutdown, as this one would set the `LayerDigests` option, + // which is I/O and CPU intensive, whereas the other one does not. + checkOptions = storage.CheckMost() + } + + // The maximum unreferenced layer age. + layerAge := c.String("age") + if layerAge != "" { + age, err := utils.ParseDuration(layerAge) + if err != nil { + return fmt.Errorf("unable to parse age duration: %w", err) + } + checkOptions.LayerUnreferencedMaximumAge = &age + } + + report, err := store.Check(checkOptions) + if err != nil { + return fmt.Errorf("unable to check storage: %w", err) + } + + // Walk each report and show details... + for prefix, checkReport := range map[string]checkErrors{ + "layer": report.Layers, + "read-only layer": report.ROLayers, + "image": report.Images, + "read-only image": report.ROImages, + "container": report.Containers, + } { + for identifier, errs := range checkReport { + for _, err := range errs { + logrus.Debugf("%s: %s: %v", prefix, identifier, err) + } + } + } + + seenStorageErrors := lib.CheckReportHasErrors(report) + logrus.Debugf("Storage directory %s has errors: %t", graphRoot, seenStorageErrors) + + if !c.Bool("repair") { + if seenStorageErrors { + logrus.Warnf("Errors found while checking storage directory %s for errors", graphRoot) + return fmt.Errorf( + "%d layer errors, %d read-only layer errors, %d image errors, %d read-only image errors, %d container errors", + len(report.Layers), + len(report.ROLayers), + len(report.Images), + len(report.ROImages), + len(report.Containers), + ) + } + return nil + } + + force := c.Bool("force") + if force { + logrus.Warn("The `force` option has been set, repair will attempt to remove damaged containers") + } + logrus.Infof("Attempting to repair storage directory %s", graphRoot) + + errs := store.Repair(report, 
&storage.RepairOptions{ + RemoveContainers: force, + }) + if len(errs) != 0 { + for _, err := range errs { + logrus.Error(err) + } + + if c.Bool("wipe") { + // Depending on whether the `force` option is set or not, this will remove the + // storage directory completely while ignoring any running containers. Otherwise, + // this will fail if there are any containers currently running. + if force { + logrus.Warn("The `force` option has been set, storage directory will be forcefully removed") + } + logrus.Infof("Wiping storage directory %s", graphRoot) + return lib.RemoveStorageDirectory(config, store, force) + } + + return errs[0] + } + + if len(report.ROLayers) > 0 || len(report.ROImages) > 0 || (!force && len(report.Containers) > 0) { + if force { + // Any damaged containers would have been deleted at this point. + return fmt.Errorf( + "%d read-only layer errors, %d read-only image errors", + len(report.ROLayers), + len(report.ROImages), + ) + } + return fmt.Errorf( + "%d read-only layer errors, %d read-only image errors, %d container errors", + len(report.ROLayers), + len(report.ROImages), + len(report.Containers), + ) + } + + return nil +} + +// The `Description` field will not be rendered when the documentation +// is generated, and using `Usage` makes the formatting wrong when the +// command-line help is rendered. Shell completions might also be +// incorrect. +var usageText = `Check CRI-O storage directory for errors. + +This command can also repair damaged containers, images and layers. + +By default, the data integrity of the storage directory is verified, +which can be an I/O and CPU-intensive operation. The --quick option +can be used to reduce the number of checks run. + +When using the --repair option, especially with the --force option, +CRI-O and any currently running containers should be stopped if +possible to ensure no concurrent access to the storage directory +occurs. 
+ +The --wipe option can be used to automatically attempt to remove +containers and images on a repair failure. This option, combined +with the --force option, can be used to entirely remove the storage +directory content in case of irrecoverable errors. This should be +used as a last resort, and similarly to the --repair option, it's +best if CRI-O and any currently running containers are stopped.` diff --git a/internal/criocli/wipe.go b/internal/criocli/wipe.go index 96c0f2af789..54d242b9386 100644 --- a/internal/criocli/wipe.go +++ b/internal/criocli/wipe.go @@ -63,7 +63,13 @@ func crioWipe(c *cli.Context) error { // Note: this is only needed if the node rebooted. // If there wasn't time to sync, we should clear the storage directory if shouldWipeContainers && lib.ShutdownWasUnclean(config) { - return lib.HandleUncleanShutdown(config, store) + logrus.Infof( + "File %s not found. Wiping storage directory %s because of suspected unclean shutdown", + config.CleanShutdownFile, + store.GraphRoot(), + ) + // This will fail if there are any containers currently running. 
+ return lib.RemoveStorageDirectory(config, store, false) } // If crio is configured to wipe internally (and `--force` wasn't set) diff --git a/internal/lib/container_server.go b/internal/lib/container_server.go index 6bef0a92015..ed7ceb34e82 100644 --- a/internal/lib/container_server.go +++ b/internal/lib/container_server.go @@ -12,6 +12,7 @@ import ( "github.com/containers/common/pkg/hooks" cstorage "github.com/containers/storage" "github.com/containers/storage/pkg/ioutils" + cmount "github.com/containers/storage/pkg/mount" "github.com/containers/storage/pkg/truncindex" "github.com/cri-o/cri-o/internal/hostport" "github.com/cri-o/cri-o/internal/lib/sandbox" @@ -104,20 +105,24 @@ func New(ctx context.Context, configIface libconfig.Iface) (*ContainerServer, er } if config.InternalRepair && ShutdownWasUnclean(config) { - checkOptions := cstorage.CheckEverything() - report, err := store.Check(checkOptions) - if err != nil { - err = HandleUncleanShutdown(config, store) - if err != nil { - return nil, err + graphRoot := store.GraphRoot() + log.Warnf(ctx, "Checking storage directory %s for errors because of unclean shutdown", graphRoot) + + wipeStorage := false + report, err := store.Check(checkQuick()) + if err == nil && CheckReportHasErrors(report) { + log.Warnf(ctx, "Attempting to repair storage directory %s because of unclean shutdown", graphRoot) + if errs := store.Repair(report, cstorage.RepairEverything()); len(errs) > 0 { + wipeStorage = true } + } else if err != nil { + // Storage check has failed with irrecoverable errors. + wipeStorage = true } - options := cstorage.RepairOptions{ - RemoveContainers: true, - } - if errs := store.Repair(report, &options); len(errs) > 0 { - err = HandleUncleanShutdown(config, store) - if err != nil { + if wipeStorage { + log.Warnf(ctx, "Wiping storage directory %s because of unclean shutdown", graphRoot) + // This will fail if there are any containers currently running. 
+ if err := RemoveStorageDirectory(config, store, false); err != nil { return nil, err } } @@ -794,20 +799,82 @@ func ShutdownWasUnclean(config *libconfig.Config) bool { return true } -func HandleUncleanShutdown(config *libconfig.Config, store cstorage.Store) error { - logrus.Infof("File %s not found. Wiping storage directory %s because of suspected dirty shutdown", config.CleanShutdownFile, store.GraphRoot()) - // If we do not do this, we may leak other resources that are not directly in the graphroot. - // Erroring here should not be fatal though, it's a best effort cleanup +func RemoveStorageDirectory(config *libconfig.Config, store cstorage.Store, force bool) error { + // If we do not do this, we may leak other resources that are not directly + // in the graphroot. Erroring here should not be fatal though, it's a best + // effort cleanup. if err := store.Wipe(); err != nil { - logrus.Infof("Failed to wipe storage cleanly: %v", err) - } - // unmount storage or else we will fail with EBUSY - if _, err := store.Shutdown(false); err != nil { - return fmt.Errorf("failed to shutdown storage before wiping: %w", err) + logrus.Infof("Failed to wipe storage: %v", err) + } + + // Unmount storage or else we will fail with -EBUSY. + if _, err := store.Shutdown(true); err != nil { + // CRI-O and Podman are often used together on the same node, + // so the storage directory is shared between the two. + // + // Since a container started by Podman can be running, we will + // try to detect this and return an error rather than proceed + // with a storage wipe. + // + // The storage directory removal can also be forced, which will + // then delete everything regardless of whether there are any + // containers running at the moment. 
+ if !force && errors.Is(err, cstorage.ErrLayerUsedByContainer) { + return fmt.Errorf("failed to shutdown storage: %w", err) + } + logrus.Warnf("Failed to shutdown storage: %v", err) + + // At this point, storage is most likely corrupted + // beyond repair, as such, remove any potentially + // orphaned mounts that might still be there, and + // prepare to completely remove the storage directory. + if err := cmount.RecursiveUnmount(store.GraphRoot()); err != nil { + logrus.Warnf("Failed to unmount storage: %v", err) + } } - // totally remove storage, whatever is left (possibly orphaned layers) + + // Completely remove storage, whatever is left (possibly orphaned layers). if err := os.RemoveAll(store.GraphRoot()); err != nil { return fmt.Errorf("failed to remove storage directory: %w", err) } return nil } + +// checkQuick returns custom storage check options with only checks known not to be +// resource-intensive enabled. Where known I/O and CPU-bound checks, such as the +// integrity and contents checks, are disabled. +func checkQuick() *cstorage.CheckOptions { + // An alternative to `storage.CheckEverything()` and `storage.CheckMost()` + // helper functions that turn off the expensive layers integrity verification, + // which relies on calculating checksum for the content of the image. This is + // both I/O and CPU intensive and, depending on the size of images, number of + // layers, and number of files within each layer, can significantly impact the + // node performance while the check is running. Additionally, turn off the + // content check, which is also considered expensive. + // + // When the check runs, it can hold up CRI-O, eventually resulting in the node + // being marked as "NotReady" by the kubelet, which is undesirable. + // + // Turning off the integrity check has the side effect of preventing CRI-O from + // detecting whether a file is missing from the image or its content has changed. 
+ return &cstorage.CheckOptions{ + LayerDigests: false, // Disabled for being I/O and CPU intensive. + LayerMountable: true, + LayerContents: false, // Also disabled by `storage.CheckMost()`. + LayerData: true, + ImageData: true, + ContainerData: true, + } +} + +// CheckReportHasErrors checks if the report from a completed storage check includes +// any recoverable errors that storage repair could fix. +func CheckReportHasErrors(report cstorage.CheckReport) bool { + // The `storage.Check()` returns a report object and an error, + // where errors are most likely irrecoverable and should be + // handled as such; the report, on the contrary, can contain + // errors that the `storage.Repair()` could potentially fix. + return len(report.Layers) > 0 || len(report.ROLayers) > 0 || + len(report.Images) > 0 || len(report.ROImages) > 0 || + len(report.Containers) > 0 +} diff --git a/test/crio-check.bats b/test/crio-check.bats new file mode 100644 index 00000000000..c831b5fe111 --- /dev/null +++ b/test/crio-check.bats @@ -0,0 +1,84 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + setup_test +} + +function teardown() { + cleanup_test +} + +function run_crio_check() { + local log_level=("-l" "${CRIO_BINARY_LOG_LEVEL:-"info"}") + + "$CRIO_BINARY_PATH" -c "$CRIO_CONFIG" -d "$CRIO_CONFIG_DIR" "${log_level[@]}" check "$@" +} + +@test "storage directory check should find no issues" { + setup_crio + + # Should verify no storage directory errors. + run_crio_check +} + +@test "storage directory check should find errors" { + setup_crio + + # Remove random layer from the storage directory. + remove_random_storage_layer + + run ! run_crio_check +} + +@test "storage directory check should repair errors" { + setup_crio + + # Remove random layer from the storage directory. + remove_random_storage_layer + + # Should repair damaged storage directory. + run_crio_check --repair + + # Should verify no storage directory errors. 
+ CRIO_BINARY_LOG_LEVEL="debug" run run_crio_check + + [[ "$output" == *"Storage directory ${TESTDIR}/crio has errors: false"* ]] +} + +@test "storage directory check should wipe everything on repair errors" { + start_crio + + pod_id=$(crictl runp "$TESTDATA"/sandbox_config.json) + ctr_id=$(crictl create "$pod_id" "$TESTDATA"/container_config.json "$TESTDATA"/sandbox_config.json) + crictl start "$ctr_id" + + # This will corrupt the storage directory. + cp -r "$TESTDIR"/crio/overlay{,.old} + umount -R -l -f "$TESTDIR"/crio/overlay + rm -Rf "$TESTDIR"/crio/overlay + cp -r "$TESTDIR"/crio/overlay{.old,} + + # Should wipe badly damaged storage directory. + # + # The output is suppressed, as the c/storage library + # can generate a large volume of log lines while the + # repair process runs. A smaller image like "busybox" + # could help alleviate this issue. + run_crio_check --repair --force --wipe &> /dev/null + + # Storage directory wipe should leave only the metadata behind. + size=$(du -sb "$TESTDIR"/crio | cut -f 1) + + # The storage directory wipe did not work if there is more data than 128 KiB left. + if ((size > 1024 * 128)); then + echo "The crio check storage directory wipe did not work" >&3 + return 1 + fi + + # Should verify no storage directory errors. 
+ CRIO_BINARY_LOG_LEVEL="debug" run run_crio_check + + [[ "$output" == *"Storage directory ${TESTDIR}/crio has errors: false"* ]] +} diff --git a/test/crio-wipe.bats b/test/crio-wipe.bats index 0ecf12cb3fe..089b70a9526 100644 --- a/test/crio-wipe.bats +++ b/test/crio-wipe.bats @@ -131,7 +131,7 @@ function start_crio_with_stopped_pod() { run_podman_with_args container exists test } -@test "do clear everything when shutdown file not found" { +@test "clear everything when shutdown file not found" { CONTAINER_INTERNAL_WIPE=false start_crio_with_stopped_pod stop_crio_no_clean @@ -146,7 +146,7 @@ function start_crio_with_stopped_pod() { test_crio_wiped_images } -@test "do clear podman containers when shutdown file not found" { +@test "clear podman containers when shutdown file not found" { if [[ -z "$PODMAN_BINARY" ]]; then skip "Podman not installed" fi @@ -325,9 +325,8 @@ function start_crio_with_stopped_pod() { setup_crio touch "$CONTAINER_CLEAN_SHUTDOWN_FILE.supported" - # Remove a random layer - layer=$(find "$TESTDIR/crio/overlay" -maxdepth 1 -regextype sed -regex '.*/[a-f0-9\-]\{64\}.*' | sort -R | head -n 1) - rm -fr "$layer" + # Remove random layer from the storage directory. + remove_random_storage_layer # Since the clean shutdown supported file is created, # but the clean shutdown file is absent, we will do the @@ -343,3 +342,37 @@ function start_crio_with_stopped_pod() { # Thus, this is really $(crictl images | wc -l) - 1 (for the removed image) + 1 (for the header). [[ $(crictl images | wc -l) == "$num_images" ]] } + +@test "recover from badly corrupted storage directory" { + setup_crio + touch "$CONTAINER_CLEAN_SHUTDOWN_FILE".supported + + start_crio_no_setup + + pod_id=$(crictl runp "$TESTDATA"/sandbox_config.json) + ctr_id=$(crictl create "$pod_id" "$TESTDATA"/container_config.json "$TESTDATA"/sandbox_config.json) + crictl start "$ctr_id" + + # This will corrupt the storage directory. 
+ cp -r "$TESTDIR"/crio/overlay{,.old} + umount -R -l -f "$TESTDIR"/crio/overlay + rm -Rf "$TESTDIR"/crio/overlay + cp -r "$TESTDIR"/crio/overlay{.old,} + + stop_crio_no_clean + + # Remove to trigger internal repair on unclean shutdown. + rm -Rf "$CONTAINER_CLEAN_SHUTDOWN_FILE" + + # Should recover from badly corrupted storage directory gracefully. + CONTAINER_INTERNAL_REPAIR=true start_crio_no_setup + + # Storage directory wipe should leave only the metadata behind. + size=$(du -sb "$TESTDIR"/crio | cut -f 1) + + # The storage directory wipe did not work if there is more data than 128 KiB left. + if ((size > 1024 * 128)); then + echo "The CRI-O internal repair storage directory wipe did not work" >&3 + return 1 + fi +} diff --git a/test/helpers.bash b/test/helpers.bash index ef0b251b82e..70d7fbb305e 100644 --- a/test/helpers.bash +++ b/test/helpers.bash @@ -744,3 +744,7 @@ function annotations_equal() { received_contains_expected=$? [[ $expected_contains_received -eq 0 ]] && [[ $received_contains_expected -eq 0 ]] } + +function remove_random_storage_layer() { + find "$TESTDIR"/crio/overlay -maxdepth 1 | grep '.*/[a-f0-9\-]\{64\}.*' | head -1 | xargs rm -Rf +} diff --git a/utils/utils.go b/utils/utils.go index e8eefc34320..283761cf8f5 100644 --- a/utils/utils.go +++ b/utils/utils.go @@ -10,6 +10,7 @@ import ( "path/filepath" "runtime/pprof" "strconv" + "time" securejoin "github.com/cyphar/filepath-securejoin" "github.com/opencontainers/runc/libcontainer/user" @@ -351,3 +352,32 @@ func HandleResizing(resize <-chan remotecommand.TerminalSize, resizeFunc func(si } }() } + +// ParseDuration parses a string that can contain either a human-readable duration +// notation such as "24h" or "5m30s", so a duration with unit, or a string-encoded +// integer value that denotes the number of seconds and returns a corresponding +// `time.Duration` type. Parsing a floating point value encoded as string without +// a duration unit is not supported. 
+// +// An assumption is made that the duration value cannot be negative, and as such, +// any negative value will be converted to a positive duration automatically. +func ParseDuration(s string) (time.Duration, error) { + var t time.Duration + + n, err := strconv.ParseInt(s, 10, 64) + if err == nil { + t = time.Duration(n) * time.Second + } else { + t, err = time.ParseDuration(s) + } + if err != nil { + return 0, err + } + + // Assume that time does not move backwards. + if t < 0 { + t = -t + } + + return t, nil +} diff --git a/utils/utils_test.go b/utils/utils_test.go index 93d9ac93e74..123ab2993ec 100644 --- a/utils/utils_test.go +++ b/utils/utils_test.go @@ -5,6 +5,7 @@ import ( "os" "path/filepath" "strings" + "time" "github.com/containers/storage/pkg/unshare" "github.com/cri-o/cri-o/internal/dbusmgr" @@ -345,6 +346,98 @@ var _ = t.Describe("Utils", func() { Expect(newaddgids).To(Equal(addgids)) }) }) + + t.Describe("ParseDuration", func() { + It("should succeed with duration value with unit", func() { + // Given + // When + duration, err := utils.ParseDuration("5s") + + // Then + Expect(err).ToNot(HaveOccurred()) + Expect(duration).To(Equal(5 * time.Second)) + }) + + It("should succeed with duration value without unit", func() { + // Given + // When + duration, err := utils.ParseDuration("5") + + // Then + Expect(err).ToNot(HaveOccurred()) + Expect(duration).To(Equal(5 * time.Second)) + }) + + It("should succeed with negative duration value with unit", func() { + // Given + // When + duration, err := utils.ParseDuration("-5s") + + // Then + Expect(err).ToNot(HaveOccurred()) + Expect(duration).To(Equal(5 * time.Second)) + }) + + It("should succeed with negative duration value without unit", func() { + // Given + // When + duration, err := utils.ParseDuration("-5") + + // Then + Expect(err).ToNot(HaveOccurred()) + Expect(duration).To(Equal(5 * time.Second)) + }) + + It("should succeed with zero as duration value without unit", func() { + // Given + // When + 
duration, err := utils.ParseDuration("0") + + // Then + Expect(err).ToNot(HaveOccurred()) + Expect(duration).To(Equal(time.Duration(0))) + }) + + It("should succeed with floating point duration with unit", func() { + // Given + // When + duration, err := utils.ParseDuration("1.234s") + + // Then + Expect(err).ToNot(HaveOccurred()) + Expect(duration).To(Equal(time.Duration(1.234 * float64(time.Second)))) + }) + + It("should fail with invalid floating point duration without unit", func() { + // Given + // When + duration, err := utils.ParseDuration("1.234") + + // Then + Expect(err).To(HaveOccurred()) + Expect(duration).To(Equal(time.Duration(0))) + }) + + It("should fail with invalid duration", func() { + // Given + // When + duration, err := utils.ParseDuration("test") + + // Then + Expect(err).To(HaveOccurred()) + Expect(duration).To(Equal(time.Duration(0))) + }) + + It("should fail with empty duration", func() { + // Given + // When + duration, err := utils.ParseDuration("") + + // Then + Expect(err).To(HaveOccurred()) + Expect(duration).To(Equal(time.Duration(0))) + }) + }) }) func createEtcFiles() string {