From 39ea33e2948a12cb7f1b8ed89021d95c7dffc21f Mon Sep 17 00:00:00 2001
From: Md Sahil
Date: Fri, 4 Aug 2023 23:20:56 +0530
Subject: [PATCH 1/2] feat: check at boot whether the previous shutdown was
 clean, and apply repair logic if it was not

Signed-off-by: Md Sahil
---
 internal/criocli/wipe.go         | 41 ++----------------------
 internal/lib/container_server.go | 55 ++++++++++++++++++++++++++++++++
 test/crio-wipe.bats              | 23 +++++++++++++
 3 files changed, 81 insertions(+), 38 deletions(-)

diff --git a/internal/criocli/wipe.go b/internal/criocli/wipe.go
index 385581206c3..96c0f2af789 100644
--- a/internal/criocli/wipe.go
+++ b/internal/criocli/wipe.go
@@ -2,13 +2,12 @@ package criocli
 
 import (
 	"errors"
-	"fmt"
 	"os"
 
 	cstorage "github.com/containers/storage"
+	"github.com/cri-o/cri-o/internal/lib"
 	"github.com/cri-o/cri-o/internal/storage"
 	"github.com/cri-o/cri-o/internal/version"
-	crioconf "github.com/cri-o/cri-o/pkg/config"
 	json "github.com/json-iterator/go"
 	"github.com/sirupsen/logrus"
 	"github.com/urfave/cli/v2"
@@ -63,8 +62,8 @@ func crioWipe(c *cli.Context) error {
 	// Then, check whether crio has shutdown with time to sync.
 	// Note: this is only needed if the node rebooted.
 	// If there wasn't time to sync, we should clear the storage directory
-	if shouldWipeContainers && shutdownWasUnclean(config) {
-		return handleCleanShutdown(config, store)
+	if shouldWipeContainers && lib.ShutdownWasUnclean(config) {
+		return lib.HandleUncleanShutdown(config, store)
 	}
 
 	// If crio is configured to wipe internally (and `--force` wasn't set)
@@ -97,40 +96,6 @@ func crioWipe(c *cli.Context) error {
 	return nil
 }
 
-func shutdownWasUnclean(config *crioconf.Config) bool {
-	// CleanShutdownFile not configured, skip
-	if config.CleanShutdownFile == "" {
-		return false
-	}
-	// CleanShutdownFile isn't supported, skip
-	if _, err := os.Stat(config.CleanShutdownSupportedFileName()); err != nil {
-		return false
-	}
-	// CleanShutdownFile is present, indicating clean shutdown
-	if _, err := os.Stat(config.CleanShutdownFile); err == nil {
-		return false
-	}
-	return true
-}
-
-func handleCleanShutdown(config *crioconf.Config, store cstorage.Store) error {
-	logrus.Infof("File %s not found. Wiping storage directory %s because of suspected dirty shutdown", config.CleanShutdownFile, store.GraphRoot())
-	// If we do not do this, we may leak other resources that are not directly in the graphroot.
-	// Erroring here should not be fatal though, it's a best effort cleanup
-	if err := store.Wipe(); err != nil {
-		logrus.Infof("Failed to wipe storage cleanly: %v", err)
-	}
-	// unmount storage or else we will fail with EBUSY
-	if _, err := store.Shutdown(false); err != nil {
-		return fmt.Errorf("failed to shutdown storage before wiping: %w", err)
-	}
-	// totally remove storage, whatever is left (possibly orphaned layers)
-	if err := os.RemoveAll(store.GraphRoot()); err != nil {
-		return fmt.Errorf("failed to remove storage directory: %w", err)
-	}
-	return nil
-}
-
 type ContainerStore struct {
 	store cstorage.Store
 }
diff --git a/internal/lib/container_server.go b/internal/lib/container_server.go
index 2636665b843..469408fc629 100644
--- a/internal/lib/container_server.go
+++ b/internal/lib/container_server.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"errors"
 	"fmt"
+	"os"
 	"path/filepath"
 	"sync"
 	"time"
@@ -102,6 +103,26 @@ func New(ctx context.Context, configIface libconfig.Iface) (*ContainerServer, er
 		return nil, fmt.Errorf("cannot create container server: interface is nil")
 	}
 
+	if ShutdownWasUnclean(config) {
+		// Check storage for corruption; if the check itself fails, wipe.
+		report, err := store.Check(cstorage.CheckEverything())
+		if err != nil {
+			if err := HandleUncleanShutdown(config, store); err != nil {
+				return nil, err
+			}
+		} else {
+			options := cstorage.RepairOptions{
+				RemoveContainers: true,
+			}
+			// Repair what the check reported; wipe if the repair fails.
+			if errs := store.Repair(report, &options); len(errs) > 0 {
+				if err := HandleUncleanShutdown(config, store); err != nil {
+					return nil, err
+				}
+			}
+		}
+	}
+
 	imageService, err := storage.GetImageService(ctx, store, config)
 	if err != nil {
 		return nil, err
@@ -776,3 +797,37 @@ func (c *ContainerServer) UpdateContainerLinuxResources(ctr *oci.Container, reso
 	c.state.containers.Add(ctr.ID(), ctr)
 }
+
+func ShutdownWasUnclean(config *libconfig.Config) bool {
+	// CleanShutdownFile not configured, skip
+	if config.CleanShutdownFile == "" {
+		return false
+	}
+	// CleanShutdownFile isn't supported, skip
+	if _, err := os.Stat(config.CleanShutdownSupportedFileName()); err != nil {
+		return false
+	}
+	// CleanShutdownFile is present, indicating clean shutdown
+	if _, err := os.Stat(config.CleanShutdownFile); err == nil {
+		return false
+	}
+	return true
+}
+
+func HandleUncleanShutdown(config *libconfig.Config, store cstorage.Store) error {
+	logrus.Infof("File %s not found. Wiping storage directory %s because of suspected dirty shutdown", config.CleanShutdownFile, store.GraphRoot())
+	// If we do not do this, we may leak other resources that are not directly in the graphroot.
+	// Erroring here should not be fatal though, it's a best effort cleanup
+	if err := store.Wipe(); err != nil {
+		logrus.Infof("Failed to wipe storage cleanly: %v", err)
+	}
+	// unmount storage or else we will fail with EBUSY
+	if _, err := store.Shutdown(false); err != nil {
+		return fmt.Errorf("failed to shutdown storage before wiping: %w", err)
+	}
+	// totally remove storage, whatever is left (possibly orphaned layers)
+	if err := os.RemoveAll(store.GraphRoot()); err != nil {
+		return fmt.Errorf("failed to remove storage directory: %w", err)
+	}
+	return nil
+}
diff --git a/test/crio-wipe.bats b/test/crio-wipe.bats
index 00d7fcd330e..472b5d35610 100644
--- a/test/crio-wipe.bats
+++ b/test/crio-wipe.bats
@@ -320,3 +320,26 @@ function start_crio_with_stopped_pod() {
 	# make sure network resources were cleaned up
 	run ! ls "$CNI_RESULTS_DIR"/*"$pod_id"*
ls "$CNI_RESULTS_DIR"/*"$pod_id"* } + +@test "clean up image if corrupted on server restore" { + setup_crio + touch "$CONTAINER_CLEAN_SHUTDOWN_FILE.supported" + + # Remove a random layer + layer=$(find "$TESTDIR/crio/overlay" -maxdepth 1 -regextype sed -regex '.*/[a-f0-9\-]\{64\}.*' | sort -R | head -n 1) + rm -fr "$layer" + + # Since the clean shutdown supported file is created, + # but the clean shutdown file is absent, we will do the + # c/storage check/repair. + start_crio_no_setup + + # Since one of the layers was removed, the image would be corrupted, so we expect + # one to have been removed. + num_images=${#IMAGES[@]} + + # We start with $num_images images, and remove one with the layer removal above. + # `crictl images` adds one additional row for the table header. + # Thus, this is really $(crictl images | wc -l) - 1 (for the removed image) + 1 (for the header). + [[ $(crictl images | wc -l) == "$num_images" ]] +} From 7e3522a9cd3b459184dc74faebde8b2a86377adf Mon Sep 17 00:00:00 2001 From: Md Sahil Date: Tue, 15 Aug 2023 01:39:06 +0530 Subject: [PATCH 2/2] Added a flag internal-repair Signed-off-by: Md Sahil --- completions/bash/crio | 1 + completions/fish/crio.fish | 1 + completions/zsh/_crio | 1 + docs/crio.8.md | 3 +++ docs/crio.conf.5.md | 4 ++++ internal/criocli/criocli.go | 9 +++++++++ internal/lib/container_server.go | 2 +- pkg/config/config.go | 4 ++++ pkg/config/template.go | 11 +++++++++++ test/crio-wipe.bats | 2 +- 10 files changed, 36 insertions(+), 2 deletions(-) diff --git a/completions/bash/crio b/completions/bash/crio index 5ca934c97aa..7d7c7e0c3a6 100755 --- a/completions/bash/crio +++ b/completions/bash/crio @@ -62,6 +62,7 @@ h --image-volumes --infra-ctr-cpuset --insecure-registry +--internal-repair --internal-wipe --irqbalance-config-file --irqbalance-config-restore-file diff --git a/completions/fish/crio.fish b/completions/fish/crio.fish index 4f2859d8b8c..002bd8ccb37 100644 --- a/completions/fish/crio.fish +++ b/completions/fish/crio.fish @@ -98,6 +98,7 @@ complete -c crio -n '__fish_crio_no_subcommand' -f -l insecure-registry -r -d 'E be enabled for testing purposes**. For increased security, users should add their CA to their system\'s list of trusted CAs instead of using \'--insecure-registry\'.' +complete -c crio -n '__fish_crio_no_subcommand' -f -l internal-repair -d 'If true, CRI-O will check if the container and image storage was corrupted after a sudden restart, and attempt to repair the storage if it was.' complete -c crio -n '__fish_crio_no_subcommand' -f -l internal-wipe -d 'Whether CRI-O should wipe containers after a reboot and images after an upgrade when the server starts. If set to false, one must run `crio wipe` to wipe the containers and images in these situations. This option is deprecated, and will be removed in the future.' complete -c crio -n '__fish_crio_no_subcommand' -f -l irqbalance-config-file -r -d 'The irqbalance service config file which is used by CRI-O.' complete -c crio -n '__fish_crio_no_subcommand' -f -l irqbalance-config-restore-file -r -d 'Determines if CRI-O should attempt to restore the irqbalance config at startup with the mask in this file. Use the \'disable\' value to disable the restore flow entirely.' diff --git a/completions/zsh/_crio b/completions/zsh/_crio index 3c4a0ee674f..3cd25cd1dbe 100644 --- a/completions/zsh/_crio +++ b/completions/zsh/_crio @@ -69,6 +69,7 @@ it later with **--config**. Global options will modify the output.' 
   '--image-volumes'
   '--infra-ctr-cpuset'
   '--insecure-registry'
+  '--internal-repair'
   '--internal-wipe'
   '--irqbalance-config-file'
   '--irqbalance-config-restore-file'
diff --git a/docs/crio.8.md b/docs/crio.8.md
index 3ad061124a4..91b0d73ab9e 100644
--- a/docs/crio.8.md
+++ b/docs/crio.8.md
@@ -60,6 +60,7 @@ crio
 [--image-volumes]=[value]
 [--infra-ctr-cpuset]=[value]
 [--insecure-registry]=[value]
+[--internal-repair]
 [--internal-wipe]
 [--irqbalance-config-file]=[value]
 [--irqbalance-config-restore-file]=[value]
@@ -289,6 +290,8 @@ crio [GLOBAL OPTIONS] command [COMMAND OPTIONS] [ARGUMENTS...]
 their CA to their system's list of trusted CAs instead of using
 '--insecure-registry'.
 
+**--internal-repair**: If true, CRI-O will check if the container and image storage was corrupted after a sudden restart, and attempt to repair the storage if it was.
+
 **--internal-wipe**: Whether CRI-O should wipe containers after a reboot and images after an upgrade when the server starts. If set to false, one must run `crio wipe` to wipe the containers and images in these situations. This option is deprecated, and will be removed in the future.
 
 **--irqbalance-config-file**="": The irqbalance service config file which is used by CRI-O. (default: "/etc/sysconfig/irqbalance")
diff --git a/docs/crio.conf.5.md b/docs/crio.conf.5.md
index da6b4342cf2..4165316b93d 100644
--- a/docs/crio.conf.5.md
+++ b/docs/crio.conf.5.md
@@ -59,6 +59,10 @@ CRI-O reads its storage defaults from the containers-storage.conf(5) file locate
   Whether CRI-O should wipe containers after a reboot and images after an upgrade when the server starts. If set to false, one must run `crio wipe` to wipe the containers and images in these situations.
 
+**internal_repair**=false
+  InternalRepair is whether CRI-O should check if the container and image storage was corrupted after a sudden restart.
+  If it was, CRI-O also attempts to repair the storage.
+
 **clean_shutdown_file**="/var/lib/crio/clean.shutdown"
   Location for CRI-O to lay down the clean shutdown file.
   It is used to check whether crio had time to sync before shutting down.
diff --git a/internal/criocli/criocli.go b/internal/criocli/criocli.go
index e15ae4a1f71..d10526788c7 100644
--- a/internal/criocli/criocli.go
+++ b/internal/criocli/criocli.go
@@ -327,6 +327,9 @@ func mergeConfig(config *libconfig.Config, ctx *cli.Context) error {
 	if ctx.IsSet("internal-wipe") {
 		config.InternalWipe = ctx.Bool("internal-wipe")
 	}
+	if ctx.IsSet("internal-repair") {
+		config.InternalRepair = ctx.Bool("internal-repair")
+	}
 	if ctx.IsSet("enable-metrics") {
 		config.EnableMetrics = ctx.Bool("enable-metrics")
 	}
@@ -1078,6 +1081,12 @@ func getCrioFlags(defConf *libconfig.Config) []cli.Flag {
 			Value:   defConf.InternalWipe,
 			EnvVars: []string{"CONTAINER_INTERNAL_WIPE"},
 		},
+		&cli.BoolFlag{
+			Name:    "internal-repair",
+			Usage:   "If true, CRI-O will check if the container and image storage was corrupted after a sudden restart, and attempt to repair the storage if it was.",
+			EnvVars: []string{"CONTAINER_INTERNAL_REPAIR"},
+			Value:   defConf.InternalRepair,
+		},
 		&cli.StringFlag{
 			Name:  "infra-ctr-cpuset",
 			Usage: "CPU set to run infra containers, if not specified CRI-O will use all online CPUs to run infra containers.",
diff --git a/internal/lib/container_server.go b/internal/lib/container_server.go
index 469408fc629..dc6df88eebc 100644
--- a/internal/lib/container_server.go
+++ b/internal/lib/container_server.go
@@ -103,7 +103,7 @@ func New(ctx context.Context, configIface libconfig.Iface) (*ContainerServer, er
 		return nil, fmt.Errorf("cannot create container server: interface is nil")
 	}
 
-	if ShutdownWasUnclean(config) {
+	if config.InternalRepair && ShutdownWasUnclean(config) {
 		// Check storage for corruption; if the check itself fails, wipe.
 		report, err := store.Check(cstorage.CheckEverything())
 		if err != nil {
diff --git a/pkg/config/config.go b/pkg/config/config.go
index 2d0c44e3755..67c503bb0a3 100644
--- a/pkg/config/config.go
+++ b/pkg/config/config.go
@@ -168,6 +168,9 @@ type RootConfig struct {
 	// If set to false, one must use the external command `crio wipe` to wipe the containers and images in these situations.
 	// The option InternalWipe is deprecated, and will be removed in a future release.
 	InternalWipe bool `toml:"internal_wipe"`
+
+	// InternalRepair is whether CRI-O should check for storage corruption after a sudden restart, and repair it if found.
+	InternalRepair bool `toml:"internal_repair"`
 }
 
 // GetStore returns the container storage for a given configuration
@@ -810,6 +813,7 @@ func DefaultConfig() (*Config, error) {
 			VersionFile:       CrioVersionPathTmp,
 			CleanShutdownFile: CrioCleanShutdownFile,
 			InternalWipe:      true,
+			InternalRepair:    false,
 		},
 		APIConfig: APIConfig{
 			Listen: CrioSocketPath,
diff --git a/pkg/config/template.go b/pkg/config/template.go
index ed24480d975..1b0cfc07565 100644
--- a/pkg/config/template.go
+++ b/pkg/config/template.go
@@ -145,6 +145,11 @@ func initCrioTemplateConfig(c *Config) ([]*templateConfigValue, error) {
 			group:          crioRootConfig,
 			isDefaultValue: simpleEqual(dc.InternalWipe, c.InternalWipe),
 		},
+		{
+			templateString: templateStringCrioInternalRepair,
+			group:          crioRootConfig,
+			isDefaultValue: simpleEqual(dc.InternalRepair, c.InternalRepair),
+		},
 		{
 			templateString: templateStringCrioCleanShutdownFile,
 			group:          crioRootConfig,
@@ -764,6 +769,12 @@ const templateStringCrioInternalWipe = `# InternalWipe is whether CRI-O should w
 
 `
 
+const templateStringCrioInternalRepair = `# InternalRepair is whether CRI-O should check if the container and image storage was corrupted after a sudden restart.
+# If it was, CRI-O also attempts to repair the storage.
+{{ $.Comment }}internal_repair = {{ .InternalRepair }}
+
+`
+
 const templateStringCrioAPI = `# The crio.api table contains settings for the kubelet/gRPC interface.
 [crio.api]
 
diff --git a/test/crio-wipe.bats b/test/crio-wipe.bats
index 472b5d35610..0ecf12cb3fe 100644
--- a/test/crio-wipe.bats
+++ b/test/crio-wipe.bats
@@ -332,7 +332,7 @@ function start_crio_with_stopped_pod() {
 	# Since the clean shutdown supported file is created,
 	# but the clean shutdown file is absent, we will do the
 	# c/storage check/repair.
-	start_crio_no_setup
+	CONTAINER_INTERNAL_REPAIR=true start_crio_no_setup
 
 	# Since one of its layers was removed, the image is corrupted, so we expect
 	# the repair step to have deleted it.
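---

Enabling the feature: internal_repair defaults to false, so the check/repair
path only runs when it is turned on. A minimal sketch, assuming CRI-O's usual
drop-in configuration directory /etc/crio/crio.conf.d/ (the file name here is
illustrative):

    # /etc/crio/crio.conf.d/10-internal-repair.conf
    [crio]
    internal_repair = true

The flag and environment variable added in internal/criocli/criocli.go above
are equivalent:

    crio --internal-repair
    CONTAINER_INTERNAL_REPAIR=true crio

On the first start after an unclean shutdown (the clean-shutdown ".supported"
marker exists but the clean-shutdown file itself is missing), CRI-O then runs
the c/storage check, attempts a repair, and falls back to wiping the storage
root if either step fails.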