diff --git a/cmd/crio/wipe.go b/cmd/crio/wipe.go index e633af1ed86..44a7569f5aa 100644 --- a/cmd/crio/wipe.go +++ b/cmd/crio/wipe.go @@ -51,6 +51,15 @@ func crioWipe(c *cli.Context) error { } } + // If crio is configured to wipe internally (and `--force` wasn't set) + // the `crio wipe` command has nothing left to do, + // as the remaining work will be done on server startup. + if config.InternalWipe && !c.IsSet("force") { + return nil + } + + logrus.Infof("Internal wipe not set, meaning crio wipe will wipe. In the future, all wipes after reboot will happen when starting the crio server.") + // if we should not wipe, exit with no error if !shouldWipeContainers { // we should not wipe images without wiping containers diff --git a/completions/bash/crio b/completions/bash/crio index 50385138020..3e53048e531 100755 --- a/completions/bash/crio +++ b/completions/bash/crio @@ -47,6 +47,7 @@ h --image-volumes --infra-ctr-cpuset --insecure-registry +--internal-wipe --irqbalance-config-file --listen --log diff --git a/completions/fish/crio.fish b/completions/fish/crio.fish index e1591d74f7d..e56a20dacf5 100644 --- a/completions/fish/crio.fish +++ b/completions/fish/crio.fish @@ -85,6 +85,7 @@ complete -c crio -n '__fish_crio_no_subcommand' -f -l insecure-registry -r -d 'E be enabled for testing purposes**. For increased security, users should add their CA to their system\'s list of trusted CAs instead of using \'--insecure-registry\'.' +complete -c crio -n '__fish_crio_no_subcommand' -f -l internal-wipe -d 'Whether CRI-O should wipe containers after a reboot and images after an upgrade when the server starts. If set to false, one must run `crio wipe` to wipe the containers and images in these situations.' complete -c crio -n '__fish_crio_no_subcommand' -f -l irqbalance-config-file -r -d 'The irqbalance service config file which is used by CRI-O.' complete -c crio -n '__fish_crio_no_subcommand' -l listen -r -d 'Path to the CRI-O socket' complete -c crio -n '__fish_crio_no_subcommand' -l log -r -d 'Set the log file path where internal debug information is written' diff --git a/completions/zsh/_crio b/completions/zsh/_crio index 233e0c8c41e..991c0b6b43c 100644 --- a/completions/zsh/_crio +++ b/completions/zsh/_crio @@ -7,7 +7,7 @@ it later with **--config**. Global options will modify the output.' 
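The cmd/crio/wipe.go hunk above makes `crio wipe` exit early when the server is configured to wipe internally, unless the caller passes `--force`. A minimal, self-contained sketch of just that decision, assuming the semantics shown in the hunk; shouldWipeExternally, internalWipe and forceSet are illustrative names, not crio symbols:

package main

import "fmt"

// shouldWipeExternally mirrors the early return added to crioWipe: when
// internal wipe is enabled and --force was not given, the external wipe
// command has nothing to do because the server wipes on startup.
func shouldWipeExternally(internalWipe, forceSet bool) bool {
    return !internalWipe || forceSet
}

func main() {
    fmt.Println(shouldWipeExternally(true, false))  // false: server handles the wipe
    fmt.Println(shouldWipeExternally(true, true))   // true: --force overrides
    fmt.Println(shouldWipeExternally(false, false)) // true: legacy `crio wipe` behaviour
}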
'version:dis _describe 'commands' cmds local -a opts - opts=('--additional-devices' '--apparmor-profile' '--big-files-temporary-dir' '--bind-mount-prefix' '--cgroup-manager' '--cni-config-dir' '--cni-default-network' '--cni-plugin-dir' '--config' '--config-dir' '--conmon' '--conmon-cgroup' '--conmon-env' '--container-attach-socket-dir' '--container-exits-dir' '--ctr-stop-timeout' '--decryption-keys-path' '--default-capabilities' '--default-env' '--default-mounts-file' '--default-runtime' '--default-sysctls' '--default-transport' '--default-ulimits' '--drop-infra-ctr' '--enable-metrics' '--enable-profile-unix-socket' '--gid-mappings' '--global-auth-file' '--grpc-max-recv-msg-size' '--grpc-max-send-msg-size' '--hooks-dir' '--image-volumes' '--infra-ctr-cpuset' '--insecure-registry' '--irqbalance-config-file' '--listen' '--log' '--log-dir' '--log-filter' '--log-format' '--log-journald' '--log-level' '--log-size-max' '--manage-ns-lifecycle' '--metrics-port' '--metrics-socket' '--namespaces-dir' '--no-pivot' '--pause-command' '--pause-image' '--pause-image-auth-file' '--pids-limit' '--pinns-path' '--profile' '--profile-port' '--read-only' '--registries-conf' '--registry' '--root' '--runroot' '--runtimes' '--seccomp-profile' '--seccomp-use-default-when-empty' '--selinux' '--separate-pull-cgroup' '--signature-policy' '--storage-driver' '--storage-opt' '--stream-address' '--stream-enable-tls' '--stream-idle-timeout' '--stream-port' '--stream-tls-ca' '--stream-tls-cert' '--stream-tls-key' '--uid-mappings' '--version-file' '--version-file-persist' '--help' '--version') + opts=('--additional-devices' '--apparmor-profile' '--big-files-temporary-dir' '--bind-mount-prefix' '--cgroup-manager' '--cni-config-dir' '--cni-default-network' '--cni-plugin-dir' '--config' '--config-dir' '--conmon' '--conmon-cgroup' '--conmon-env' '--container-attach-socket-dir' '--container-exits-dir' '--ctr-stop-timeout' '--decryption-keys-path' '--default-capabilities' '--default-env' '--default-mounts-file' '--default-runtime' '--default-sysctls' '--default-transport' '--default-ulimits' '--drop-infra-ctr' '--enable-metrics' '--enable-profile-unix-socket' '--gid-mappings' '--global-auth-file' '--grpc-max-recv-msg-size' '--grpc-max-send-msg-size' '--hooks-dir' '--image-volumes' '--infra-ctr-cpuset' '--insecure-registry' '--internal-wipe' '--irqbalance-config-file' '--listen' '--log' '--log-dir' '--log-filter' '--log-format' '--log-journald' '--log-level' '--log-size-max' '--manage-ns-lifecycle' '--metrics-port' '--metrics-socket' '--namespaces-dir' '--no-pivot' '--pause-command' '--pause-image' '--pause-image-auth-file' '--pids-limit' '--pinns-path' '--profile' '--profile-port' '--read-only' '--registries-conf' '--registry' '--root' '--runroot' '--runtimes' '--seccomp-profile' '--seccomp-use-default-when-empty' '--selinux' '--separate-pull-cgroup' '--signature-policy' '--storage-driver' '--storage-opt' '--stream-address' '--stream-enable-tls' '--stream-idle-timeout' '--stream-port' '--stream-tls-ca' '--stream-tls-cert' '--stream-tls-key' '--uid-mappings' '--version-file' '--version-file-persist' '--help' '--version') _describe 'global options' opts return diff --git a/docs/crio.8.md b/docs/crio.8.md index 1b118364fca..b31c670fe66 100644 --- a/docs/crio.8.md +++ b/docs/crio.8.md @@ -47,6 +47,7 @@ crio [--image-volumes]=[value] [--infra-ctr-cpuset]=[value] [--insecure-registry]=[value] +[--internal-wipe] [--irqbalance-config-file]=[value] [--listen]=[value] [--log-dir]=[value] @@ -230,6 +231,8 @@ crio [GLOBAL OPTIONS] command 
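The --internal-wipe flag documented here defaults to the value read from crio.conf and only overrides it when explicitly set, which is what the mergeConfig and getCrioFlags hunks in internal/criocli/criocli.go below do. A pared-down sketch of that pattern with urfave/cli v2; the config struct and flag wiring are illustrative, not the actual crio code:

package main

import (
    "fmt"
    "os"

    "github.com/urfave/cli/v2"
)

// config stands in for the CRI-O RootConfig; InternalWipe would normally be
// loaded from crio.conf before the CLI flags are merged on top of it.
type config struct{ InternalWipe bool }

func main() {
    cfg := &config{InternalWipe: true} // pretend this came from the TOML config

    app := &cli.App{
        Flags: []cli.Flag{
            &cli.BoolFlag{
                Name:    "internal-wipe",
                Usage:   "wipe containers and images from within the server",
                Value:   cfg.InternalWipe, // TOML value becomes the flag default
                EnvVars: []string{"CONTAINER_INTERNAL_WIPE"},
            },
        },
        Action: func(ctx *cli.Context) error {
            // Only override the config when the flag was explicitly set,
            // mirroring the ctx.IsSet("internal-wipe") guard in mergeConfig.
            if ctx.IsSet("internal-wipe") {
                cfg.InternalWipe = ctx.Bool("internal-wipe")
            }
            fmt.Println("internal wipe:", cfg.InternalWipe)
            return nil
        },
    }
    if err := app.Run(os.Args); err != nil {
        fmt.Fprintln(os.Stderr, err)
        os.Exit(1)
    }
}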
[COMMAND OPTIONS] [ARGUMENTS...] their CA to their system's list of trusted CAs instead of using '--insecure-registry'. (default: []) +**--internal-wipe**: Whether CRI-O should wipe containers after a reboot and images after an upgrade when the server starts. If set to false, one must run `crio wipe` to wipe the containers and images in these situations. + **--irqbalance-config-file**="": The irqbalance service config file which is used by CRI-O. (default: /etc/sysconfig/irqbalance) **--listen**="": Path to the CRI-O socket (default: /var/run/crio/crio.sock) diff --git a/docs/crio.conf.5.md b/docs/crio.conf.5.md index 21d42931d8b..d85d9a1d5eb 100644 --- a/docs/crio.conf.5.md +++ b/docs/crio.conf.5.md @@ -54,6 +54,10 @@ CRI-O reads its storage defaults from the containers-storage.conf(5) file locate It is used to check if crio wipe should wipe images, which should only happen when CRI-O has been upgraded +**internal_wipe**=false + Whether CRI-O should wipe containers after a reboot and images after an upgrade when the server starts. + If set to false, one must run `crio wipe` to wipe the containers and images in these situations. + ## CRIO.API TABLE The `crio.api` table contains settings for the kubelet/gRPC interface. diff --git a/internal/criocli/criocli.go b/internal/criocli/criocli.go index 1be667a14e4..3bc04eea53a 100644 --- a/internal/criocli/criocli.go +++ b/internal/criocli/criocli.go @@ -284,6 +284,9 @@ func mergeConfig(config *libconfig.Config, ctx *cli.Context) error { if ctx.IsSet("version-file-persist") { config.VersionFilePersist = ctx.String("version-file-persist") } + if ctx.IsSet("internal-wipe") { + config.InternalWipe = ctx.Bool("internal-wipe") + } if ctx.IsSet("enable-metrics") { config.EnableMetrics = ctx.Bool("enable-metrics") } @@ -827,6 +830,12 @@ func getCrioFlags(defConf *libconfig.Config) []cli.Flag { EnvVars: []string{"CONTAINER_VERSION_FILE_PERSIST"}, TakesFile: true, }, + &cli.BoolFlag{ + Name: "internal-wipe", + Usage: "Whether CRI-O should wipe containers after a reboot and images after an upgrade when the server starts. 
If set to false, one must run `crio wipe` to wipe the containers and images in these situations.", + Value: defConf.InternalWipe, + EnvVars: []string{"CONTAINER_INTERNAL_WIPE"}, + }, &cli.StringFlag{ Name: "infra-ctr-cpuset", Usage: "CPU set to run infra containers, if not specified CRI-O will use all online CPUs to run infra containers (default: '').", diff --git a/internal/lib/container_server.go b/internal/lib/container_server.go index 5a67ea7aaf9..870922c925e 100644 --- a/internal/lib/container_server.go +++ b/internal/lib/container_server.go @@ -144,23 +144,23 @@ func New(ctx context.Context, configIface libconfig.Iface) (*ContainerServer, er } // LoadSandbox loads a sandbox from the disk into the sandbox store -func (c *ContainerServer) LoadSandbox(id string) (retErr error) { +func (c *ContainerServer) LoadSandbox(id string) (sb *sandbox.Sandbox, retErr error) { config, err := c.store.FromContainerDirectory(id, "config.json") if err != nil { - return err + return nil, err } var m rspec.Spec if err := json.Unmarshal(config, &m); err != nil { - return errors.Wrap(err, "error unmarshalling sandbox spec") + return nil, errors.Wrap(err, "error unmarshalling sandbox spec") } labels := make(map[string]string) if err := json.Unmarshal([]byte(m.Annotations[annotations.Labels]), &labels); err != nil { - return errors.Wrapf(err, "error unmarshalling %s annotation", annotations.Labels) + return nil, errors.Wrapf(err, "error unmarshalling %s annotation", annotations.Labels) } name := m.Annotations[annotations.Name] name, err = c.ReservePodName(id, name) if err != nil { - return err + return nil, err } defer func() { if retErr != nil { @@ -169,7 +169,7 @@ func (c *ContainerServer) LoadSandbox(id string) (retErr error) { }() var metadata sandbox.Metadata if err := json.Unmarshal([]byte(m.Annotations[annotations.Metadata]), &metadata); err != nil { - return errors.Wrapf(err, "error unmarshalling %s annotation", annotations.Metadata) + return nil, errors.Wrapf(err, "error unmarshalling %s annotation", annotations.Metadata) } processLabel := m.Process.SelinuxLabel @@ -179,29 +179,29 @@ func (c *ContainerServer) LoadSandbox(id string) (retErr error) { kubeAnnotations := make(map[string]string) if err := json.Unmarshal([]byte(m.Annotations[annotations.Annotations]), &kubeAnnotations); err != nil { - return errors.Wrapf(err, "error unmarshalling %s annotation", annotations.Annotations) + return nil, errors.Wrapf(err, "error unmarshalling %s annotation", annotations.Annotations) } portMappings := []*hostport.PortMapping{} if err := json.Unmarshal([]byte(m.Annotations[annotations.PortMappings]), &portMappings); err != nil { - return errors.Wrapf(err, "error unmarshalling %s annotation", annotations.PortMappings) + return nil, errors.Wrapf(err, "error unmarshalling %s annotation", annotations.PortMappings) } privileged := isTrue(m.Annotations[annotations.PrivilegedRuntime]) hostNetwork := isTrue(m.Annotations[annotations.HostNetwork]) nsOpts := sandbox.NamespaceOption{} if err := json.Unmarshal([]byte(m.Annotations[annotations.NamespaceOptions]), &nsOpts); err != nil { - return errors.Wrapf(err, "error unmarshalling %s annotation", annotations.NamespaceOptions) + return nil, errors.Wrapf(err, "error unmarshalling %s annotation", annotations.NamespaceOptions) } created, err := time.Parse(time.RFC3339Nano, m.Annotations[annotations.Created]) if err != nil { - return errors.Wrap(err, "parsing created timestamp annotation") + return nil, errors.Wrap(err, "parsing created timestamp annotation") } - sb, err := 
sandbox.New(id, m.Annotations[annotations.Namespace], name, m.Annotations[annotations.KubeName], filepath.Dir(m.Annotations[annotations.LogPath]), labels, kubeAnnotations, processLabel, mountLabel, &metadata, m.Annotations[annotations.ShmPath], m.Annotations[annotations.CgroupParent], privileged, m.Annotations[annotations.RuntimeHandler], m.Annotations[annotations.ResolvPath], m.Annotations[annotations.HostName], portMappings, hostNetwork, created, m.Annotations[crioann.UsernsModeAnnotation]) + sb, err = sandbox.New(id, m.Annotations[annotations.Namespace], name, m.Annotations[annotations.KubeName], filepath.Dir(m.Annotations[annotations.LogPath]), labels, kubeAnnotations, processLabel, mountLabel, &metadata, m.Annotations[annotations.ShmPath], m.Annotations[annotations.CgroupParent], privileged, m.Annotations[annotations.RuntimeHandler], m.Annotations[annotations.ResolvPath], m.Annotations[annotations.HostName], portMappings, hostNetwork, created, m.Annotations[crioann.UsernsModeAnnotation]) if err != nil { - return err + return nil, err } sb.AddHostnamePath(m.Annotations[annotations.HostnamePath]) sb.SetSeccompProfilePath(spp) @@ -209,35 +209,26 @@ func (c *ContainerServer) LoadSandbox(id string) (retErr error) { // We add an NS only if we can load a permanent one. // Otherwise, the sandbox will live in the host namespace. - if c.config.ManageNSLifecycle { - netNsPath, err := configNsPath(&m, rspec.NetworkNamespace) - if err == nil { - if nsErr := sb.NetNsJoin(netNsPath); nsErr != nil { - return nsErr - } - } - ipcNsPath, err := configNsPath(&m, rspec.IPCNamespace) - if err == nil { - if nsErr := sb.IpcNsJoin(ipcNsPath); nsErr != nil { - return nsErr - } - } - utsNsPath, err := configNsPath(&m, rspec.UTSNamespace) + namespacesToJoin := []struct { + rspecNS rspec.LinuxNamespaceType + joinFunc func(string) error + }{ + {rspecNS: rspec.NetworkNamespace, joinFunc: sb.NetNsJoin}, + {rspecNS: rspec.IPCNamespace, joinFunc: sb.IpcNsJoin}, + {rspecNS: rspec.UTSNamespace, joinFunc: sb.UtsNsJoin}, + {rspecNS: rspec.UserNamespace, joinFunc: sb.UserNsJoin}, + } + for _, namespaceToJoin := range namespacesToJoin { + path, err := configNsPath(&m, namespaceToJoin.rspecNS) if err == nil { - if nsErr := sb.UtsNsJoin(utsNsPath); nsErr != nil { - return nsErr - } - } - userNsPath, err := configNsPath(&m, rspec.UserNamespace) - if err == nil { - if nsErr := sb.UserNsJoin(userNsPath); nsErr != nil { - return nsErr + if nsErr := namespaceToJoin.joinFunc(path); nsErr != nil { + return sb, nsErr } } } if err := c.AddSandbox(sb); err != nil { - return err + return sb, err } defer func() { @@ -250,19 +241,19 @@ func (c *ContainerServer) LoadSandbox(id string) (retErr error) { sandboxPath, err := c.store.ContainerRunDirectory(id) if err != nil { - return err + return sb, err } sandboxDir, err := c.store.ContainerDirectory(id) if err != nil { - return err + return sb, err } cID := m.Annotations[annotations.ContainerID] cname, err := c.ReserveContainerName(cID, m.Annotations[annotations.ContainerName]) if err != nil { - return err + return sb, err } defer func() { if retErr != nil { @@ -282,7 +273,7 @@ func (c *ContainerServer) LoadSandbox(id string) (retErr error) { if !wasSpoofed { scontainer, err = oci.NewContainer(m.Annotations[annotations.ContainerID], cname, sandboxPath, m.Annotations[annotations.LogPath], labels, m.Annotations, kubeAnnotations, m.Annotations[annotations.Image], "", "", nil, id, false, false, false, sb.RuntimeHandler(), sandboxDir, created, m.Annotations["org.opencontainers.image.stopSignal"]) 
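The signature change above turns LoadSandbox(id string) error into LoadSandbox(id string) (*sandbox.Sandbox, error): once the sandbox object exists, most failure paths now return it alongside the error instead of dropping it. A rough, self-contained illustration of why a caller benefits from that shape; fakeSandbox and loadSandbox are stand-ins, not crio types:

package main

import (
    "errors"
    "fmt"
)

type fakeSandbox struct{ id string }

// loadSandbox mimics the refactored behaviour: even when loading fails after
// the sandbox has been constructed, the partially initialized sandbox is
// returned so the caller can still track or clean it up instead of leaking it.
func loadSandbox(id string) (*fakeSandbox, error) {
    if id == "" {
        return nil, errors.New("empty ID") // nothing was built yet
    }
    sb := &fakeSandbox{id: id}
    if id == "broken" {
        return sb, errors.New("could not join namespaces")
    }
    return sb, nil
}

func main() {
    for _, id := range []string{"ok", "broken", ""} {
        sb, err := loadSandbox(id)
        switch {
        case err != nil && sb != nil:
            fmt.Printf("partially loaded %q: %v\n", sb.id, err)
        case err != nil:
            fmt.Printf("failed before construction: %v\n", err)
        default:
            fmt.Printf("loaded %q\n", sb.id)
        }
    }
}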
if err != nil { - return err + return sb, err } scontainer.SetSpec(&m) scontainer.SetMountPoint(m.Annotations[annotations.MountPoint]) @@ -290,7 +281,7 @@ func (c *ContainerServer) LoadSandbox(id string) (retErr error) { if m.Annotations[annotations.Volumes] != "" { containerVolumes := []oci.ContainerVolume{} if err = json.Unmarshal([]byte(m.Annotations[annotations.Volumes]), &containerVolumes); err != nil { - return fmt.Errorf("failed to unmarshal container volumes: %v", err) + return sb, fmt.Errorf("failed to unmarshal container volumes: %v", err) } for _, cv := range containerVolumes { scontainer.AddVolume(cv) @@ -301,50 +292,28 @@ func (c *ContainerServer) LoadSandbox(id string) (retErr error) { } if err := c.ContainerStateFromDisk(scontainer); err != nil { - return fmt.Errorf("error reading sandbox state from disk %q: %v", scontainer.ID(), err) + return sb, fmt.Errorf("error reading sandbox state from disk %q: %v", scontainer.ID(), err) } // We write back the state because it is possible that crio did not have a chance to // read the exit file and persist exit code into the state on reboot. if err := c.ContainerStateToDisk(scontainer); err != nil { - return fmt.Errorf("failed to write container %q state to disk: %v", scontainer.ID(), err) + return sb, fmt.Errorf("failed to write container %q state to disk: %v", scontainer.ID(), err) } if err := sb.SetInfraContainer(scontainer); err != nil { - return err - } - - // We add an NS only if we can load a permanent one. - // Otherwise, the sandbox will live in the host namespace. - if c.config.ManageNSLifecycle || wasSpoofed { - namespacesToJoin := []struct { - rspecNS rspec.LinuxNamespaceType - joinFunc func(string) error - }{ - {rspecNS: rspec.NetworkNamespace, joinFunc: sb.NetNsJoin}, - {rspecNS: rspec.IPCNamespace, joinFunc: sb.IpcNsJoin}, - {rspecNS: rspec.UTSNamespace, joinFunc: sb.UtsNsJoin}, - {rspecNS: rspec.UserNamespace, joinFunc: sb.UserNsJoin}, - } - for _, namespaceToJoin := range namespacesToJoin { - path, err := configNsPath(&m, namespaceToJoin.rspecNS) - if err == nil { - if nsErr := namespaceToJoin.joinFunc(path); err != nil { - return nsErr - } - } - } + return sb, err } sb.SetCreated() if err := label.ReserveLabel(processLabel); err != nil { - return err + return sb, err } sb.RestoreStopped() if err := c.ctrIDIndex.Add(scontainer.ID()); err != nil { - return err + return sb, err } defer func() { if retErr != nil { @@ -354,9 +323,9 @@ func (c *ContainerServer) LoadSandbox(id string) (retErr error) { } }() if err := c.podIDIndex.Add(id); err != nil { - return err + return sb, err } - return nil + return sb, nil } func configNsPath(spec *rspec.Spec, nsType rspec.LinuxNamespaceType) (string, error) { diff --git a/internal/lib/container_server_test.go b/internal/lib/container_server_test.go index dee98b2de8a..22537957871 100644 --- a/internal/lib/container_server_test.go +++ b/internal/lib/container_server_test.go @@ -165,9 +165,10 @@ var _ = t.Describe("ContainerServer", func() { mockDirs(testManifest) // When - err := sut.LoadSandbox("id") + sb, err := sut.LoadSandbox("id") // Then + Expect(sb).NotTo(BeNil()) Expect(err).To(BeNil()) }) @@ -181,9 +182,10 @@ var _ = t.Describe("ContainerServer", func() { mockDirs(manifest) // When - err := sut.LoadSandbox("id") + sb, err := sut.LoadSandbox("id") // Then + Expect(sb).NotTo(BeNil()) Expect(err).To(BeNil()) }) @@ -197,9 +199,10 @@ var _ = t.Describe("ContainerServer", func() { mockDirs(manifest) // When - err := sut.LoadSandbox("id") + sb, err := sut.LoadSandbox("id") // Then + 
Expect(sb).NotTo(BeNil()) Expect(err).To(BeNil()) }) @@ -208,9 +211,10 @@ var _ = t.Describe("ContainerServer", func() { mockDirs(testManifest) // When - err := sut.LoadSandbox("") + sb, err := sut.LoadSandbox("") // Then + Expect(sb).NotTo(BeNil()) Expect(err).NotTo(BeNil()) }) @@ -223,9 +227,10 @@ var _ = t.Describe("ContainerServer", func() { mockDirs(manifest) // When - err := sut.LoadSandbox("id") + sb, err := sut.LoadSandbox("id") // Then + Expect(sb).NotTo(BeNil()) Expect(err).NotTo(BeNil()) }) @@ -238,9 +243,10 @@ var _ = t.Describe("ContainerServer", func() { mockDirs(manifest) // When - err := sut.LoadSandbox("id") + sb, err := sut.LoadSandbox("id") // Then + Expect(sb).NotTo(BeNil()) Expect(err).NotTo(BeNil()) }) @@ -257,9 +263,10 @@ var _ = t.Describe("ContainerServer", func() { ) // When - err := sut.LoadSandbox("id") + sb, err := sut.LoadSandbox("id") // Then + Expect(sb).NotTo(BeNil()) Expect(err).NotTo(BeNil()) }) @@ -274,9 +281,10 @@ var _ = t.Describe("ContainerServer", func() { ) // When - err := sut.LoadSandbox("id") + sb, err := sut.LoadSandbox("id") // Then + Expect(sb).NotTo(BeNil()) Expect(err).NotTo(BeNil()) }) @@ -293,9 +301,10 @@ var _ = t.Describe("ContainerServer", func() { ) // When - err := sut.LoadSandbox("id") + sb, err := sut.LoadSandbox("id") // Then + Expect(sb).To(BeNil()) Expect(err).NotTo(BeNil()) }) @@ -312,9 +321,10 @@ var _ = t.Describe("ContainerServer", func() { ) // When - err := sut.LoadSandbox("id") + sb, err := sut.LoadSandbox("id") // Then + Expect(sb).To(BeNil()) Expect(err).NotTo(BeNil()) }) @@ -331,9 +341,10 @@ var _ = t.Describe("ContainerServer", func() { ) // When - err := sut.LoadSandbox("id") + sb, err := sut.LoadSandbox("id") // Then + Expect(sb).To(BeNil()) Expect(err).NotTo(BeNil()) }) @@ -350,9 +361,10 @@ var _ = t.Describe("ContainerServer", func() { ) // When - err := sut.LoadSandbox("id") + sb, err := sut.LoadSandbox("id") // Then + Expect(sb).To(BeNil()) Expect(err).NotTo(BeNil()) }) @@ -369,9 +381,10 @@ var _ = t.Describe("ContainerServer", func() { ) // When - err := sut.LoadSandbox("id") + sb, err := sut.LoadSandbox("id") // Then + Expect(sb).To(BeNil()) Expect(err).NotTo(BeNil()) }) @@ -388,9 +401,10 @@ var _ = t.Describe("ContainerServer", func() { ) // When - err := sut.LoadSandbox("id") + sb, err := sut.LoadSandbox("id") // Then + Expect(sb).NotTo(BeNil()) Expect(err).NotTo(BeNil()) }) @@ -403,9 +417,10 @@ var _ = t.Describe("ContainerServer", func() { ) // When - err := sut.LoadSandbox("id") + sb, err := sut.LoadSandbox("id") // Then + Expect(sb).To(BeNil()) Expect(err).NotTo(BeNil()) }) }) diff --git a/internal/lib/remove.go b/internal/lib/remove.go index 15080572f05..dd06e539791 100644 --- a/internal/lib/remove.go +++ b/internal/lib/remove.go @@ -23,8 +23,7 @@ func (c *ContainerServer) Remove(ctx context.Context, container string, force bo return "", errors.Errorf("cannot remove paused container %s", ctrID) case oci.ContainerStateCreated, oci.ContainerStateRunning: if force { - _, err = c.ContainerStop(ctx, container, 10) - if err != nil { + if err = c.StopContainer(ctx, ctr, 10); err != nil { return "", errors.Wrapf(err, "unable to stop container %s", ctrID) } } else { diff --git a/internal/lib/sandbox/sandbox.go b/internal/lib/sandbox/sandbox.go index 38311bc48ec..b2841228d97 100644 --- a/internal/lib/sandbox/sandbox.go +++ b/internal/lib/sandbox/sandbox.go @@ -369,6 +369,11 @@ func (s *Sandbox) createFileInInfraDir(filename string) error { return nil } infra := s.InfraContainer() + // If the infra directory has 
been cleaned up already, we should not fail to + // create this file. + if _, err := os.Stat(infra.Dir()); os.IsNotExist(err) { + return nil + } f, err := os.Create(filepath.Join(infra.Dir(), filename)) if err == nil { f.Close() diff --git a/internal/lib/stop.go b/internal/lib/stop.go index 13b08a2c152..533d123a608 100644 --- a/internal/lib/stop.go +++ b/internal/lib/stop.go @@ -9,29 +9,22 @@ import ( ) // ContainerStop stops a running container with a grace period (i.e., timeout). -func (c *ContainerServer) ContainerStop(ctx context.Context, container string, timeout int64) (string, error) { - ctr, err := c.LookupContainer(container) - if err != nil { - return "", errors.Wrapf(err, "failed to find container %s", container) - } - ctrID := ctr.ID() - - err = c.runtime.StopContainer(ctx, ctr, timeout) - if err != nil { +func (c *ContainerServer) StopContainer(ctx context.Context, ctr *oci.Container, timeout int64) error { + if err := c.runtime.StopContainer(ctx, ctr, timeout); err != nil { // only fatally error if the error is not that the container was already stopped // we still want to write container state to disk if the container has already // been stopped if err != oci.ErrContainerStopped { - return "", errors.Wrapf(err, "failed to stop container %s", ctrID) + return errors.Wrapf(err, "failed to stop container %s", ctr.ID()) } } else { // we only do these operations if StopContainer didn't fail (even if the failure // was the container already being stopped) if err := c.runtime.WaitContainerStateStopped(ctx, ctr); err != nil { - return "", errors.Wrapf(err, "failed to get container 'stopped' status %s", ctrID) + return errors.Wrapf(err, "failed to get container 'stopped' status %s", ctr.ID()) } - if err := c.storageRuntimeServer.StopContainer(ctrID); err != nil { - return "", errors.Wrapf(err, "failed to unmount container %s", ctrID) + if err := c.storageRuntimeServer.StopContainer(ctr.ID()); err != nil { + return errors.Wrapf(err, "failed to unmount container %s", ctr.ID()) } } @@ -39,5 +32,5 @@ func (c *ContainerServer) ContainerStop(ctx context.Context, container string, t logrus.Warnf("unable to write containers %s state to disk: %v", ctr.ID(), err) } - return ctrID, nil + return nil } diff --git a/internal/lib/stop_test.go b/internal/lib/stop_test.go deleted file mode 100644 index d41a4fafe09..00000000000 --- a/internal/lib/stop_test.go +++ /dev/null @@ -1,26 +0,0 @@ -package lib_test - -import ( - "context" - - . "github.com/onsi/ginkgo" - . 
"github.com/onsi/gomega" -) - -// The actual test suite -var _ = t.Describe("ContainerServer", func() { - // Prepare the sut - BeforeEach(beforeEach) - - t.Describe("Stop", func() { - It("should fail on invalid container ID", func() { - // Given - // When - res, err := sut.ContainerStop(context.Background(), "", 0) - - // Then - Expect(err).NotTo(BeNil()) - Expect(res).To(BeEmpty()) - }) - }) -}) diff --git a/internal/oci/runtime_oci.go b/internal/oci/runtime_oci.go index 69b30e7d255..fcf649062d7 100644 --- a/internal/oci/runtime_oci.go +++ b/internal/oci/runtime_oci.go @@ -404,17 +404,6 @@ func (r *runtimeOCI) constructExecCommand(ctx context.Context, c *Container, pro return execCmd } -func createPidFile() (string, error) { - pidFile, err := ioutil.TempFile("", "pidfile") - if err != nil { - return "", err - } - pidFile.Close() - pidFileName := pidFile.Name() - - return pidFileName, nil -} - func killContainerExecProcess(ctx context.Context, pidFile string, cmd *exec.Cmd) { // Attempt to get the container PID and PGID from the file the runtime should have written. ctrPid, ctrPgid, err := pidAndpgidFromFile(pidFile) diff --git a/internal/resourcestore/resourcecleaner.go b/internal/resourcestore/resourcecleaner.go new file mode 100644 index 00000000000..37c9bb784fd --- /dev/null +++ b/internal/resourcestore/resourcecleaner.go @@ -0,0 +1,85 @@ +package resourcestore + +import ( + "context" + "time" + + "github.com/pkg/errors" + "k8s.io/apimachinery/pkg/util/wait" + + "github.com/cri-o/cri-o/internal/log" +) + +// ResourceCleaner is a structure that tracks +// how to cleanup a resource. +// CleanupFuncs can be added to it, and it can be told to +// Cleanup the resource +type ResourceCleaner struct { + funcs []cleanupFunc +} + +// A cleanupFunc is a function that cleans up one piece of +// the associated resource. +type cleanupFunc func() error + +// NewResourceCleaner creates a new ResourceCleaner +func NewResourceCleaner() *ResourceCleaner { + return &ResourceCleaner{} +} + +// Add adds a new CleanupFunc to the ResourceCleaner +func (r *ResourceCleaner) Add( + ctx context.Context, + description string, + fn func() error, +) { + // Create a retry task on top of the provided function + task := func() error { + err := retry(ctx, fn) + if err != nil { + log.Errorf(ctx, + "Retried cleanup function %q too often, giving up", + description, + ) + } + return err + } + + // Prepend reverse iterate by default + r.funcs = append([]cleanupFunc{task}, r.funcs...) +} + +// Cleanup cleans up the resource, running +// the cleanup funcs in opposite chronological order +func (r *ResourceCleaner) Cleanup() error { + for _, f := range r.funcs { + if err := f(); err != nil { + return err + } + } + return nil +} + +// retry attempts to execute fn up to defaultRetryTimes if its failure meets +// retryCondition. 
+func retry(ctx context.Context, fn func() error) error { + backoff := wait.Backoff{ + Duration: 500 * time.Millisecond, + Factor: 1.5, + Steps: defaultRetryTimes, + } + + waitErr := wait.ExponentialBackoff(backoff, func() (bool, error) { + if err := fn(); err != nil { + log.Errorf(ctx, "Failed to cleanup (probably retrying): %v", err) + return false, nil + } + return true, nil + }) + + if waitErr != nil { + return errors.Wrap(waitErr, "wait on retry") + } + + return nil +} diff --git a/internal/resourcestore/resourcecleaner_defaults.go b/internal/resourcestore/resourcecleaner_defaults.go new file mode 100644 index 00000000000..0b1c475abff --- /dev/null +++ b/internal/resourcestore/resourcecleaner_defaults.go @@ -0,0 +1,7 @@ +// +build !test + +package resourcestore + +// defaultRetryTimes defines the amount of default retries for each cleanup +// function. +var defaultRetryTimes = 20 diff --git a/internal/resourcestore/resourcecleaner_test.go b/internal/resourcestore/resourcecleaner_test.go new file mode 100644 index 00000000000..54048ad93d0 --- /dev/null +++ b/internal/resourcestore/resourcecleaner_test.go @@ -0,0 +1,105 @@ +package resourcestore_test + +import ( + "errors" + + "github.com/cri-o/cri-o/internal/resourcestore" + . "github.com/onsi/ginkgo" + . "github.com/onsi/gomega" + "golang.org/x/net/context" +) + +// The actual test suite +var _ = t.Describe("ResourceCleaner", func() { + It("should call the cleanup functions", func() { + // Given + sut := resourcestore.NewResourceCleaner() + called1 := false + called2 := false + sut.Add(context.Background(), "test1", func() error { + called1 = true + return nil + }) + sut.Add(context.Background(), "test2", func() error { + called2 = true + return nil + }) + + // When + err := sut.Cleanup() + + // Then + Expect(err).To(BeNil()) + Expect(called1).To(BeTrue()) + Expect(called2).To(BeTrue()) + }) + + It("should retry the cleanup functions", func() { + // Given + sut := resourcestore.NewResourceCleaner() + called1 := false + called2 := false + sut.Add(context.Background(), "test1", func() error { + called1 = true + return nil + }) + failureCnt := 0 + sut.Add(context.Background(), "test2", func() error { + if failureCnt == 2 { + called2 = true + return nil + } + failureCnt++ + return errors.New("") + }) + + // When + err := sut.Cleanup() + + // Then + Expect(err).To(BeNil()) + Expect(called1).To(BeTrue()) + Expect(called2).To(BeTrue()) + Expect(failureCnt).To(Equal(2)) + }) + + It("should retry three times", func() { + // Given + sut := resourcestore.NewResourceCleaner() + failureCnt := 0 + sut.Add(context.Background(), "test", func() error { + failureCnt++ + return errors.New("") + }) + + // When + err := sut.Cleanup() + + // Then + Expect(err).NotTo(BeNil()) + Expect(failureCnt).To(Equal(3)) + }) + + It("should run in parallel", func() { + // Given + sut := resourcestore.NewResourceCleaner() + testChan := make(chan bool, 1) + succ := false + sut.Add(context.Background(), "test1", func() error { + testChan <- true + return nil + }) + sut.Add(context.Background(), "test2", func() error { + <-testChan + succ = true + return nil + }) + + // When + err := sut.Cleanup() + + // Then + Expect(err).To(BeNil()) + Expect(succ).To(BeTrue()) + }) +}) diff --git a/internal/resourcestore/resourcecleaner_test_inject.go b/internal/resourcestore/resourcecleaner_test_inject.go new file mode 100644 index 00000000000..d1fd315b4e3 --- /dev/null +++ b/internal/resourcestore/resourcecleaner_test_inject.go @@ -0,0 +1,9 @@ +// +build test +// All *_inject.go files 
are meant to be used by tests only. Purpose of this +// files is to provide a way to inject mocked data into the current setup. + +package resourcestore + +// defaultRetryTimes reduces the amount of default retries for testing +// purposes. +var defaultRetryTimes = 3 diff --git a/internal/resourcestore/resourcestore.go b/internal/resourcestore/resourcestore.go index 1d778600a32..abe73afd8ca 100644 --- a/internal/resourcestore/resourcestore.go +++ b/internal/resourcestore/resourcestore.go @@ -27,11 +27,11 @@ type ResourceStore struct { // as well as stores function pointers that pertain to how that resource should be cleaned up, // and keeps track of other requests that are watching for the successful creation of this resource. type Resource struct { - resource IdentifiableCreatable - cleanupFuncs []func() - watchers []chan struct{} - stale bool - name string + resource IdentifiableCreatable + cleaner *ResourceCleaner + watchers []chan struct{} + stale bool + name string } // wasPut checks that a resource has been fully defined yet. @@ -70,7 +70,7 @@ func NewWithTimeout(timeout time.Duration) *ResourceStore { // It runs on a loop, sleeping `sleepTimeBeforeCleanup` between each loop. // A resource will first be marked as stale before being cleaned up. // This means a resource will stay in the store between `sleepTimeBeforeCleanup` and `2*sleepTimeBeforeCleanup`. -// When a resource is cleaned up, it's removed from the store and its cleanupFuncs are called. +// When a resource is cleaned up, it's removed from the store and the cleanup funcs in its cleaner are called. func (rc *ResourceStore) cleanupStaleResources() { for { time.Sleep(rc.timeout) @@ -97,8 +97,8 @@ func (rc *ResourceStore) cleanupStaleResources() { for _, r := range resourcesToReap { logrus.Infof("cleaning up stale resource %s", r.name) - for _, f := range r.cleanupFuncs { - f() + if err := r.cleaner.Cleanup(); err != nil { + logrus.Errorf("Unable to cleanup: %v", err) } } } @@ -130,7 +130,7 @@ func (rc *ResourceStore) Get(name string) string { // a newly created resource, and functions to clean up that newly created resource. // It adds the Resource to the ResourceStore. It expects name to be unique, and // returns an error if a duplicate name is detected. -func (rc *ResourceStore) Put(name string, resource IdentifiableCreatable, cleanupFuncs []func()) error { +func (rc *ResourceStore) Put(name string, resource IdentifiableCreatable, cleaner *ResourceCleaner) error { rc.Lock() defer rc.Unlock() @@ -146,7 +146,7 @@ func (rc *ResourceStore) Put(name string, resource IdentifiableCreatable, cleanu } r.resource = resource - r.cleanupFuncs = cleanupFuncs + r.cleaner = cleaner r.name = name // now the resource is created, notify the watchers diff --git a/internal/resourcestore/resourcestore_test.go b/internal/resourcestore/resourcestore_test.go index e049221a24f..3696b50f753 100644 --- a/internal/resourcestore/resourcestore_test.go +++ b/internal/resourcestore/resourcestore_test.go @@ -6,6 +6,7 @@ import ( "github.com/cri-o/cri-o/internal/resourcestore" . "github.com/onsi/ginkgo" . 
"github.com/onsi/gomega" + "golang.org/x/net/context" ) var ( @@ -30,14 +31,14 @@ func (e *entry) SetCreated() { var _ = t.Describe("ResourceStore", func() { // Setup the test var ( - sut *resourcestore.ResourceStore - cleanupFuncs []func() - e *entry + sut *resourcestore.ResourceStore + cleaner *resourcestore.ResourceCleaner + e *entry ) Context("no timeout", func() { BeforeEach(func() { sut = resourcestore.New() - cleanupFuncs = make([]func(), 0) + cleaner = resourcestore.NewResourceCleaner() e = &entry{ id: testID, } @@ -46,7 +47,7 @@ var _ = t.Describe("ResourceStore", func() { // Given // When - Expect(sut.Put(testName, e, cleanupFuncs)).To(BeNil()) + Expect(sut.Put(testName, e, cleaner)).To(BeNil()) // Then id := sut.Get(testName) @@ -59,14 +60,14 @@ var _ = t.Describe("ResourceStore", func() { // Given // When - Expect(sut.Put(testName, e, cleanupFuncs)).To(BeNil()) + Expect(sut.Put(testName, e, cleaner)).To(BeNil()) // Then - Expect(sut.Put(testName, e, cleanupFuncs)).NotTo(BeNil()) + Expect(sut.Put(testName, e, cleaner)).NotTo(BeNil()) }) It("Get should call SetCreated", func() { // When - Expect(sut.Put(testName, e, cleanupFuncs)).To(BeNil()) + Expect(sut.Put(testName, e, cleaner)).To(BeNil()) // Then id := sut.Get(testName) @@ -92,7 +93,7 @@ var _ = t.Describe("ResourceStore", func() { } // When - Expect(sut.Put(testName, e, cleanupFuncs)).To(BeNil()) + Expect(sut.Put(testName, e, cleaner)).To(BeNil()) // Then Expect(waitWatcherSet(watcher1)).To(BeTrue()) Expect(waitWatcherSet(watcher2)).To(BeTrue()) @@ -100,7 +101,7 @@ var _ = t.Describe("ResourceStore", func() { }) Context("with timeout", func() { BeforeEach(func() { - cleanupFuncs = make([]func(), 0) + cleaner = resourcestore.NewResourceCleaner() e = &entry{ id: testID, } @@ -111,8 +112,9 @@ var _ = t.Describe("ResourceStore", func() { sut = resourcestore.NewWithTimeout(timeout) timedOutChan := make(chan bool) - cleanupFuncs = append(cleanupFuncs, func() { + cleaner.Add(context.Background(), "test", func() error { timedOutChan <- true + return nil }) go func() { time.Sleep(timeout * 3) @@ -120,7 +122,7 @@ var _ = t.Describe("ResourceStore", func() { }() // When - Expect(sut.Put(testName, e, cleanupFuncs)).To(BeNil()) + Expect(sut.Put(testName, e, cleaner)).To(BeNil()) // Then didStoreCallTimeoutFunc := <-timedOutChan @@ -141,7 +143,7 @@ var _ = t.Describe("ResourceStore", func() { // When go func() { time.Sleep(timeout * 6) - Expect(sut.Put(testName, e, cleanupFuncs)).To(BeNil()) + Expect(sut.Put(testName, e, cleaner)).To(BeNil()) timedOutChan <- true }() diff --git a/internal/storage/runtime.go b/internal/storage/runtime.go index d40ec02e1f0..1d7015561d0 100644 --- a/internal/storage/runtime.go +++ b/internal/storage/runtime.go @@ -71,11 +71,6 @@ type RuntimeServer interface { // Pointer arguments can be nil. Either the image name or ID can be // omitted, but not both. All other arguments are required. CreatePodSandbox(systemContext *types.SystemContext, podName, podID, imageName, imageAuthFile, imageID, containerName, metadataName, uid, namespace string, attempt uint32, idMappingsOptions *storage.IDMappingOptions, labelOptions []string, privileged bool) (ContainerInfo, error) - // RemovePodSandbox deletes a pod sandbox's infrastructure container. - // The CRI expects that a sandbox can't be removed unless its only - // container is its infrastructure container, but we don't enforce that - // here, since we're just keeping track of it for higher level APIs. 
- RemovePodSandbox(idOrName string) error // GetContainerMetadata returns the metadata we've stored for a container. GetContainerMetadata(idOrName string) (RuntimeContainerMetadata, error) @@ -382,34 +377,15 @@ func (r *runtimeService) deleteLayerIfMapped(imageID, layerID string) { } } -func (r *runtimeService) RemovePodSandbox(idOrName string) error { - container, err := r.storageImageServer.GetStore().Container(idOrName) - if err != nil { - if errors.Is(err, storage.ErrContainerUnknown) { - return ErrInvalidSandboxID - } - return err - } - layer, err := r.storageImageServer.GetStore().Layer(container.LayerID) - if err != nil { - logrus.Debugf("failed to retrieve layer %q: %v", container.LayerID, err) - } - err = r.storageImageServer.GetStore().DeleteContainer(container.ID) - if err != nil { - logrus.Debugf("failed to delete pod sandbox %q: %v", container.ID, err) - return err - } - if layer != nil { - r.deleteLayerIfMapped(container.ImageID, layer.Parent) - } - return nil -} - func (r *runtimeService) DeleteContainer(idOrName string) error { if idOrName == "" { return ErrInvalidContainerID } container, err := r.storageImageServer.GetStore().Container(idOrName) + // Already deleted + if errors.Is(err, storage.ErrContainerUnknown) { + return nil + } if err != nil { return err } diff --git a/internal/storage/runtime_test.go b/internal/storage/runtime_test.go index d4122c4d74b..6665732751b 100644 --- a/internal/storage/runtime_test.go +++ b/internal/storage/runtime_test.go @@ -467,79 +467,6 @@ var _ = t.Describe("Runtime", func() { }) }) - t.Describe("RemovePodSandbox", func() { - It("should succeed to remove the pod sandbox", func() { - // Given - gomock.InOrder( - imageServerMock.EXPECT().GetStore().Return(storeMock), - storeMock.EXPECT().Container(gomock.Any()). - Return(&cs.Container{}, nil), - imageServerMock.EXPECT().GetStore().Return(storeMock), - storeMock.EXPECT().Layer("").Return(nil, nil), - imageServerMock.EXPECT().GetStore().Return(storeMock), - storeMock.EXPECT().DeleteContainer(gomock.Any()). - Return(nil), - ) - - // When - err := sut.RemovePodSandbox("id") - - // Then - Expect(err).To(BeNil()) - }) - - It("should fail to remove the pod sandbox on store error", func() { - // Given - gomock.InOrder( - imageServerMock.EXPECT().GetStore().Return(storeMock), - storeMock.EXPECT().Container(gomock.Any()). - Return(nil, t.TestError), - ) - - // When - err := sut.RemovePodSandbox("id") - - // Then - Expect(err).NotTo(BeNil()) - }) - - It("should fail to remove the pod sandbox on invalid sandbox ID", func() { - // Given - gomock.InOrder( - imageServerMock.EXPECT().GetStore().Return(storeMock), - storeMock.EXPECT().Container(gomock.Any()). - Return(nil, cs.ErrContainerUnknown), - ) - - // When - err := sut.RemovePodSandbox("id") - - // Then - Expect(err).NotTo(BeNil()) - Expect(err).To(Equal(storage.ErrInvalidSandboxID)) - }) - - It("should fail to remove the pod sandbox on deletion error", func() { - // Given - gomock.InOrder( - imageServerMock.EXPECT().GetStore().Return(storeMock), - storeMock.EXPECT().Container(gomock.Any()). - Return(&cs.Container{}, nil), - imageServerMock.EXPECT().GetStore().Return(storeMock), - storeMock.EXPECT().Layer("").Return(nil, nil), - imageServerMock.EXPECT().GetStore().Return(storeMock), - storeMock.EXPECT().DeleteContainer(gomock.Any()). 
- Return(t.TestError), - ) - - // When - err := sut.RemovePodSandbox("id") - - // Then - Expect(err).NotTo(BeNil()) - }) - }) - t.Describe("CreateContainer/CreatePodSandbox", func() { t.Describe("success", func() { var ( diff --git a/pkg/config/config.go b/pkg/config/config.go index 11c07cdf7a0..920c184fd72 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -147,6 +147,10 @@ type RootConfig struct { // VersionFilePersist is the location CRI-O will lay down the version file // that checks whether we've upgraded VersionFilePersist string `toml:"version_file_persist"` + + // InternalWipe is whether CRI-O should wipe containers and images after a reboot when the server starts. + // If set to false, one must use the external command `crio wipe` to wipe the containers and images in these situations. + InternalWipe bool `toml:"internal_wipe"` } // RuntimeHandler represents each item of the "crio.runtime.runtimes" TOML diff --git a/pkg/config/template.go b/pkg/config/template.go index a49cde6b50d..c100e997612 100644 --- a/pkg/config/template.go +++ b/pkg/config/template.go @@ -62,6 +62,10 @@ version_file = "{{ .VersionFile }}" # only happen when CRI-O has been upgraded version_file_persist = "{{ .VersionFilePersist }}" +# InternalWipe is whether CRI-O should wipe containers and images after a reboot when the server starts. +# If set to false, one must use the external command 'crio wipe' to wipe the containers and images in these situations. +internal_wipe = {{ .InternalWipe }} + # The crio.api table contains settings for the kubelet/gRPC interface. [crio.api] diff --git a/server/container_create.go b/server/container_create.go index adcc8809dcd..d55af1c0573 100644 --- a/server/container_create.go +++ b/server/container_create.go @@ -15,6 +15,7 @@ import ( "github.com/containers/storage/pkg/stringid" "github.com/cri-o/cri-o/internal/lib/sandbox" "github.com/cri-o/cri-o/internal/log" + "github.com/cri-o/cri-o/internal/resourcestore" "github.com/cri-o/cri-o/internal/storage" "github.com/cri-o/cri-o/pkg/config" "github.com/cri-o/cri-o/pkg/container" @@ -28,6 +29,9 @@ import ( pb "k8s.io/cri-api/pkg/apis/runtime/v1alpha2" ) +// sync with https://github.com/containers/storage/blob/7fe03f6c765f2adbc75a5691a1fb4f19e56e7071/pkg/truncindex/truncindex.go#L92 +const noSuchID = "no such id" + type orderedMounts []rspec.Mount // Len returns the number of mounts. Used in sorting. 
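The server hunks that follow replace the ad-hoc cleanupFuncs slices in CreateContainer and runPodSandbox with the new ResourceCleaner. The stripped-down cleaner below shows the shape of that pattern: each step registers a named cleanup, Add prepends so Cleanup runs the functions in reverse order of registration, and the whole thing only fires when the request failed. It is a stand-in for illustration; the real internal/resourcestore.ResourceCleaner additionally retries every function with backoff:

package main

import (
    "errors"
    "fmt"
)

type cleaner struct{ funcs []func() error }

// Add registers a named cleanup step; prepending gives LIFO ordering, so
// resources are released in the opposite order of their acquisition.
func (c *cleaner) Add(description string, fn func() error) {
    wrapped := func() error {
        fmt.Println("cleanup:", description)
        return fn()
    }
    c.funcs = append([]func() error{wrapped}, c.funcs...)
}

func (c *cleaner) Cleanup() error {
    for _, fn := range c.funcs {
        if err := fn(); err != nil {
            return err
        }
    }
    return nil
}

// createContainer sketches the CreateContainer flow: register a cleanup after
// each acquired resource and run them all only if the request ultimately fails.
func createContainer() (retErr error) {
    c := &cleaner{}
    defer func() {
        if retErr == nil {
            return // success: keep the resources
        }
        if err := c.Cleanup(); err != nil {
            fmt.Println("unable to cleanup:", err)
        }
    }()

    c.Add("release container name", func() error { return nil })
    c.Add("delete container from storage", func() error { return nil })
    return errors.New("simulated failure after both steps")
}

func main() { _ = createContainer() }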
@@ -476,14 +480,14 @@ func (s *Server) CreateContainer(ctx context.Context, req *pb.CreateContainerReq return nil, errors.Wrap(err, "setting container name and ID") } - cleanupFuncs := make([]func(), 0) + resourceCleaner := resourcestore.NewResourceCleaner() defer func() { // no error, no need to cleanup if retErr == nil || isContextError(retErr) { return } - for i := len(cleanupFuncs) - 1; i >= 0; i-- { - cleanupFuncs[i]() + if err := resourceCleaner.Cleanup(); err != nil { + log.Errorf(ctx, "Unable to cleanup: %v", err) } }() @@ -495,37 +499,50 @@ func (s *Server) CreateContainer(ctx context.Context, req *pb.CreateContainerReq return nil, errors.Wrapf(err, resourceErr.Error()) } - cleanupFuncs = append(cleanupFuncs, func() { - log.Infof(ctx, "createCtr: releasing container name %s", ctr.Name()) + description := fmt.Sprintf("createCtr: releasing container name %s", ctr.Name()) + resourceCleaner.Add(ctx, description, func() error { + log.Infof(ctx, description) s.ReleaseContainerName(ctr.Name()) + return nil }) newContainer, err := s.createSandboxContainer(ctx, ctr, sb) if err != nil { return nil, err } - cleanupFuncs = append(cleanupFuncs, func() { - log.Infof(ctx, "createCtr: deleting container %s from storage", ctr.ID()) + description = fmt.Sprintf("createCtr: deleting container %s from storage", ctr.ID()) + resourceCleaner.Add(ctx, description, func() error { + log.Infof(ctx, description) err2 := s.StorageRuntimeServer().DeleteContainer(ctr.ID()) if err2 != nil { log.Warnf(ctx, "Failed to cleanup container storage: %v", err2) } + return err2 }) s.addContainer(newContainer) - cleanupFuncs = append(cleanupFuncs, func() { - log.Infof(ctx, "createCtr: removing container %s", newContainer.ID()) + description = fmt.Sprintf("createCtr: removing container %s", newContainer.ID()) + resourceCleaner.Add(ctx, description, func() error { + log.Infof(ctx, description) s.removeContainer(newContainer) + return nil }) if err := s.CtrIDIndex().Add(ctr.ID()); err != nil { return nil, err } - cleanupFuncs = append(cleanupFuncs, func() { - log.Infof(ctx, "createCtr: deleting container ID %s from idIndex", ctr.ID()) - if err := s.CtrIDIndex().Delete(ctr.ID()); err != nil { - log.Warnf(ctx, "couldn't delete ctr id %s from idIndex", ctr.ID()) + description = fmt.Sprintf("createCtr: deleting container ID %s from idIndex", ctr.ID()) + resourceCleaner.Add(ctx, description, func() error { + log.Infof(ctx, description) + err := s.CtrIDIndex().Delete(ctr.ID()) + if err != nil { + // already deleted + if strings.Contains(err.Error(), noSuchID) { + return nil + } + log.Warnf(ctx, "Couldn't delete ctr id %s from idIndex", ctr.ID()) } + return err }) mappings, err := s.getSandboxIDMappings(sb) @@ -536,13 +553,16 @@ func (s *Server) CreateContainer(ctx context.Context, req *pb.CreateContainerReq if err := s.createContainerPlatform(newContainer, sb.CgroupParent(), mappings); err != nil { return nil, err } - cleanupFuncs = append(cleanupFuncs, func() { + description = fmt.Sprintf("createCtr: removing container ID %s from runtime", ctr.ID()) + resourceCleaner.Add(ctx, description, func() error { if retErr != nil { - log.Infof(ctx, "createCtr: removing container ID %s from runtime", ctr.ID()) + log.Infof(ctx, description) if err := s.Runtime().DeleteContainer(newContainer); err != nil { log.Warnf(ctx, "failed to delete container in runtime %s: %v", ctr.ID(), err) + return err } } + return nil }) if err := s.ContainerStateToDisk(newContainer); err != nil { @@ -550,7 +570,7 @@ func (s *Server) CreateContainer(ctx 
context.Context, req *pb.CreateContainerReq } if isContextError(ctx.Err()) { - if err := s.resourceStore.Put(ctr.Name(), newContainer, cleanupFuncs); err != nil { + if err := s.resourceStore.Put(ctr.Name(), newContainer, resourceCleaner); err != nil { log.Errorf(ctx, "createCtr: failed to save progress of container %s: %v", newContainer.ID(), err) } log.Infof(ctx, "createCtr: context was either canceled or the deadline was exceeded: %v", ctx.Err()) diff --git a/server/container_stop.go b/server/container_stop.go index bf3bf619f1b..15f2056cc59 100644 --- a/server/container_stop.go +++ b/server/container_stop.go @@ -15,7 +15,6 @@ import ( // StopContainer stops a running container with a grace period (i.e., timeout). func (s *Server) StopContainer(ctx context.Context, req *pb.StopContainerRequest) (*pb.StopContainerResponse, error) { log.Infof(ctx, "Stopping container: %s (timeout: %ds)", req.ContainerId, req.Timeout) - // save container description to print c, err := s.GetContainerFromShortID(req.ContainerId) if err != nil { return nil, status.Errorf(codes.NotFound, "could not find container %q: %v", req.ContainerId, err) @@ -33,8 +32,7 @@ func (s *Server) StopContainer(ctx context.Context, req *pb.StopContainerRequest } } - _, err = s.ContainerServer.ContainerStop(ctx, req.ContainerId, req.Timeout) - if err != nil { + if err := s.ContainerServer.StopContainer(ctx, c, req.Timeout); err != nil { return nil, err } diff --git a/server/image_remove.go b/server/image_remove.go index cb24484c648..2665d466fcc 100644 --- a/server/image_remove.go +++ b/server/image_remove.go @@ -11,21 +11,28 @@ import ( // RemoveImage removes the image. func (s *Server) RemoveImage(ctx context.Context, req *pb.RemoveImageRequest) (*pb.RemoveImageResponse, error) { - image := "" - img := req.GetImage() + imageRef := "" + img := req.Image if img != nil { - image = img.Image + imageRef = img.Image } - if image == "" { + if imageRef == "" { return nil, fmt.Errorf("no image specified") } + if err := s.removeImage(ctx, imageRef); err != nil { + return nil, err + } + return &pb.RemoveImageResponse{}, nil +} + +func (s *Server) removeImage(ctx context.Context, imageRef string) error { var deleted bool - images, err := s.StorageImageServer().ResolveNames(s.config.SystemContext, image) + images, err := s.StorageImageServer().ResolveNames(s.config.SystemContext, imageRef) if err != nil { if err == storage.ErrCannotParseImageID { - images = append(images, image) + images = append(images, imageRef) } else { - return nil, err + return err } } for _, img := range images { @@ -38,7 +45,7 @@ func (s *Server) RemoveImage(ctx context.Context, req *pb.RemoveImageRequest) (* break } if !deleted && err != nil { - return nil, err + return err } - return &pb.RemoveImageResponse{}, nil + return nil } diff --git a/server/sandbox_network.go b/server/sandbox_network.go index 888cfa8fc02..eadaa28b1c6 100644 --- a/server/sandbox_network.go +++ b/server/sandbox_network.go @@ -3,6 +3,7 @@ package server import ( "context" "fmt" + "math" "time" cnitypes "github.com/containernetworking/cni/pkg/types" @@ -11,7 +12,9 @@ import ( "github.com/cri-o/cri-o/internal/lib/sandbox" "github.com/cri-o/cri-o/internal/log" "github.com/cri-o/cri-o/server/metrics" + "github.com/cri-o/ocicni/pkg/ocicni" "github.com/pkg/errors" + "k8s.io/apimachinery/pkg/api/resource" utilnet "k8s.io/utils/net" ) @@ -181,3 +184,51 @@ func (s *Server) networkStop(ctx context.Context, sb *sandbox.Sandbox) error { return sb.SetNetworkStopped(true) } + +func (s *Server) newPodNetwork(sb 
*sandbox.Sandbox) (ocicni.PodNetwork, error) { + var egress, ingress int64 = 0, 0 + + if val, ok := sb.Annotations()["kubernetes.io/egress-bandwidth"]; ok { + egressQ, err := resource.ParseQuantity(val) + if err != nil { + return ocicni.PodNetwork{}, fmt.Errorf("failed to parse egress bandwidth: %v", err) + } else if iegress, isok := egressQ.AsInt64(); isok { + egress = iegress + } + } + if val, ok := sb.Annotations()["kubernetes.io/ingress-bandwidth"]; ok { + ingressQ, err := resource.ParseQuantity(val) + if err != nil { + return ocicni.PodNetwork{}, fmt.Errorf("failed to parse ingress bandwidth: %v", err) + } else if iingress, isok := ingressQ.AsInt64(); isok { + ingress = iingress + } + } + + var bwConfig *ocicni.BandwidthConfig + + if ingress > 0 || egress > 0 { + bwConfig = &ocicni.BandwidthConfig{} + if ingress > 0 { + bwConfig.IngressRate = uint64(ingress) + bwConfig.IngressBurst = math.MaxUint32 * 8 // 4GB burst limit + } + if egress > 0 { + bwConfig.EgressRate = uint64(egress) + bwConfig.EgressBurst = math.MaxUint32 * 8 // 4GB burst limit + } + } + + network := s.config.CNIPlugin().GetDefaultNetworkName() + return ocicni.PodNetwork{ + Name: sb.KubeName(), + Namespace: sb.Namespace(), + UID: sb.Metadata().UID, + Networks: []ocicni.NetAttachment{}, + ID: sb.ID(), + NetNS: sb.NetNsPath(), + RuntimeConfig: map[string]ocicni.RuntimeConfig{ + network: {Bandwidth: bwConfig}, + }, + }, nil +} diff --git a/server/sandbox_remove.go b/server/sandbox_remove.go index 69a497f0b2a..8e13c832b3d 100644 --- a/server/sandbox_remove.go +++ b/server/sandbox_remove.go @@ -7,7 +7,6 @@ import ( "github.com/cri-o/cri-o/internal/lib/sandbox" "github.com/cri-o/cri-o/internal/log" oci "github.com/cri-o/cri-o/internal/oci" - pkgstorage "github.com/cri-o/cri-o/internal/storage" "github.com/pkg/errors" "golang.org/x/net/context" pb "k8s.io/cri-api/pkg/apis/runtime/v1alpha2" @@ -32,88 +31,83 @@ func (s *Server) RemovePodSandbox(ctx context.Context, req *pb.RemovePodSandboxR log.Warnf(ctx, "could not get sandbox %s, it's probably been removed already: %v", req.PodSandboxId, err) return &pb.RemovePodSandboxResponse{}, nil } + if err := s.removePodSandbox(ctx, sb); err != nil { + return nil, err + } + return &pb.RemovePodSandboxResponse{}, nil +} - podInfraContainer := sb.InfraContainer() +func (s *Server) removePodSandbox(ctx context.Context, sb *sandbox.Sandbox) error { containers := sb.Containers().List() - containers = append(containers, podInfraContainer) // Delete all the containers in the sandbox for _, c := range containers { - if !sb.Stopped() { - cState := c.State() - if cState.Status == oci.ContainerStateCreated || cState.Status == oci.ContainerStateRunning { - timeout := int64(10) - if err := s.Runtime().StopContainer(ctx, c, timeout); err != nil { - // Assume container is already stopped - log.Warnf(ctx, "failed to stop container %s: %v", c.Name(), err) - } - if err := s.Runtime().WaitContainerStateStopped(ctx, c); err != nil { - return nil, fmt.Errorf("failed to get container 'stopped' status %s in pod sandbox %s: %v", c.Name(), sb.ID(), err) - } - } - } - - if err := s.Runtime().DeleteContainer(c); err != nil { - return nil, fmt.Errorf("failed to delete container %s in pod sandbox %s: %v", c.Name(), sb.ID(), err) - } - - if c.ID() == podInfraContainer.ID() { - continue - } - - c.CleanupConmonCgroup() - - if err := s.StorageRuntimeServer().StopContainer(c.ID()); err != nil && err != storage.ErrContainerUnknown { - // assume container already umounted - log.Warnf(ctx, "failed to stop container %s in 
pod sandbox %s: %v", c.Name(), sb.ID(), err) - } - if err := s.StorageRuntimeServer().DeleteContainer(c.ID()); err != nil && err != storage.ErrContainerUnknown { - return nil, fmt.Errorf("failed to delete container %s in pod sandbox %s: %v", c.Name(), sb.ID(), err) - } - - s.ReleaseContainerName(c.Name()) - s.removeContainer(c) - if err := s.CtrIDIndex().Delete(c.ID()); err != nil { - return nil, fmt.Errorf("failed to delete container %s in pod sandbox %s from index: %v", c.Name(), sb.ID(), err) + if err := s.removeContainerInPod(ctx, sb, c); err != nil { + return err } } - s.removeInfraContainer(podInfraContainer) - podInfraContainer.CleanupConmonCgroup() - - // StorageRuntimeServer won't know about this container, as it wasn't created in storage - if !podInfraContainer.Spoofed() { - if err := s.StorageRuntimeServer().StopContainer(sb.ID()); err != nil && !errors.Is(err, storage.ErrContainerUnknown) { - log.Warnf(ctx, "failed to stop sandbox container in pod sandbox %s: %v", sb.ID(), err) - } + if err := sb.UnmountShm(); err != nil { + return errors.Wrap(err, "unable to unmount SHM") } - if err := sb.UnmountShm(); err != nil { - return nil, errors.Wrap(err, "unable to unmount SHM") + s.removeInfraContainer(sb.InfraContainer()) + if err := s.removeContainerInPod(ctx, sb, sb.InfraContainer()); err != nil { + return err } - if err := s.StorageRuntimeServer().RemovePodSandbox(sb.ID()); err != nil && err != pkgstorage.ErrInvalidSandboxID { - return nil, fmt.Errorf("failed to remove pod sandbox %s: %v", sb.ID(), err) + // Cleanup network resources for this pod + if err := s.networkStop(ctx, sb); err != nil { + return errors.Wrap(err, "stop pod network") } + if s.config.ManageNSLifecycle { if err := sb.RemoveManagedNamespaces(); err != nil { - return nil, errors.Wrap(err, "unable to remove managed namespaces") + return errors.Wrap(err, "unable to remove managed namespaces") } } - s.ReleaseContainerName(podInfraContainer.Name()) - if err := s.CtrIDIndex().Delete(podInfraContainer.ID()); err != nil { - return nil, fmt.Errorf("failed to delete infra container %s in pod sandbox %s from index: %v", podInfraContainer.ID(), sb.ID(), err) - } - s.ReleasePodName(sb.Name()) if err := s.removeSandbox(sb.ID()); err != nil { log.Warnf(ctx, "failed to remove sandbox: %v", err) } if err := s.PodIDIndex().Delete(sb.ID()); err != nil { - return nil, fmt.Errorf("failed to delete pod sandbox %s from index: %v", sb.ID(), err) + return fmt.Errorf("failed to delete pod sandbox %s from index: %v", sb.ID(), err) } log.Infof(ctx, "Removed pod sandbox: %s", sb.ID()) - return &pb.RemovePodSandboxResponse{}, nil + return nil +} + +func (s *Server) removeContainerInPod(ctx context.Context, sb *sandbox.Sandbox, c *oci.Container) error { + if !sb.Stopped() { + if err := s.ContainerServer.StopContainer(ctx, c, int64(10)); err != nil { + return errors.Errorf("failed to stop container for removal") + } + } + + if err := s.Runtime().DeleteContainer(c); err != nil { + return fmt.Errorf("failed to delete container %s in pod sandbox %s: %v", c.Name(), sb.ID(), err) + } + + c.CleanupConmonCgroup() + + if !c.Spoofed() { + if err := s.StorageRuntimeServer().StopContainer(c.ID()); err != nil && err != storage.ErrContainerUnknown { + // assume container already umounted + log.Warnf(ctx, "failed to stop container %s in pod sandbox %s: %v", c.Name(), sb.ID(), err) + } + if err := s.StorageRuntimeServer().DeleteContainer(c.ID()); err != nil && err != storage.ErrContainerUnknown { + return fmt.Errorf("failed to delete container %s in pod 
sandbox %s: %v", c.Name(), sb.ID(), err) + } + } + + s.ReleaseContainerName(c.Name()) + s.removeContainer(c) + if err := s.CtrIDIndex().Delete(c.ID()); err != nil { + return fmt.Errorf("failed to delete container %s in pod sandbox %s from index: %v", c.Name(), sb.ID(), err) + } + sb.RemoveContainer(c) + + return nil } diff --git a/server/sandbox_run_linux.go b/server/sandbox_run_linux.go index 41b2284ca8d..4ea924e74e9 100644 --- a/server/sandbox_run_linux.go +++ b/server/sandbox_run_linux.go @@ -24,6 +24,7 @@ import ( libsandbox "github.com/cri-o/cri-o/internal/lib/sandbox" "github.com/cri-o/cri-o/internal/log" oci "github.com/cri-o/cri-o/internal/oci" + "github.com/cri-o/cri-o/internal/resourcestore" ann "github.com/cri-o/cri-o/pkg/annotations" libconfig "github.com/cri-o/cri-o/pkg/config" "github.com/cri-o/cri-o/pkg/sandbox" @@ -308,14 +309,14 @@ func (s *Server) runPodSandbox(ctx context.Context, req *pb.RunPodSandboxRequest return nil, errors.Wrap(err, "setting pod sandbox name and id") } - cleanupFuncs := make([]func(), 0) + resourceCleaner := resourcestore.NewResourceCleaner() defer func() { // no error, no need to cleanup if retErr == nil || isContextError(retErr) { return } - for i := len(cleanupFuncs) - 1; i >= 0; i-- { - cleanupFuncs[i]() + if err := resourceCleaner.Cleanup(); err != nil { + log.Errorf(ctx, "Unable to cleanup: %v", err) } }() @@ -327,9 +328,11 @@ func (s *Server) runPodSandbox(ctx context.Context, req *pb.RunPodSandboxRequest return nil, errors.Wrapf(err, resourceErr.Error()) } - cleanupFuncs = append(cleanupFuncs, func() { - log.Infof(ctx, "runSandbox: releasing pod sandbox name: %s", sbox.Name()) + description := fmt.Sprintf("runSandbox: releasing pod sandbox name: %s", sbox.Name()) + resourceCleaner.Add(ctx, description, func() error { + log.Infof(ctx, description) s.ReleasePodName(sbox.Name()) + return nil }) kubeAnnotations := sbox.Config().GetAnnotations() @@ -351,9 +354,11 @@ func (s *Server) runPodSandbox(ctx context.Context, req *pb.RunPodSandboxRequest if err != nil { return nil, err } - cleanupFuncs = append(cleanupFuncs, func() { - log.Infof(ctx, "runSandbox: releasing container name: %s", containerName) + description = fmt.Sprintf("runSandbox: releasing container name: %s", containerName) + resourceCleaner.Add(ctx, description, func() error { + log.Infof(ctx, description) s.ReleaseContainerName(containerName) + return nil }) var labelOptions []string @@ -389,11 +394,14 @@ func (s *Server) runPodSandbox(ctx context.Context, req *pb.RunPodSandboxRequest if err != nil { return nil, fmt.Errorf("error creating pod sandbox with name %q: %v", sbox.Name(), err) } - cleanupFuncs = append(cleanupFuncs, func() { - log.Infof(ctx, "runSandbox: removing pod sandbox from storage: %s", sbox.ID()) - if err2 := s.StorageRuntimeServer().RemovePodSandbox(sbox.ID()); err2 != nil { + description = fmt.Sprintf("runSandbox: removing pod sandbox from storage: %s", sbox.ID()) + resourceCleaner.Add(ctx, description, func() error { + log.Infof(ctx, description) + err2 := s.StorageRuntimeServer().DeleteContainer(sbox.ID()) + if err2 != nil { log.Warnf(ctx, "could not cleanup pod sandbox %q: %v", sbox.ID(), err2) } + return err2 }) // set log directory @@ -548,11 +556,14 @@ func (s *Server) runPodSandbox(ctx context.Context, req *pb.RunPodSandboxRequest return nil, err } pathsToChown = append(pathsToChown, shmPath) - cleanupFuncs = append(cleanupFuncs, func() { - log.Infof(ctx, "runSandbox: unmounting shmPath for sandbox %s", sbox.ID()) - if err2 := unix.Unmount(shmPath, 
unix.MNT_DETACH); err2 != nil { + description = fmt.Sprintf("runSandbox: unmounting shmPath for sandbox %s", sbox.ID()) + resourceCleaner.Add(ctx, description, func() error { + log.Infof(ctx, description) + err2 := unix.Unmount(shmPath, unix.MNT_DETACH) + if err2 != nil { log.Warnf(ctx, "failed to unmount shm for sandbox: %v", err2) } + return err2 }) } @@ -574,11 +585,18 @@ func (s *Server) runPodSandbox(ctx context.Context, req *pb.RunPodSandboxRequest return nil, err } - cleanupFuncs = append(cleanupFuncs, func() { - log.Infof(ctx, "runSandbox: deleting container ID from idIndex for sandbox %s", sbox.ID()) - if err2 := s.CtrIDIndex().Delete(sbox.ID()); err2 != nil { - log.Warnf(ctx, "could not delete ctr id %s from idIndex", sbox.ID()) + description = fmt.Sprintf("runSandbox: deleting container ID from idIndex for sandbox %s", sbox.ID()) + resourceCleaner.Add(ctx, description, func() error { + log.Infof(ctx, description) + err2 := s.CtrIDIndex().Delete(sbox.ID()) + if err2 != nil { + // already deleted + if strings.Contains(err2.Error(), noSuchID) { + return nil + } + log.Warnf(ctx, "Could not delete ctr id %s from idIndex", sbox.ID()) } + return err2 }) // set log path inside log directory @@ -671,22 +689,28 @@ func (s *Server) runPodSandbox(ctx context.Context, req *pb.RunPodSandboxRequest if err := s.addSandbox(sb); err != nil { return nil, err } - cleanupFuncs = append(cleanupFuncs, func() { - log.Infof(ctx, "runSandbox: removing pod sandbox %s", sbox.ID()) - if err := s.removeSandbox(sbox.ID()); err != nil { + description = fmt.Sprintf("runSandbox: removing pod sandbox %s", sbox.ID()) + resourceCleaner.Add(ctx, description, func() error { + log.Infof(ctx, description) + err := s.removeSandbox(sbox.ID()) + if err != nil { log.Warnf(ctx, "could not remove pod sandbox: %v", err) } + return err }) if err := s.PodIDIndex().Add(sbox.ID()); err != nil { return nil, err } - cleanupFuncs = append(cleanupFuncs, func() { - log.Infof(ctx, "runSandbox: deleting pod ID %s from idIndex", sbox.ID()) - if err := s.PodIDIndex().Delete(sbox.ID()); err != nil { + description = fmt.Sprintf("runSandbox: deleting pod ID %s from idIndex", sbox.ID()) + resourceCleaner.Add(ctx, description, func() error { + log.Infof(ctx, description) + err := s.PodIDIndex().Delete(sbox.ID()) + if err != nil { log.Warnf(ctx, "could not delete pod id %s from idIndex", sbox.ID()) } + return err }) for k, v := range kubeAnnotations { @@ -702,15 +726,19 @@ func (s *Server) runPodSandbox(ctx context.Context, req *pb.RunPodSandboxRequest // set up namespaces nsCleanupFuncs, err := s.configureGeneratorForSandboxNamespaces(hostNetwork, hostIPC, hostPID, sandboxIDMappings, sysctls, sb, g) // We want to cleanup after ourselves if we are managing any namespaces and fail in this function. 
- cleanupFuncs = append(cleanupFuncs, func() { - log.Infof(ctx, "runSandbox: cleaning up namespaces after failing to run sandbox %s", sbox.ID()) + nsCleanupDescription := fmt.Sprintf("runSandbox: cleaning up namespaces after failing to run sandbox %s", sbox.ID()) + nsCleanupFunc := func() error { + log.Infof(ctx, nsCleanupDescription) for idx := range nsCleanupFuncs { if err2 := nsCleanupFuncs[idx](); err2 != nil { log.Infof(ctx, "runSandbox: failed to cleanup namespace: %s", err2.Error()) + return err2 } } - }) + return nil + } if err != nil { + resourceCleaner.Add(ctx, nsCleanupDescription, nsCleanupFunc) return nil, err } @@ -721,14 +749,21 @@ func (s *Server) runPodSandbox(ctx context.Context, req *pb.RunPodSandboxRequest if s.config.ManageNSLifecycle { ips, result, err = s.networkStart(ctx, sb) if err != nil { + resourceCleaner.Add(ctx, nsCleanupDescription, nsCleanupFunc) return nil, err } - cleanupFuncs = append(cleanupFuncs, func() { - log.Infof(ctx, "runSandbox: in manageNSLifecycle, stopping network for sandbox %s", sb.ID()) + description = fmt.Sprintf("runSandbox: stopping network for sandbox %s", sb.ID()) + resourceCleaner.Add(ctx, description, func() error { + log.Infof(ctx, description) // use a new context to prevent an expired context from preventing a stop - if err2 := s.networkStop(context.Background(), sb); err2 != nil { - log.Errorf(ctx, "error stopping network on cleanup: %v", err2) + if err := s.networkStop(context.Background(), sb); err != nil { + log.Errorf(ctx, "error stopping network on cleanup: %v", err) + return err } + + // Now that we've succeeded in stopping the network, clean up namespaces + log.Infof(ctx, nsCleanupDescription) + return nsCleanupFunc() }) if result != nil { resultCurrent, err := current.NewResultFromResult(result) @@ -760,11 +795,14 @@ func (s *Server) runPodSandbox(ctx context.Context, req *pb.RunPodSandboxRequest if err != nil { return nil, fmt.Errorf("failed to mount container %s in pod sandbox %s(%s): %v", containerName, sb.Name(), sbox.ID(), err) } - cleanupFuncs = append(cleanupFuncs, func() { - log.Infof(ctx, "runSandbox: stopping storage container for sandbox %s", sbox.ID()) - if err2 := s.StorageRuntimeServer().StopContainer(sbox.ID()); err2 != nil { + description = fmt.Sprintf("runSandbox: stopping storage container for sandbox %s", sbox.ID()) + resourceCleaner.Add(ctx, description, func() error { + log.Infof(ctx, description) + err2 := s.StorageRuntimeServer().StopContainer(sbox.ID()) + if err2 != nil { log.Warnf(ctx, "could not stop storage container: %v: %v", sbox.ID(), err2) } + return err2 }) g.AddAnnotation(annotations.MountPoint, mountPoint) @@ -901,9 +939,11 @@ func (s *Server) runPodSandbox(ctx context.Context, req *pb.RunPodSandboxRequest } s.addInfraContainer(container) - cleanupFuncs = append(cleanupFuncs, func() { - log.Infof(ctx, "runSandbox: removing infra container %s", container.ID()) + description = fmt.Sprintf("runSandbox: removing infra container %s", container.ID()) + resourceCleaner.Add(ctx, description, func() error { + log.Infof(ctx, description) s.removeInfraContainer(container) + return nil }) if sandboxIDMappings != nil { @@ -926,23 +966,25 @@ func (s *Server) runPodSandbox(ctx context.Context, req *pb.RunPodSandboxRequest return nil, err } - cleanupFuncs = append(cleanupFuncs, func() { + description = fmt.Sprintf("runSandbox: stopping container %s", container.ID()) + resourceCleaner.Add(ctx, description, func() error { // Clean-up steps from RemovePodSanbox - log.Infof(ctx, "runSandbox: stopping container %s",
container.ID()) - if err2 := s.Runtime().StopContainer(ctx, container, int64(10)); err2 != nil { - log.Warnf(ctx, "failed to stop container %s: %v", container.Name(), err2) - } - if err2 := s.Runtime().WaitContainerStateStopped(ctx, container); err2 != nil { - log.Warnf(ctx, "failed to get container 'stopped' status %s in pod sandbox %s: %v", container.Name(), sb.ID(), err2) + log.Infof(ctx, description) + if err := s.ContainerServer.StopContainer(ctx, container, int64(10)); err != nil { + return errors.Wrap(err, "failed to stop container for removal") } + log.Infof(ctx, "runSandbox: deleting container %s", container.ID()) if err2 := s.Runtime().DeleteContainer(container); err2 != nil { log.Warnf(ctx, "failed to delete container %s in pod sandbox %s: %v", container.Name(), sb.ID(), err2) + return err2 } log.Infof(ctx, "runSandbox: writing container %s state to disk", container.ID()) if err2 := s.ContainerStateToDisk(container); err2 != nil { log.Warnf(ctx, "failed to write container state %s in pod sandbox %s: %v", container.Name(), sb.ID(), err2) + return err2 } + return nil }) if err := s.ContainerStateToDisk(container); err != nil { @@ -954,12 +996,14 @@ func (s *Server) runPodSandbox(ctx context.Context, req *pb.RunPodSandboxRequest if err != nil { return nil, err } - cleanupFuncs = append(cleanupFuncs, func() { - log.Infof(ctx, "runSandbox: stopping network for sandbox %s when not manageNSLifecycle", sb.ID()) - // use a new context to prevent an expired context from preventing a stop - if err2 := s.networkStop(context.Background(), sb); err2 != nil { + description = fmt.Sprintf("runSandbox: stopping network for sandbox %s when not managing namespace lifecycle", sb.ID()) + resourceCleaner.Add(ctx, description, func() error { + log.Infof(ctx, description) + err2 := s.networkStop(context.Background(), sb) + if err2 != nil { log.Errorf(ctx, "error stopping network on cleanup: %v", err2) } + return err2 }) } @@ -969,7 +1013,7 @@ func (s *Server) runPodSandbox(ctx context.Context, req *pb.RunPodSandboxRequest sb.AddIPs(ips) if isContextError(ctx.Err()) { - if err := s.resourceStore.Put(sbox.Name(), sb, cleanupFuncs); err != nil { + if err := s.resourceStore.Put(sbox.Name(), sb, resourceCleaner); err != nil { log.Errorf(ctx, "runSandbox: failed to save progress of sandbox %s: %v", sbox.ID(), err) } log.Infof(ctx, "runSandbox: context was either canceled or the deadline was exceeded: %v", ctx.Err()) diff --git a/server/sandbox_run_test.go b/server/sandbox_run_test.go index eacf20d0a70..cdc5e6d4533 100644 --- a/server/sandbox_run_test.go +++ b/server/sandbox_run_test.go @@ -50,7 +50,7 @@ var _ = t.Describe("RunPodSandbox", func() { Return("", nil), runtimeServerMock.EXPECT().StopContainer(gomock.Any()). Return(nil), - runtimeServerMock.EXPECT().RemovePodSandbox(gomock.Any()). + runtimeServerMock.EXPECT().DeleteContainer(gomock.Any()). Return(nil), ) @@ -125,7 +125,7 @@ var _ = t.Describe("RunPodSandbox", func() { gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()). Return(storage.ContainerInfo{}, nil), - runtimeServerMock.EXPECT().RemovePodSandbox(gomock.Any()). + runtimeServerMock.EXPECT().DeleteContainer(gomock.Any()).
Return(nil), ) diff --git a/server/sandbox_stop.go b/server/sandbox_stop.go index 8c22bcbe47f..406d20e00b0 100644 --- a/server/sandbox_stop.go +++ b/server/sandbox_stop.go @@ -1,6 +1,9 @@ package server import ( + "fmt" + + "github.com/cri-o/cri-o/internal/lib/sandbox" "github.com/cri-o/cri-o/internal/log" "golang.org/x/net/context" pb "k8s.io/cri-api/pkg/apis/runtime/v1alpha2" @@ -10,17 +13,35 @@ import ( // sandbox, they should be force terminated. func (s *Server) StopPodSandbox(ctx context.Context, req *pb.StopPodSandboxRequest) (*pb.StopPodSandboxResponse, error) { // platform dependent call - return s.stopPodSandbox(ctx, req) + log.Infof(ctx, "Stopping pod sandbox: %s", req.PodSandboxId) + sb, err := s.getPodSandboxFromRequest(req.PodSandboxId) + if err != nil { + if err == sandbox.ErrIDEmpty { + return nil, err + } + if err == errSandboxNotCreated { + return nil, fmt.Errorf("StopPodSandbox failed as the sandbox is not created: %s", sb.ID()) + } + + // If the sandbox isn't found we just return an empty response to adhere + // the CRI interface which expects to not error out in not found + // cases. + + log.Warnf(ctx, "could not get sandbox %s, it's probably been stopped already: %v", req.PodSandboxId, err) + log.Debugf(ctx, "StopPodSandboxResponse %s", req.PodSandboxId) + return &pb.StopPodSandboxResponse{}, nil + } + if err := s.stopPodSandbox(ctx, sb); err != nil { + return nil, err + } + return &pb.StopPodSandboxResponse{}, nil } // stopAllPodSandboxes removes all pod sandboxes func (s *Server) stopAllPodSandboxes(ctx context.Context) { log.Debugf(ctx, "stopAllPodSandboxes") for _, sb := range s.ContainerServer.ListSandboxes() { - pod := &pb.StopPodSandboxRequest{ - PodSandboxId: sb.ID(), - } - if _, err := s.StopPodSandbox(ctx, pod); err != nil { + if err := s.stopPodSandbox(ctx, sb); err != nil { log.Warnf(ctx, "could not StopPodSandbox %s: %v", sb.ID(), err) } } diff --git a/server/sandbox_stop_linux.go b/server/sandbox_stop_linux.go index 924db3c3a2a..cc6af408bd1 100644 --- a/server/sandbox_stop_linux.go +++ b/server/sandbox_stop_linux.go @@ -13,47 +13,27 @@ import ( "github.com/pkg/errors" "golang.org/x/net/context" "golang.org/x/sync/errgroup" - pb "k8s.io/cri-api/pkg/apis/runtime/v1alpha2" ) -func (s *Server) stopPodSandbox(ctx context.Context, req *pb.StopPodSandboxRequest) (*pb.StopPodSandboxResponse, error) { - log.Infof(ctx, "Stopping pod sandbox: %s", req.GetPodSandboxId()) - sb, err := s.getPodSandboxFromRequest(req.PodSandboxId) - resp := &pb.StopPodSandboxResponse{} - if err != nil { - if err == sandbox.ErrIDEmpty { - return nil, err - } - if err == errSandboxNotCreated { - return nil, fmt.Errorf("StopPodSandbox failed as the sandbox is not created: %s", sb.ID()) - } - - // If the sandbox isn't found we just return an empty response to adhere - // the CRI interface which expects to not error out in not found - // cases. - - log.Warnf(ctx, "could not get sandbox %s, it's probably been stopped already: %v", req.PodSandboxId, err) - log.Debugf(ctx, "StopPodSandboxResponse %s: %+v", req.PodSandboxId, resp) - return resp, nil - } +func (s *Server) stopPodSandbox(ctx context.Context, sb *sandbox.Sandbox) error { stopMutex := sb.StopMutex() stopMutex.Lock() defer stopMutex.Unlock() // Clean up sandbox networking and close its network namespace. 
if err := s.networkStop(ctx, sb); err != nil { - return nil, err + return err + } + + if sb.Stopped() { + log.Infof(ctx, "Stopped pod sandbox (already stopped): %s", sb.ID()) + return nil } // Get high-performance runtime hook to trigger preStop step for each container hooks, err := runtimehandlerhooks.GetRuntimeHandlerHooks(ctx, &s.config, sb.RuntimeHandler(), s.Runtime()) if err != nil { - return nil, fmt.Errorf("failed to get runtime handler %q hooks", sb.RuntimeHandler()) - } - - if sb.Stopped() { - log.Infof(ctx, "Stopped pod sandbox (already stopped): %s", sb.ID()) - return resp, nil + return fmt.Errorf("failed to get runtime handler %q hooks", sb.RuntimeHandler()) } podInfraContainer := sb.InfraContainer() @@ -95,7 +75,7 @@ func (s *Server) stopPodSandbox(ctx context.Context, req *pb.StopPodSandboxReque } } if err := waitGroup.Wait(); err != nil { - return nil, err + return err } } @@ -103,13 +83,13 @@ func (s *Server) stopPodSandbox(ctx context.Context, req *pb.StopPodSandboxReque podInfraStatus := podInfraContainer.State() if podInfraStatus.Status != oci.ContainerStateStopped { if err := s.StopContainerAndWait(ctx, podInfraContainer, int64(10)); err != nil { - return nil, fmt.Errorf("failed to stop infra container for pod sandbox %s: %v", sb.ID(), err) + return fmt.Errorf("failed to stop infra container for pod sandbox %s: %v", sb.ID(), err) } } } if err := sb.UnmountShm(); err != nil { - return nil, err + return err } if err := s.StorageRuntimeServer().StopContainer(sb.ID()); err != nil && !errors.Is(err, storage.ErrContainerUnknown) { @@ -122,5 +102,5 @@ func (s *Server) stopPodSandbox(ctx context.Context, req *pb.StopPodSandboxReque log.Infof(ctx, "Stopped pod sandbox: %s", sb.ID()) sb.SetStopped(true) - return resp, nil + return nil } diff --git a/server/server.go b/server/server.go index 93c4f4f17db..394a6694d0d 100644 --- a/server/server.go +++ b/server/server.go @@ -17,6 +17,7 @@ import ( "time" "github.com/containers/image/v5/types" + storageTypes "github.com/containers/storage" "github.com/containers/storage/pkg/idtools" "github.com/cri-o/cri-o/internal/hostport" "github.com/cri-o/cri-o/internal/lib" @@ -25,6 +26,7 @@ import ( "github.com/cri-o/cri-o/internal/resourcestore" "github.com/cri-o/cri-o/internal/runtimehandlerhooks" "github.com/cri-o/cri-o/internal/storage" + "github.com/cri-o/cri-o/internal/version" libconfig "github.com/cri-o/cri-o/pkg/config" "github.com/cri-o/cri-o/server/metrics" "github.com/fsnotify/fsnotify" @@ -156,7 +158,12 @@ func (s *Server) getPortForward(req *pb.PortForwardRequest) (*pb.PortForwardResp return s.stream.streamServer.GetPortForward(req) } -func (s *Server) restore(ctx context.Context) { +// restore attempts to restore the sandboxes and containers. +// For every sandbox it fails to restore, it starts a cleanup routine attempting to call CNI DEL +// For every container it fails to restore, it returns that containers image, so that +// it can be cleaned up (if we're using internal_wipe). 
+func (s *Server) restore(ctx context.Context) []string { + containersAndTheirImages := map[string]string{} containers, err := s.Store().Containers() if err != nil && !errors.Is(err, os.ErrNotExist) { logrus.Warnf("could not read containers and sandboxes: %v", err) @@ -164,7 +171,7 @@ func (s *Server) restore(ctx context.Context) { pods := map[string]*storage.RuntimeContainerMetadata{} podContainers := map[string]*storage.RuntimeContainerMetadata{} names := map[string][]string{} - deletedPods := map[string]bool{} + deletedPods := map[string]*sandbox.Sandbox{} for i := range containers { metadata, err2 := s.StorageRuntimeServer().GetContainerMetadata(containers[i].ID) if err2 != nil { @@ -180,18 +187,20 @@ func (s *Server) restore(ctx context.Context) { pods[containers[i].ID] = &metadata } else { podContainers[containers[i].ID] = &metadata + containersAndTheirImages[containers[i].ID] = containers[i].ImageID } } // Go through all the pods and check if it can be restored. If an error occurs, delete the pod and any containers // associated with it. Release the pod and container names as well. - for sbID, metadata := range pods { - if err = s.LoadSandbox(sbID); err == nil { + for sbID := range pods { + sb, err := s.LoadSandbox(sbID) + if err == nil { continue } - logrus.Warnf("could not restore sandbox %s container %s: %v", metadata.PodID, sbID, err) + logrus.Warnf("could not restore sandbox %s; deleting it and containers underneath it: %v", sbID, err) for _, n := range names[sbID] { - if err := s.Store().DeleteContainer(n); err != nil { + if err := s.Store().DeleteContainer(n); err != nil && err != storageTypes.ErrNotAContainer { logrus.Warnf("unable to delete container %s: %v", n, err) } // Release the infra container name and the pod name for future use @@ -201,59 +210,85 @@ func (s *Server) restore(ctx context.Context) { s.ReleasePodName(n) } } - // Go through the containers and delete any container that was under the deleted pod - logrus.Warnf("deleting all containers under sandbox %s since it could not be restored", sbID) for k, v := range podContainers { - if v.PodID == sbID { - for _, n := range names[k] { - if err := s.Store().DeleteContainer(n); err != nil { - logrus.Warnf("unable to delete container %s: %v", n, err) - } - // Release the container name for future use - s.ReleaseContainerName(n) + if v.PodID != sbID { + continue + } + for _, n := range names[k] { + if err := s.Store().DeleteContainer(n); err != nil && err != storageTypes.ErrNotAContainer { + logrus.Warnf("unable to delete container %s: %v", n, err) } + // Release the container name for future use + s.ReleaseContainerName(n) } + // Remove the container from the list of podContainers, or else we'll retry the delete later, + // causing a useless debug message. + delete(podContainers, k) + } + // Add the pod id to the list of deletedPods, to be able to call CNI DEL on the sandbox network. + // Unfortunately, if we weren't able to restore a sandbox, then there's little that can be done + if sb != nil { + deletedPods[sbID] = sb } - // Add the pod id to the list of deletedPods so we don't try to restore IPs for it later on - deletedPods[sbID] = true } // Go through all the containers and check if it can be restored. If an error occurs, delete the conainer and // release the name associated with you. 
for containerID := range podContainers { - if err := s.LoadContainer(containerID); err != nil { - // containers of other runtimes should not be deleted - if err == lib.ErrIsNonCrioContainer { - logrus.Infof("ignoring non CRI-O container %s", containerID) - } else { - logrus.Warnf("could not restore container %s: %v", containerID, err) - for _, n := range names[containerID] { - if err := s.Store().DeleteContainer(n); err != nil { - logrus.Warnf("unable to delete container %s: %v", n, err) - } - // Release the container name - s.ReleaseContainerName(n) - } + err := s.LoadContainer(containerID) + if err == nil || err == lib.ErrIsNonCrioContainer { + delete(containersAndTheirImages, containerID) + continue + } + logrus.Warnf("Could not restore container %s: %v", containerID, err) + for _, n := range names[containerID] { + if err := s.Store().DeleteContainer(n); err != nil && err != storageTypes.ErrNotAContainer { + logrus.Warnf("Unable to delete container %s: %v", n, err) } + // Release the container name + s.ReleaseContainerName(n) } } - // Restore sandbox IPs - for _, sb := range s.ListSandboxes() { - // Clean up networking if pod couldn't be restored and was deleted - if ok := deletedPods[sb.ID()]; ok { - if err := s.networkStop(ctx, sb); err != nil { - logrus.Warnf("error stopping network on restore cleanup %v:", err) + // Cleanup the deletedPods in the networking plugin + wipeResourceCleaner := resourcestore.NewResourceCleaner() + for _, sb := range deletedPods { + sb := sb + cleanupFunc := func() error { + err := s.networkStop(context.Background(), sb) + if err == nil { + logrus.Infof("Successfully cleaned up network for pod %s", sb.ID()) } - continue + return err + } + wipeResourceCleaner.Add(ctx, "cleanup sandbox network", cleanupFunc) + } + + // If any failed to be deleted, the networking plugin is likely not ready. + // The cleanup should be retried until it succeeds. + go func() { + if err := wipeResourceCleaner.Cleanup(); err != nil { + logrus.Errorf("Cleanup during server startup failed: %v", err) } + }() + + // Restore sandbox IPs + for _, sb := range s.ListSandboxes() { ips, err := s.getSandboxIPs(sb) if err != nil { - logrus.Warnf("could not restore sandbox IP for %v: %v", sb.ID(), err) + logrus.Warnf("Could not restore sandbox IP for %v: %v", sb.ID(), err) continue } sb.AddIPs(ips) } + + // Return a slice of images to remove, if internal_wipe is set. + imagesOfDeletedContainers := []string{} + for _, image := range containersAndTheirImages { + imagesOfDeletedContainers = append(imagesOfDeletedContainers, image) + } + + return imagesOfDeletedContainers } // cleanupSandboxesOnShutdown Remove all running Sandboxes on system shutdown @@ -372,8 +407,9 @@ func New( return nil, err } - s.restore(ctx) + deletedImages := s.restore(ctx) s.cleanupSandboxesOnShutdown(ctx) + s.wipeIfAppropriate(ctx, deletedImages) var bindAddressStr string bindAddress := net.ParseIP(config.StreamAddress) @@ -443,6 +479,31 @@ func New( return s, nil } +// wipeIfAppropriate takes a list of images. If the config's VersionFilePersist +// indicates an upgrade has happened, it attempts to wipe that list of images. +// This attempt is best-effort. +func (s *Server) wipeIfAppropriate(ctx context.Context, imagesToDelete []string) { + if !s.config.InternalWipe { + return + } + // Check if our persistent version file is out of date. + // If so, we have upgrade, and we should wipe images. 
+ shouldWipeImages, err := version.ShouldCrioWipe(s.config.VersionFilePersist) + if err != nil { + logrus.Warnf("error encountered when checking whether cri-o should wipe images: %v", err) + } + // Note: some of these will fail if some aspect of the pod cleanup failed as well, + // but this is best-effort anyway, as the Kubelet will eventually cleanup images when + // disk usage gets too high. + if shouldWipeImages { + for _, img := range imagesToDelete { + if err := s.removeImage(ctx, img); err != nil { + logrus.Warnf("failed to remove image %s: %v", img, err) + } + } + } +} + func (s *Server) addSandbox(sb *sandbox.Sandbox) error { return s.ContainerServer.AddSandbox(sb) } diff --git a/server/utils.go b/server/utils.go index 0eac70c4605..e57b282c88a 100644 --- a/server/utils.go +++ b/server/utils.go @@ -6,7 +6,6 @@ import ( "fmt" "io" "io/ioutil" - "math" "os" "path/filepath" "strings" @@ -15,15 +14,12 @@ import ( encconfig "github.com/containers/ocicrypt/config" cryptUtils "github.com/containers/ocicrypt/utils" "github.com/containers/storage/pkg/mount" - "github.com/cri-o/cri-o/internal/lib/sandbox" "github.com/cri-o/cri-o/internal/log" - "github.com/cri-o/ocicni/pkg/ocicni" v1 "github.com/opencontainers/image-spec/specs-go/v1" "github.com/opencontainers/runtime-tools/validate" "github.com/pkg/errors" "github.com/sirupsen/logrus" "github.com/syndtr/gocapability/capability" - "k8s.io/apimachinery/pkg/api/resource" pb "k8s.io/cri-api/pkg/apis/runtime/v1alpha2" "k8s.io/kubernetes/pkg/kubelet/types" ) @@ -107,54 +103,6 @@ func parseDNSOptions(servers, searches, options []string, path string) error { return nil } -func (s *Server) newPodNetwork(sb *sandbox.Sandbox) (ocicni.PodNetwork, error) { - var egress, ingress int64 = 0, 0 - - if val, ok := sb.Annotations()["kubernetes.io/egress-bandwidth"]; ok { - egressQ, err := resource.ParseQuantity(val) - if err != nil { - return ocicni.PodNetwork{}, fmt.Errorf("failed to parse egress bandwidth: %v", err) - } else if iegress, isok := egressQ.AsInt64(); isok { - egress = iegress - } - } - if val, ok := sb.Annotations()["kubernetes.io/ingress-bandwidth"]; ok { - ingressQ, err := resource.ParseQuantity(val) - if err != nil { - return ocicni.PodNetwork{}, fmt.Errorf("failed to parse ingress bandwidth: %v", err) - } else if iingress, isok := ingressQ.AsInt64(); isok { - ingress = iingress - } - } - - var bwConfig *ocicni.BandwidthConfig - - if ingress > 0 || egress > 0 { - bwConfig = &ocicni.BandwidthConfig{} - if ingress > 0 { - bwConfig.IngressRate = uint64(ingress) - bwConfig.IngressBurst = math.MaxUint32 * 8 // 4GB burst limit - } - if egress > 0 { - bwConfig.EgressRate = uint64(egress) - bwConfig.EgressBurst = math.MaxUint32 * 8 // 4GB burst limit - } - } - - network := s.config.CNIPlugin().GetDefaultNetworkName() - return ocicni.PodNetwork{ - Name: sb.KubeName(), - Namespace: sb.Namespace(), - UID: sb.Metadata().UID, - Networks: []ocicni.NetAttachment{}, - ID: sb.ID(), - NetNS: sb.NetNsPath(), - RuntimeConfig: map[string]ocicni.RuntimeConfig{ - network: {Bandwidth: bwConfig}, - }, - }, nil -} - // inStringSlice checks whether a string is inside a string slice. // Comparison is case insensitive. 
func inStringSlice(ss []string, str string) bool { diff --git a/test/crio-wipe.bats b/test/crio-wipe.bats index 271735c0e5d..fe35eeceda0 100644 --- a/test/crio-wipe.bats +++ b/test/crio-wipe.bats @@ -10,6 +10,8 @@ function setup() { setup_test export CONTAINER_VERSION_FILE="$TESTDIR"/version.tmp export CONTAINER_VERSION_FILE_PERSIST="$TESTDIR"/version-persist.tmp + CONTAINER_NAMESPACES_DIR=$(mktemp -d) + export CONTAINER_NAMESPACES_DIR } function run_podman_with_args() { @@ -22,6 +24,7 @@ function teardown() { cleanup_test run_podman_with_args stop -a run_podman_with_args rm -fa + cleanup_namespaces_dir } # run crio_wipe calls crio_wipe and tests it succeeded @@ -47,20 +50,31 @@ function test_crio_wiped_images() { # check that the pause image was removed, as we removed a pod # that used it output=$(crictl images) - [[ ! "$output" == *"pause"* ]] + [[ ! "$output" == *"$IMAGE_USED"* ]] } function test_crio_did_not_wipe_images() { # check that the pause image was not removed output=$(crictl images) - [[ "$output" == *"pause"* ]] + [[ "$output" == *"$IMAGE_USED"* ]] +} + +# simulate a reboot by unmounting and removing the namespaces +function cleanup_namespaces_dir() { + find "$CONTAINER_NAMESPACES_DIR" -type f -exec umount {} \; + rm -fr "$CONTAINER_NAMESPACES_DIR" } function start_crio_with_stopped_pod() { start_crio + # it must be everything before the tag, because crictl output won't match (the columns for image and tag are separated by space) + IMAGE_USED=$(jq -r .image.image < "$TESTDATA"/container_config.json | cut -f1 -d ':') + local pod_id pod_id=$(crictl runp "$TESTDATA"/sandbox_config.json) + ctr_id=$(crictl create "$pod_id" "$TESTDATA"/container_config.json "$TESTDATA"/sandbox_config.json) + crictl start "$ctr_id" crictl stopp "$pod_id" } @@ -115,3 +129,104 @@ function start_crio_with_stopped_pod() { run_podman_with_args ps -a | grep test } + +@test "internal_wipe remove containers and images when remove both" { + # simulate a reboot by having a removable namespaces dir + start_crio_with_stopped_pod + stop_crio_no_clean + + rm "$CONTAINER_VERSION_FILE" + rm "$CONTAINER_VERSION_FILE_PERSIST" + # simulate a reboot by having a removable namespaces dir + cleanup_namespaces_dir + + CONTAINER_INTERNAL_WIPE=true start_crio_no_setup + test_crio_wiped_containers + test_crio_wiped_images +} + +@test "internal_wipe remove containers when remove temporary" { + start_crio_with_stopped_pod + stop_crio_no_clean + + rm "$CONTAINER_VERSION_FILE" + # simulate a reboot by having a removable namespaces dir + cleanup_namespaces_dir + + CONTAINER_INTERNAL_WIPE=true start_crio_no_setup + test_crio_wiped_containers + test_crio_did_not_wipe_images +} + +@test "internal_wipe clear both when remove persist" { + start_crio_with_stopped_pod + stop_crio_no_clean + + rm "$CONTAINER_VERSION_FILE_PERSIST" + # simulate a reboot by having a removable namespaces dir + cleanup_namespaces_dir + + CONTAINER_INTERNAL_WIPE=true start_crio_no_setup + test_crio_wiped_containers + test_crio_wiped_images +} + +@test "internal_wipe don't clear podman containers" { + if [ -z "$PODMAN_BINARY" ]; then + skip "Podman not installed" + fi + + start_crio_with_stopped_pod + stop_crio_no_clean + + run_podman_with_args run --name test -d quay.io/crio/busybox:latest top + + CONTAINER_INTERNAL_WIPE=true start_crio_no_setup + + run_podman_with_args ps -a | grep test +} + +@test "internal_wipe don't clear containers on a forced restart of crio" { + start_crio_with_stopped_pod + stop_crio_no_clean "-9" || true + + 
CONTAINER_INTERNAL_WIPE=true start_crio_no_setup + + test_crio_did_not_wipe_containers + test_crio_did_not_wipe_images +} + +@test "internal_wipe eventually cleans network on forced restart of crio if network is slow to come up" { + CNI_RESULTS_DIR=/var/lib/cni/results + + start_crio + + pod_id=$(crictl runp "$TESTDATA"/sandbox_config.json) + ctr_id=$(crictl create "$pod_id" "$TESTDATA"/container_config.json "$TESTDATA"/sandbox_config.json) + crictl start "$ctr_id" + + stop_crio_no_clean + + runtime kill "$ctr_id" || true + runtime kill "$pod_id" || true + # simulate a reboot by having a removable namespaces dir + cleanup_namespaces_dir + + # pretend like the CNI plugin is waiting for a container to start + mv "$CRIO_CNI_PLUGIN"/"$CNI_TYPE" "$CRIO_CNI_PLUGIN"/"$CNI_TYPE"-hidden + rm "$CONTAINER_VERSION_FILE" + + CONTAINER_INTERNAL_WIPE=true start_crio_no_setup + + # allow cri-o to catchup + sleep 5s + + # pretend like the CNI container has started + mv "$CRIO_CNI_PLUGIN"/"$CNI_TYPE"-hidden "$CRIO_CNI_PLUGIN"/"$CNI_TYPE" + + # allow cri-o to catch up + sleep 5s + + # make sure network resources were cleaned up + ! ls "$CNI_RESULTS_DIR"/*"$pod_id"* +} diff --git a/test/drop_infra.bats b/test/drop_infra.bats index 96b8bf96134..db01c950ed9 100644 --- a/test/drop_infra.bats +++ b/test/drop_infra.bats @@ -21,7 +21,7 @@ function teardown() { "$TESTDATA"/sandbox_config.json > "$TESTDIR"/sandbox_no_infra.json pod_id=$(crictl runp "$TESTDIR"/sandbox_no_infra.json) - output=$("$CONTAINER_RUNTIME" --root "$RUNTIME_ROOT" list) + output=$(runtime list) [[ ! "$output" = *"$pod_id"* ]] } @@ -30,6 +30,6 @@ function teardown() { "$TESTDATA"/sandbox_config.json > "$TESTDIR"/sandbox_no_infra.json pod_id=$(crictl runp "$TESTDIR"/sandbox_no_infra.json) - output=$("$CONTAINER_RUNTIME" --root "$RUNTIME_ROOT" list) + output=$(runtime list) [[ "$output" = *"$pod_id"* ]] } diff --git a/test/helpers.bash b/test/helpers.bash index 652730cebeb..74f6c79e98c 100644 --- a/test/helpers.bash +++ b/test/helpers.bash @@ -184,6 +184,11 @@ function crictl() { "$CRICTL_BINARY" -r "unix://$CRIO_SOCKET" -i "unix://$CRIO_SOCKET" "$@" } +# Run the runtime binary with the specified RUNTIME_ROOT +function runtime() { + "$CONTAINER_RUNTIME" --root "$RUNTIME_ROOT" "$@" +} + # Communicate with Docker on the host machine. # Should rarely use this. 
function docker_host() { diff --git a/test/mocks/criostorage/criostorage.go b/test/mocks/criostorage/criostorage.go index 6cd828393ae..0a7d0406fe0 100644 --- a/test/mocks/criostorage/criostorage.go +++ b/test/mocks/criostorage/criostorage.go @@ -250,20 +250,6 @@ func (mr *MockRuntimeServerMockRecorder) GetWorkDir(arg0 interface{}) *gomock.Ca return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetWorkDir", reflect.TypeOf((*MockRuntimeServer)(nil).GetWorkDir), arg0) } -// RemovePodSandbox mocks base method -func (m *MockRuntimeServer) RemovePodSandbox(arg0 string) error { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "RemovePodSandbox", arg0) - ret0, _ := ret[0].(error) - return ret0 -} - -// RemovePodSandbox indicates an expected call of RemovePodSandbox -func (mr *MockRuntimeServerMockRecorder) RemovePodSandbox(arg0 interface{}) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RemovePodSandbox", reflect.TypeOf((*MockRuntimeServer)(nil).RemovePodSandbox), arg0) -} - // SetContainerMetadata mocks base method func (m *MockRuntimeServer) SetContainerMetadata(arg0 string, arg1 *storage0.RuntimeContainerMetadata) error { m.ctrl.T.Helper() diff --git a/test/network.bats b/test/network.bats index 2318aa115d7..272d95b2dc8 100644 --- a/test/network.bats +++ b/test/network.bats @@ -1,5 +1,7 @@ #!/usr/bin/env bats +# vim:set ft=bash : + load helpers function setup() { @@ -141,3 +143,29 @@ function check_networking() { check_networking } + +@test "Clean up network if pod sandbox gets killed" { + start_crio + + CNI_RESULTS_DIR=/var/lib/cni/results + POD=$(crictl runp "$TESTDATA/sandbox_config.json") + + # CNI result is there + # shellcheck disable=SC2010 + [[ $(ls $CNI_RESULTS_DIR | grep "$POD") != "" ]] + + # kill the sandbox + runtime kill "$POD" KILL + + # wait for the pod to be killed + while crictl inspectp "$POD" | jq -e '.status.state != "SANDBOX_NOTREADY"' > /dev/null; do + echo Waiting for sandbox to be stopped + done + + # now remove the sandbox + crictl rmp "$POD" + + # CNI result is gone + # shellcheck disable=SC2010 + [[ $(ls $CNI_RESULTS_DIR | grep "$POD") == "" ]] +} diff --git a/test/restore.bats b/test/restore.bats index 57c16b49cd9..cffd461b101 100644 --- a/test/restore.bats +++ b/test/restore.bats @@ -62,7 +62,7 @@ function teardown() { stop_crio # simulate reboot with runc state going away - "$CONTAINER_RUNTIME" delete -f "$pod_id" + runtime delete -f "$pod_id" start_crio @@ -79,8 +79,8 @@ function teardown() { stop_crio # simulate reboot with runc state going away - "$CONTAINER_RUNTIME" delete -f "$pod_id" - "$CONTAINER_RUNTIME" delete -f "$ctr_id" + runtime delete -f "$pod_id" + runtime delete -f "$ctr_id" start_crio @@ -98,8 +98,8 @@ function teardown() { stop_crio # simulate reboot with runc state going away - "$CONTAINER_RUNTIME" delete -f "$pod_id" - "$CONTAINER_RUNTIME" delete -f "$ctr_id" + runtime delete -f "$pod_id" + runtime delete -f "$ctr_id" start_crio @@ -118,7 +118,7 @@ function teardown() { stop_crio # simulate reboot with runc state going away - "$CONTAINER_RUNTIME" delete -f "$pod_id" + runtime delete -f "$pod_id" start_crio @@ -141,8 +141,8 @@ function teardown() { stop_crio # simulate reboot with runc state going away - "$CONTAINER_RUNTIME" --root "$RUNTIME_ROOT" delete -f "$pod_id" - "$CONTAINER_RUNTIME" --root "$RUNTIME_ROOT" delete -f "$ctr_id" + runtime delete -f "$pod_id" + runtime delete -f "$ctr_id" start_crio output=$(crictl pods --quiet) @@ -171,8 +171,8 @@ function teardown() { stop_crio # simulate 
reboot with runtime state and config.json going away - "$CONTAINER_RUNTIME" delete -f "$pod_id" - "$CONTAINER_RUNTIME" delete -f "$ctr_id" + runtime delete -f "$pod_id" + runtime delete -f "$ctr_id" find "$TESTDIR"/ -name config.json -exec rm \{\} \; find "$TESTDIR"/ -name shm -exec umount -l \{\} \; diff --git a/test/timeout.bats b/test/timeout.bats index 309c4fdff37..bbdfe79b304 100644 --- a/test/timeout.bats +++ b/test/timeout.bats @@ -58,7 +58,7 @@ function wait_clean() { pods=$(crictl pods -q) [[ -z "$pods" ]] - created_ctr_id=$("$CONTAINER_RUNTIME" --root "$RUNTIME_ROOT" list -q) + created_ctr_id=$(runtime list -q) [ -n "$created_ctr_id" ] output=$(crictl runp "$TESTDATA"/sandbox_config.json) @@ -81,7 +81,7 @@ function wait_clean() { [[ -z "$ctrs" ]] # cri-o should have created a container - created_ctr_id=$("$CONTAINER_RUNTIME" --root "$RUNTIME_ROOT" list -q | grep -v "$pod_id") + created_ctr_id=$(runtime list -q | grep -v "$pod_id") [ -n "$created_ctr_id" ] output=$(crictl create "$pod_id" "$TESTDATA"/container_config.json "$TESTDATA"/sandbox_config.json) @@ -98,7 +98,7 @@ function wait_clean() { wait_create - created_ctr_id=$("$CONTAINER_RUNTIME" --root "$RUNTIME_ROOT" list -q) + created_ctr_id=$(runtime list -q) [ -n "$created_ctr_id" ] # we should create a new pod and not reuse the old one @@ -108,7 +108,7 @@ function wait_clean() { wait_clean # the old, timed out container should have been removed - ! "$CONTAINER_RUNTIME" --root "$RUNTIME_ROOT" list -q | grep "$created_ctr_id" + ! runtime list -q | grep "$created_ctr_id" } @test "should clean up container after timeout if request changes" { @@ -123,7 +123,7 @@ function wait_clean() { wait_create # cri-o should have created a container - created_ctr_id=$("$CONTAINER_RUNTIME" --root "$RUNTIME_ROOT" list -q | grep -v "$pod_id") + created_ctr_id=$(runtime list -q | grep -v "$pod_id") [ -n "$created_ctr_id" ] # should create a new container and not reuse the old one @@ -133,7 +133,7 @@ function wait_clean() { wait_clean # the old, timed out container should have been removed - ! "$CONTAINER_RUNTIME" --root "$RUNTIME_ROOT" list -q | grep "$created_ctr_id" + ! runtime list -q | grep "$created_ctr_id" } @test "should clean up pod after timeout if not re-requested" { @@ -151,7 +151,7 @@ function wait_clean() { [[ -z "$pods" ]] # pod should have been cleaned up - [[ -z $("$CONTAINER_RUNTIME" --root "$RUNTIME_ROOT" list -q) ]] + [[ -z $(runtime list -q) ]] # we should recreate the pod and not reuse the old one crictl runp "$TESTDATA"/sandbox_config.json @@ -173,7 +173,7 @@ function wait_clean() { [[ -z "$ctrs" ]] # container should have been cleaned up - ! "$CONTAINER_RUNTIME" --root "$RUNTIME_ROOT" list -q | grep -v "$pod_id" + ! runtime list -q | grep -v "$pod_id" # we should recreate the container and not reuse the old one crictl create "$pod_id" "$TESTDATA"/container_config.json "$TESTDATA"/sandbox_config.json @@ -193,7 +193,7 @@ function wait_clean() { wait_create # container should not have been cleaned up - created_ctr_id=$("$CONTAINER_RUNTIME" --root "$RUNTIME_ROOT" list -q) + created_ctr_id=$(runtime list -q) [ -n "$created_ctr_id" ] ! crictl create "$created_ctr_id" "$TESTDATA"/container_config.json "$TESTDATA"/sandbox_config.json @@ -213,7 +213,7 @@ function wait_clean() { wait_create # cri-o should have created a container - created_ctr_id=$("$CONTAINER_RUNTIME" --root "$RUNTIME_ROOT" list -q | grep -v "$pod_id") + created_ctr_id=$(runtime list -q | grep -v "$pod_id") [ -n "$created_ctr_id" ] ! crictl start "$created_ctr_id"