diff --git a/.github/workflows/build-pr.yml b/.github/workflows/build-pr.yml deleted file mode 100644 index 2b4e96091..000000000 --- a/.github/workflows/build-pr.yml +++ /dev/null @@ -1,47 +0,0 @@ -name: build-pr - -on: - pull_request: - branches: - - main - -jobs: - run: - runs-on: ubuntu-latest - - strategy: - matrix: - go: [1.19.x, 1.20.x] - - steps: - - name: checkout source code - uses: actions/checkout@v3 - - - name: setup go environment - uses: actions/setup-go@v4 - with: - go-version: ${{ matrix.go }} - - - name: create go.mod - run: | - # Fix for "cannot find main module" issue - go mod init github.com/opencontainers/runtime-spec - - go get -d ./schema/... - - - name: run golangci-lint - uses: golangci/golangci-lint-action@v3 - with: - version: v1.51.2 - args: --verbose - - - name: run tests - run: | - set -x - make install.tools - - make .govet - - make .gitvalidation - make docs - make -C schema test diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 83354d8d5..1274470e5 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -4,6 +4,9 @@ on: push: branches: - main + pull_request: + branches: + - main jobs: run: @@ -11,14 +14,14 @@ jobs: strategy: matrix: - go: [1.19.x, 1.20.x] + go: [1.21.x, 1.22.x] steps: - name: checkout source code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: setup go environment - uses: actions/setup-go@v4 + uses: actions/setup-go@v5 with: go-version: ${{ matrix.go }} @@ -30,9 +33,9 @@ jobs: go get -d ./schema/... 
- name: run golangci-lint - uses: golangci/golangci-lint-action@v3 + uses: golangci/golangci-lint-action@v4 with: - version: v1.51.2 + version: v1.56.1 args: --verbose - name: run tests diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 000000000..fd0252ea5 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,14 @@ +name: Lint + +on: [push, pull_request] + +jobs: + check-format: + runs-on: ubuntu-24.04 + steps: + - name: Checkout code + uses: actions/checkout@v4 + - name: Run make -C schema fmt + run: make -C schema fmt + - name: Check for changes + run: git diff --exit-code diff --git a/CODEOWNERS b/CODEOWNERS index ef276a976..b560fcab1 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1 +1 @@ -* @AkihiroSuda @crosbymichael @cyphar @dqminh @giuseppe @hqhq @kolyshkin @mrunalp @thaJeztah @tianon @vbatts @utam0k +* @AkihiroSuda @crosbymichael @cyphar @dqminh @giuseppe @hqhq @kolyshkin @mrunalp @thaJeztah @tianon @utam0k diff --git a/ChangeLog b/ChangeLog index c93790817..bf2807f3f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,55 @@ OpenContainers Specifications +Changes with v1.2.1: + + Additions: + + * zos updates (#1273) + * Add support for windows CPU affinity (#1258) + * specs-go: sync SCMP_ARCH_* constants with libseccomp main (#1229) + * Add CPU affinity to executed processes (#1253, #1261) + * config-linux: describe the format of cpus and mems (#1253) + + Minor fixes: + + * Fix description of errnoRet in Seccomp (#1277) + * config-linux: update for libseccomp v2.6.0 (#1276) + * Correct `prestart` hook description in summary (#1275) + + Documentation, CI & Governance: + + * ci: Add a github actions workflow for lint (#1257) + * update http links to https (#1269) + * doc: fix the invalid hyperlink naming-a-volume (#1268) + * ci: remove redundunt actions (#1256) + * chore: format JSON file `make -C schema fmt` (#1255) + * CODEOWNERS: remove vbatts (#1248) + * MAINTAINERS: move vbatts to EMERITUS (#1248) + * Update 
golangci-lint to v1.56.1 in CI (#1245) + * Add Go v1.21 and v1.22 to GitHub Actions CI matrix (#1245) + * Update GitHub Actions packages to resolve warnings in CI (#1244) + +Changes with v1.2.0: + + Additions: + + * config: add idmap and ridmap mount options (#1222) + * config.md: allow empty mappings for [r]idmap (#1224) + * features-linux: Expose idmap information (#1219) + * mount: Allow relative mount destinations on Linux (#1225) + * features: add potentiallyUnsafeConfigAnnotations (#1205) + * config: add support for org.opencontainers.image annotations (#1197) + + Minor fixes: + + * config: improve bind mount and propagation doc (#1228) + + Documentation, CI & Governance: + + * fix link to hooks in features (#1226) + * specs-go: add missing deprecation comment for Hooks.Prestart (#1232) + * specs-go: mark LinuxMemory.Kernel as deprecated (#1233) + Changes with v1.1.0: Breaking changes (but rather conforms to the existing runc implementation): diff --git a/EMERITUS.md b/EMERITUS.md new file mode 100644 index 000000000..0e234dad0 --- /dev/null +++ b/EMERITUS.md @@ -0,0 +1,12 @@ +# Emeritus + +We would like to acknowledge previous OCI runtime spec maintainers and their huge contributions to our collective success: + +- Rohit Jnagal (@rjnagal) +- Victor Marmol (@vmarmol) +- Alexander Morozov (@LK4D4) +- Vishnu Kannan (@vishh) +- Brandon Philips (@philips) +- Vincent Batts (@vbatts) + +We thank these members for their service to the OCI community.
diff --git a/MAINTAINERS b/MAINTAINERS index 79decef3e..7424597ae 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1,6 +1,5 @@ Michael Crosby (@crosbymichael) Mrunal Patel (@mrunalp) -Vincent Batts (@vbatts) Daniel, Dao Quang Minh (@dqminh) Tianon Gravi (@tianon) Qiang Huang (@hqhq) diff --git a/README.md b/README.md index a49599829..2a7fa5686 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ OCI discussion happens in the following chat rooms, which are all bridged togeth #### Sign your work The sign-off is a simple line at the end of the explanation for the patch, which certifies that you wrote it or otherwise have the right to pass it on as an open-source patch. -The rules are pretty simple: if you can certify the below (from http://developercertificate.org): +The rules are pretty simple: if you can certify the below (from https://developercertificate.org): ``` Developer Certificate of Origin @@ -143,12 +143,12 @@ Read more on [How to Write a Git Commit Message][how-to-git-commit] or the Discu [charter]: https://github.com/opencontainers/tob/blob/master/CHARTER.md [code-of-conduct]: https://github.com/opencontainers/org/blob/master/CODE_OF_CONDUCT.md [dev-list]: https://groups.google.com/a/opencontainers.org/forum/#!forum/dev -[how-to-git-commit]: http://chris.beams.io/posts/git-commit +[how-to-git-commit]: https://cbea.ms/git-commit/ [iso-week]: https://en.wikipedia.org/wiki/ISO_week_date#Calculating_the_week_number_of_a_given_date -[minutes]: http://ircbot.wl.linuxfoundation.org/meetings/opencontainers/ +[minutes]: https://ircbot.wl.linuxfoundation.org/meetings/opencontainers/ [oci]: https://www.opencontainers.org [rfc5545]: https://tools.ietf.org/html/rfc5545 [runtime-wiki]: https://github.com/opencontainers/runtime-spec/wiki [uberconference]: https://www.uberconference.com/opencontainers -[git-commit.1]: http://git-scm.com/docs/git-commit +[git-commit.1]: https://git-scm.com/docs/git-commit diff --git a/config-linux.md b/config-linux.md index 
f0261415e..4ce6b7088 100644 --- a/config-linux.md +++ b/config-linux.md @@ -189,6 +189,108 @@ In addition to any devices configured with this setting, the runtime MUST also s * [`/dev/ptmx`][pts.4]. A [bind-mount or symlink of the container's `/dev/pts/ptmx`][devpts]. +## Network Devices + +Linux network devices are entities that send and receive data packets. They are +not represented as files in the `/dev` directory. Instead, they are represented +by the [`net_device`][net_device] data structure in the Linux kernel. Network +devices can belong to only one network namespace and use a set of operations +distinct from regular file operations. Network devices can be categorized as +**physical** or **virtual**: + +* **Physical network devices** correspond to hardware interfaces, such as + Ethernet cards (e.g., `eth0`, `enp0s3`). They are directly associated with + physical network hardware. +* **Virtual network devices** are software-defined interfaces, such as loopback + devices (`lo`), virtual Ethernet pairs (`veth`), bridges (`br0`), VLANs, and + MACVLANs. They are created and managed by the kernel and do not correspond + to physical hardware. + +This schema focuses solely on moving existing network devices identified by name +from the host network namespace into the container network namespace. It does +not cover the complexities of network device creation or network configuration, +such as IP address assignment, routing, and DNS setup. + +**`netDevices`** (object, OPTIONAL) - A set of network devices that MUST be made +available in the container. The runtime is responsible for moving these devices; +the underlying mechanism is implementation-defined. + +The name of the network device is the entry key. Entry values are objects with +the following properties: + +* **`name`** *(string, OPTIONAL)* - the name of the network device inside the + container namespace. If not specified, the host name is used. 
+ +The runtime MUST check if moving the network interface to the container +namespace is possible. If a network device with the specified name already +exists in the container namespace, the runtime MUST [generate an error](runtime.md#errors), +unless the user has provided a template by appending +`%d` to the new name. In that case, the runtime MUST allow the move, and the +kernel will generate a unique name for the interface within the container's +network namespace. + +The runtime MUST preserve existing network interface attributes, including all +permanent IP addresses (IFA_F_PERMANENT flag) of any family with global scope +(RT_SCOPE_UNIVERSE value) as defined in [`RFC 3549 Section 2.3.3.2`][rfc3549]. +This ensures that only addresses intended for persistent, external communication +are transferred. + +The runtime MUST set the network device state to "up" after moving it to the +network namespace to allow the container to send and receive network traffic +through that device. + +### Namespace Lifecycle and Container Termination + +The runtime MUST NOT actively manage the interface's lifecycle and configuration +*within* the container's network namespace. This is because network interfaces +are inherently tied to the network namespace itself, and their lifecycle is +therefore managed by the owner of the network namespace. Typically, this +ownership and management are handled by higher-level container runtime +orchestrators, rather than the processes running directly within the container. + +The runtime **MUST NOT** attempt to move the interface out of the namespace +before deletion. This design decision is based on the following: + +* **Namespace Ownership:** Network interfaces are tied to the network namespace, + which may not always be directly managed by the runtime. 
+* **Abrupt Termination:** Even when the runtime manages the namespace, it cannot + reliably participate in its deletion if the container's processes terminate + abruptly (e.g., due to a crash) or run until completion. + +During the network namespace deletion the kernel's built-in namespace cleanup +mechanisms take over, as described in [network_namespaces(7)][net_namespaces.7]: +"When a network namespace is freed (i.e., when the last process in the namespace +terminates), its physical network devices are moved back to the initial network +namespace." All the network namespace migratable physical network devices are +moved to the default network namespace, while virtual devices (veth, macvlan, +...) are destroyed. + +If users require custom handling of interface lifecycle during namespace +deletion, they can utilize existing features within the namespace orchestrator +or employ post-stop hooks. + +**Physical Interface Renaming and Systemd** + +When a physical interface is renamed within a container and the container's +network namespace is later deleted, the kernel will move the interface back to +the root namespace with its renamed name. In case of a name conflict in the root +namespace, the kernel will rename it to `dev%d`. To ensure predictable interface +names in the root namespace, users can utilize systemd's `udevd` and `networkd` +rules. Refer to [systemd Predictable Network Interface Names][predictable-network-interfaces-names] +for more information on configuring predictable names. + +### Example + +#### Moving a device with a renamed interface inside the container: + +```json +"netDevices": { + "eth0" : { + "name": "container_eth0" + } +} +``` + ## Control groups Also known as cgroups, they are used to restrict resource usage for a container and handle device access. 
@@ -395,8 +497,8 @@ The following parameters can be specified to set up the controller: * **`period`** *(uint64, OPTIONAL)* - specifies a period of time in microseconds for how regularly a cgroup's access to CPU resources should be reallocated (CFS scheduler only) * **`realtimeRuntime`** *(int64, OPTIONAL)* - specifies a period of time in microseconds for the longest continuous period in which the tasks in a cgroup have access to CPU resources * **`realtimePeriod`** *(uint64, OPTIONAL)* - same as **`period`** but applies to realtime scheduler only -* **`cpus`** *(string, OPTIONAL)* - list of CPUs the container will run in -* **`mems`** *(string, OPTIONAL)* - list of Memory Nodes the container will run in +* **`cpus`** *(string, OPTIONAL)* - list of CPUs the container will run on. This is a comma-separated list, with dashes to represent ranges. For example, `0-3,7` represents CPUs 0,1,2,3, and 7. +* **`mems`** *(string, OPTIONAL)* - list of memory nodes the container will run on. This is a comma-separated list, with dashes to represent ranges. For example, `0-3,7` represents memory nodes 0,1,2,3, and 7. * **`idle`** *(int64, OPTIONAL)* - cgroups are configured with minimum weight, 0: default behavior, 1: SCHED_IDLE. #### Example @@ -564,7 +666,9 @@ For more information, see the kernel cgroups documentation about [pids][cgroup-v The following parameters can be specified to set up the controller: -* **`limit`** *(int64, REQUIRED)* - specifies the maximum number of tasks in the cgroup +* **`limit`** *(int64, OPTIONAL)* - specifies the maximum number of tasks in the cgroup, with `-1` indicating no limit (`max`). + +> Note: Even though it may superficially seem redundant, `0` is a valid limit value for the `pids` cgroup controller from the kernel's perspective and SHOULD be treated as such by runtimes. 
#### Example @@ -640,11 +744,15 @@ If `intelRdt` is not set, the runtime MUST NOT manipulate any `resctrl` pseudo-f The following parameters can be specified for the container: * **`closID`** *(string, OPTIONAL)* - specifies the identity for RDT Class of Service (CLOS). + As a special case, value `/` means that the container MUST be assigned to the default CLOS (the + root of the resctrl filesystem). * **`l3CacheSchema`** *(string, OPTIONAL)* - specifies the schema for L3 cache id and capacity bitmask (CBM). The value SHOULD start with `L3:` and SHOULD NOT contain newlines. * **`memBwSchema`** *(string, OPTIONAL)* - specifies the schema of memory bandwidth per L3 cache id. The value MUST start with `MB:` and MUST NOT contain newlines. +* **`schemata`** *(array of strings, OPTIONAL)* - specifies the schemata to be written to the `schemata` file in resctrlfs. Each element represents one line in the `schemata` file. The value MUST NOT contain newlines. +* **`enableMonitoring`** *(boolean, OPTIONAL)* - enables resctrl monitoring for the container. The following rules on parameters MUST be applied: @@ -654,38 +762,95 @@ The following rules on parameters MUST be applied: * If either `l3CacheSchema` or `memBwSchema` is set, runtimes MUST write the value to the `schemata` file in the that sub-directory discussed in `closID`. -* If neither `l3CacheSchema` nor `memBwSchema` is set, runtimes MUST NOT write to `schemata` files in any `resctrl` pseudo-filesystems. +* If the `schemata` field is set, runtimes MUST write the value to the `schemata` file in the sub-directory discussed in `closID`. If `l3CacheSchema` or `memBwSchema` is also set, the value of the `schemata` field MUST be written last, after the values from `l3CacheSchema` and `memBwSchema` have been written. + +* If none of `l3CacheSchema`, `memBwSchema` or `schemata` is set, runtimes MUST NOT write to `schemata` files in any `resctrl` pseudo-filesystems.
* If `closID` is not set, runtimes MUST use the container ID from [`start`](runtime.md#start) and create the `<container-id>` directory. -* If `closID` is set, `l3CacheSchema` and/or `memBwSchema` is set +* If `closID` is set, `l3CacheSchema` and/or `memBwSchema` and/or `schemata` is set * if `closID` directory in a mounted `resctrl` pseudo-filesystem doesn't exist, the runtimes MUST create it. * if `closID` directory in a mounted `resctrl` pseudo-filesystem exists, runtimes MUST compare `l3CacheSchema` and/or `memBwSchema` value with `schemata` file, and [generate an error](runtime.md#errors) if doesn't match. -* If `closID` is set, and neither of `l3CacheSchema` and `memBwSchema` are set, runtime MUST check if corresponding pre-configured directory `closID` is present in mounted `resctrl`. If such pre-configured directory `closID` exists, runtime MUST assign container to this `closID` and [generate an error](runtime.md#errors) if directory does not exist. +* If `closID` is set, and none of `l3CacheSchema`, `memBwSchema` or `schemata` are set, runtime MUST check if corresponding pre-configured directory `closID` is present in mounted `resctrl`. If such pre-configured directory `closID` exists, runtime MUST assign container to this `closID` and [generate an error](runtime.md#errors) if directory does not exist. -* **`enableCMT`** *(boolean, OPTIONAL)* - specifies if Intel RDT CMT should be enabled: - * CMT (Cache Monitoring Technology) supports monitoring of the last-level cache (LLC) occupancy - for the container. +* If `enableMonitoring` is set, the runtime MUST create a dedicated MON group + for the container. The runtime MUST use the container ID from + [`start`](runtime.md#start) as the name of the MON group, i.e. create + `mon_groups/<container-id>/` subdirectory under the top-level CTRL_MON group + (named after `closID` or `<container-id>`, see above). The runtime MUST + delete the MON group after the container is deleted. If creation of the MON + group fails (e.g.
the maximum number of MON groups is reached) the runtime MUST + return an error. -* **`enableMBM`** *(boolean, OPTIONAL)* - specifies if Intel RDT MBM should be enabled: - * MBM (Memory Bandwidth Monitoring) supports monitoring of total and local memory bandwidth - for the container. +> **NOTE:** The `enableCMT` and `enableMBM` parameters, available in runtime-spec versions v1.1.0 through v1.2.1, were +> replaced with a unified `enableMonitoring` parameter in v1.3.0. Their semantics were loosely defined and there were +> no known implementations. More critically, these parameters were problematic as hardware does not support selective +> enabling of individual monitoring features. This scheme also made it unnecessarily complex to add support for new +> monitoring features, without providing any recognized benefits. ### Example -Consider a two-socket machine with two L3 caches where the default CBM is 0x7ff and the max CBM length is 11 bits, -and minimum memory bandwidth of 10% with a memory bandwidth granularity of 10%. +Consider a two-socket machine with: + +- two L3 caches where the default CBM is 0x7ff (11 bits) +- eight L2 caches where the default CBM is 0xFF (8 bits) +- minimum memory bandwidth of 10% with a memory bandwidth granularity of 10% -Tasks inside the container only have access to the "upper" 7/11 of L3 cache on socket 0 and the "lower" 5/11 L3 cache on socket 1, -and may use a maximum memory bandwidth of 20% on socket 0 and 70% on socket 1. +Tasks inside the container: + +- have access to the "upper" 7/11 of L3 cache on socket 0 and the "lower" 5/11 L3 cache on socket 1 +- have access to the "lower" 4/8 of L2 cache on socket 0 (socket 1 is left out from this example) +- may use a maximum memory bandwidth of 20% on socket 0 and 70% on socket 1. 
```json "linux": { "intelRdt": { "closID": "guaranteed_group", - "l3CacheSchema": "L3:0=7f0;1=1f", - "memBwSchema": "MB:0=20;1=70" + "schemata": [ + "L3:0=7f0;1=1f", + "L2:0=f;1=f;2=f;3=f", + "MB:0=20;1=70" + ] + } +} +``` + +## Memory policy + +**`memoryPolicy`** (object, OPTIONAL) sets the NUMA memory policy for the container. +For more information see the [set_mempolicy(2)][set_mempolicy.2] man page. + +* **`mode`** *(string, REQUIRED)* - the memory policy mode to use with set_mempolicy(2). + + A valid list of constants is shown below. + + * `MPOL_DEFAULT` + * `MPOL_BIND` + * `MPOL_INTERLEAVE` + * `MPOL_WEIGHTED_INTERLEAVE` + * `MPOL_PREFERRED` + * `MPOL_PREFERRED_MANY` + * `MPOL_LOCAL` + +* **`nodes`** *(string, OPTIONAL)* - list of memory nodes from which nodemask is constructed to set_mempolicy(2). This is a comma-separated list, with dashes to represent ranges. For example, `0-3,7` represents memory nodes 0,1,2,3, and 7. Some modes require that there are no nodes, e.g. `MPOL_DEFAULT` and `MPOL_LOCAL`. Others require that there is at least one node, e.g. `MPOL_BIND` and `MPOL_INTERLEAVE`. See set_mempolicy(2) for details. + +* **`flags`** *(array of strings, OPTIONAL)* - list of flags to use with set_mempolicy(2). + + A valid list of constants is shown below. + + * `MPOL_F_NUMA_BALANCING` + * `MPOL_F_RELATIVE_NODES` + * `MPOL_F_STATIC_NODES` + +### Example + +```json +"linux": { + "memoryPolicy": { + "mode": "MPOL_INTERLEAVE", + "nodes": "2-3", + "flags": ["MPOL_F_STATIC_NODES"] } } ``` @@ -719,9 +884,9 @@ The following parameters can be specified to set up seccomp: * **`defaultErrnoRet`** *(uint, OPTIONAL)* - the errno return code to use. Some actions like `SCMP_ACT_ERRNO` and `SCMP_ACT_TRACE` allow to specify the errno code to return. When the action doesn't support an errno, the runtime MUST print and error and fail. - If not specified then its default value is `EPERM`. + The default is `EPERM`. * **`architectures`** *(array of strings, OPTIONAL)* - the architecture used for system calls.
- A valid list of constants as of libseccomp v2.5.0 is shown below. + A valid list of constants as of libseccomp v2.6.0 is shown below. * `SCMP_ARCH_X86` * `SCMP_ARCH_X86_64` @@ -742,6 +907,10 @@ The following parameters can be specified to set up seccomp: * `SCMP_ARCH_PARISC` * `SCMP_ARCH_PARISC64` * `SCMP_ARCH_RISCV64` + * `SCMP_ARCH_LOONGARCH64` + * `SCMP_ARCH_M68K` + * `SCMP_ARCH_SH` + * `SCMP_ARCH_SHEB` * **`flags`** *(array of strings, OPTIONAL)* - list of flags to use with seccomp(2). @@ -775,7 +944,7 @@ The following parameters can be specified to set up seccomp: * **`names`** *(array of strings, REQUIRED)* - the names of the syscalls. `names` MUST contain at least one entry. * **`action`** *(string, REQUIRED)* - the action for seccomp rules. - A valid list of constants as of libseccomp v2.5.0 is shown below. + A valid list of constants as of libseccomp v2.6.0 is shown below. * `SCMP_ACT_KILL` * `SCMP_ACT_KILL_PROCESS` @@ -790,7 +959,7 @@ The following parameters can be specified to set up seccomp: * **`errnoRet`** *(uint, OPTIONAL)* - the errno return code to use. Some actions like `SCMP_ACT_ERRNO` and `SCMP_ACT_TRACE` allow to specify the errno code to return. When the action doesn't support an errno, the runtime MUST print and error and fail. - If not specified its default value is `EPERM`. + The default is `EPERM`. * **`args`** *(array of objects, OPTIONAL)* - the specific syscall in seccomp. Each entry has the following structure: @@ -799,7 +968,7 @@ The following parameters can be specified to set up seccomp: * **`value`** *(uint64, REQUIRED)* - the value for syscall arguments in seccomp. * **`valueTwo`** *(uint64, OPTIONAL)* - the value for syscall arguments in seccomp. * **`op`** *(string, REQUIRED)* - the operator for syscall arguments in seccomp. - A valid list of constants as of libseccomp v2.3.2 is shown below. + A valid list of constants as of libseccomp v2.6.0 is shown below. 
* `SCMP_CMP_NE` * `SCMP_CMP_LT` @@ -959,7 +1128,7 @@ subset of the available options. [cgroup-v2-io]: https://docs.kernel.org/admin-guide/cgroup-v2.html#io [devices]: https://www.kernel.org/doc/Documentation/admin-guide/devices.txt [devpts]: https://www.kernel.org/doc/Documentation/filesystems/devpts.txt -[file]: http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_164 +[file]: https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_164 [libseccomp]: https://github.com/seccomp/libseccomp [proc]: https://www.kernel.org/doc/Documentation/filesystems/proc.txt [seccomp]: https://www.kernel.org/doc/Documentation/prctl/seccomp_filter.txt @@ -967,17 +1136,22 @@ subset of the available options. [sysfs]: https://www.kernel.org/doc/Documentation/filesystems/sysfs.txt [tmpfs]: https://www.kernel.org/doc/Documentation/filesystems/tmpfs.txt -[full.4]: http://man7.org/linux/man-pages/man4/full.4.html -[mknod.1]: http://man7.org/linux/man-pages/man1/mknod.1.html -[mknod.2]: http://man7.org/linux/man-pages/man2/mknod.2.html -[namespaces.7_2]: http://man7.org/linux/man-pages/man7/namespaces.7.html -[null.4]: http://man7.org/linux/man-pages/man4/null.4.html -[personality.2]: http://man7.org/linux/man-pages/man2/personality.2.html -[pts.4]: http://man7.org/linux/man-pages/man4/pts.4.html -[random.4]: http://man7.org/linux/man-pages/man4/random.4.html -[sysctl.8]: http://man7.org/linux/man-pages/man8/sysctl.8.html -[tty.4]: http://man7.org/linux/man-pages/man4/tty.4.html -[zero.4]: http://man7.org/linux/man-pages/man4/zero.4.html -[user-namespaces]: http://man7.org/linux/man-pages/man7/user_namespaces.7.html +[full.4]: https://man7.org/linux/man-pages/man4/full.4.html +[set_mempolicy.2]: https://man7.org/linux/man-pages/man2/set_mempolicy.2.html +[mknod.1]: https://man7.org/linux/man-pages/man1/mknod.1.html +[mknod.2]: https://man7.org/linux/man-pages/man2/mknod.2.html +[namespaces.7_2]: 
https://man7.org/linux/man-pages/man7/namespaces.7.html +[net_device]: https://docs.kernel.org/networking/netdevices.html +[net_namespaces.7]: https://man7.org/linux/man-pages/man7/network_namespaces.7.html +[predictable-network-interfaces-names]: https://systemd.io/PREDICTABLE_INTERFACE_NAMES +[rfc3549]: https://www.ietf.org/rfc/rfc3549.txt +[null.4]: https://man7.org/linux/man-pages/man4/null.4.html +[personality.2]: https://man7.org/linux/man-pages/man2/personality.2.html +[pts.4]: https://man7.org/linux/man-pages/man4/pts.4.html +[random.4]: https://man7.org/linux/man-pages/man4/random.4.html +[sysctl.8]: https://man7.org/linux/man-pages/man8/sysctl.8.html +[tty.4]: https://man7.org/linux/man-pages/man4/tty.4.html +[zero.4]: https://man7.org/linux/man-pages/man4/zero.4.html +[user-namespaces]: https://man7.org/linux/man-pages/man7/user_namespaces.7.html [intel-rdt-cat-kernel-interface]: https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt [time_namespaces.7]: https://man7.org/linux/man-pages/man7/time_namespaces.7.html diff --git a/config-solaris.md b/config-solaris.md index ee375d62d..f50fed98c 100644 --- a/config-solaris.md +++ b/config-solaris.md @@ -115,6 +115,6 @@ Mapped to `lower-link` in the [zonecfg(1M)][zonecfg.1m_2] man page. 
``` -[priv-str-to-set.3c]: http://docs.oracle.com/cd/E86824_01/html/E54766/priv-str-to-set-3c.html -[zoneadmd.1m]: http://docs.oracle.com/cd/E86824_01/html/E54764/zoneadmd-1m.html -[zonecfg.1m_2]: http://docs.oracle.com/cd/E86824_01/html/E54764/zonecfg-1m.html +[priv-str-to-set.3c]: https://docs.oracle.com/cd/E86824_01/html/E54766/priv-str-to-set-3c.html +[zoneadmd.1m]: https://docs.oracle.com/cd/E86824_01/html/E54764/zoneadmd-1m.html +[zonecfg.1m_2]: https://docs.oracle.com/cd/E86824_01/html/E54764/zonecfg-1m.html diff --git a/config-windows.md b/config-windows.md index 73a6d639e..037321a61 100644 --- a/config-windows.md +++ b/config-windows.md @@ -82,6 +82,14 @@ The following parameters can be specified (mutually exclusive): * **`count`** *(uint64, OPTIONAL)* - specifies the number of CPUs available to the container. It represents the fraction of the configured processor `count` in a container in relation to the processors available in the host. The fraction ultimately determines the portion of processor cycles that the threads in a container can use during each scheduling interval, as the number of cycles per 10,000 cycles. * **`shares`** *(uint16, OPTIONAL)* - limits the share of processor time given to the container relative to other workloads on the processor. The processor `shares` (`weight` at the platform level) is a value between 0 and 10,000. * **`maximum`** *(uint16, OPTIONAL)* - determines the portion of processor cycles that the threads in a container can use during each scheduling interval, as the number of cycles per 10,000 cycles. Set processor `maximum` to a percentage times 100. +* **`affinity`** *(array of objects, OPTIONAL)* - specifies the set of CPUs to affinitize for this container. + + Each entry has the following structure: + + Ref: https://learn.microsoft.com/en-us/windows-hardware/drivers/ddi/miniport/ns-miniport-_group_affinity + + * **`mask`** *(uint64, REQUIRED)* - specifies the CPU mask relative to this CPU group.
+ * **`group`** *(uint32, REQUIRED)* - specifies the processor group this mask refers to, as returned by GetLogicalProcessorInformationEx. Ref: https://docs.microsoft.com/en-us/virtualization/api/hcs/schemareference#Container_Processor diff --git a/config-zos.md b/config-zos.md index b0fdc252c..a00e5a578 100644 --- a/config-zos.md +++ b/config-zos.md @@ -1,20 +1,56 @@ -_This document is a work in progress._ - # z/OS Container Configuration This document describes the schema for the [z/OS-specific section](config.md#platform-specific-configuration) of the [container configuration](config.md). +The z/OS container specification uses z/OS UNIX kernel features like namespaces and filesystem jails to fulfill the spec. + +Applications expecting a z/OS environment will very likely expect these file paths to be set up correctly. + +The following filesystems SHOULD be made available in each container's filesystem: + +| Path | Type | +| -------- | ------ | +| /proc | [proc][] | + +## Namespaces + +A namespace wraps a global system resource in an abstraction that makes it appear to the processes within the namespace that they have their own isolated instance of the global resource. +Changes to the global resource are visible to other processes that are members of the namespace, but are invisible to other processes. +For more information, see https://www.ibm.com/docs/zos/latest?topic=planning-namespaces-zos-unix. + +Namespaces are specified as an array of entries inside the `namespaces` root field. +The following parameters can be specified to set up namespaces: -## Devices +* **`type`** *(string, REQUIRED)* - namespace type. The following namespace types SHOULD be supported: + * **`pid`** processes inside the container will only be able to see other processes inside the same container or inside the same pid namespace. + * **`mount`** the container will have an isolated mount table. 
+ * **`ipc`** processes inside the container will only be able to communicate to other processes inside the same container via system level IPC. + * **`uts`** the container will be able to have its own hostname and domain name. +* **`path`** *(string, OPTIONAL)* - namespace file. + This value MUST be an absolute path in the [runtime mount namespace](glossary.md#runtime-namespace). + The runtime MUST place the container process in the namespace associated with that `path`. + The runtime MUST [generate an error](runtime.md#errors) if `path` is not associated with a namespace of type `type`. -**`devices`** (array of objects, OPTIONAL) lists devices that MUST be available in the container. -The runtime MAY supply them however it likes. + If `path` is not specified, the runtime MUST create a new [container namespace](glossary.md#container-namespace) of type `type`. -Each entry has the following structure: +If a namespace type is not specified in the `namespaces` array, the container MUST inherit the [runtime namespace](glossary.md#runtime-namespace) of that type. +If a `namespaces` field contains duplicated namespaces with same `type`, the runtime MUST [generate an error](runtime.md#errors). -* **`type`** *(string, REQUIRED)* - type of device: `c`, `b`, `u` or `p`. -* **`path`** *(string, REQUIRED)* - full path to device inside container. - If a file already exists at `path` that does not match the requested device, the runtime MUST generate an error. -* **`major, minor`** *(int64, REQUIRED unless `type` is `p`)* - major, minor numbers for the device. -* **`fileMode`** *(uint32, OPTIONAL)* - file mode for the device. +### Example -The same `type`, `major` and `minor` SHOULD NOT be used for multiple devices. 
+```json +"namespaces": [ + { + "type": "pid", + "path": "/proc/1234/ns/pid" + }, + { + "type": "mount" + }, + { + "type": "ipc" + }, + { + "type": "uts" + } +] +``` diff --git a/config.md b/config.md index ffdae21ae..d642359d1 100644 --- a/config.md +++ b/config.md @@ -68,9 +68,14 @@ For Linux, the parameters are as documented in [mount(2)][mount.2] system call m For Solaris, the mount entry corresponds to the 'fs' resource in the [zonecfg(1M)][zonecfg.1m] man page. * **`destination`** (string, REQUIRED) Destination of mount point: path inside container. - This value MUST be an absolute path. - * Windows: one mount destination MUST NOT be nested within another mount (e.g., c:\\foo and c:\\foo\\bar). - * Solaris: corresponds to "dir" of the fs resource in [zonecfg(1M)][zonecfg.1m]. + * Linux: This value SHOULD be an absolute path. + For compatibility with old tools and configurations, it MAY be a relative path, in which case it MUST be interpreted as relative to "/". + Relative paths are **deprecated**. + * Windows: This value MUST be an absolute path. + One mount destination MUST NOT be nested within another mount (e.g., c:\\foo and c:\\foo\\bar). + * Solaris: This value MUST be an absolute path. + Corresponds to "dir" of the fs resource in [zonecfg(1M)][zonecfg.1m]. + * For all other platforms: This value MUST be an absolute path. * **`source`** (string, OPTIONAL) A device name, but can also be a file or directory name for bind mounts or a dummy. Path values for bind mounts are either absolute or relative to the bundle. A mount is a bind mount if it has either `bind` or `rbind` in the options. 
@@ -89,7 +94,7 @@ Runtimes MUST/SHOULD/MAY implement the following option strings for Linux: ------------------|-------------|----------------------------------------------------- `async` | MUST | [^1] `atime` | MUST | [^1] - `bind` | MUST | [^2] (bind mounts) + `bind` | MUST | Bind mount [^2] `defaults` | MUST | [^1] `dev` | MUST | [^1] `diratime` | MUST | [^1] @@ -110,9 +115,9 @@ Runtimes MUST/SHOULD/MAY implement the following option strings for Linux: `nostrictatime` | MUST | [^1] `nosuid` | MUST | [^1] `nosymfollow` | SHOULD | [^1] (Introduced in kernel 5.10, util-linux 2.38) - `private` | MUST | [^2] (bind mounts) + `private` | MUST | Bind mount propagation [^2] `ratime` | SHOULD | Recursive `atime` [^3] - `rbind` | MUST | [^2] (bind mounts) + `rbind` | MUST | Recursive bind mount [^2] `rdev` | SHOULD | Recursive `dev` [^3] `rdiratime` | SHOULD | Recursive `diratime` [^3] `relatime` | MUST | [^1] @@ -126,29 +131,31 @@ Runtimes MUST/SHOULD/MAY implement the following option strings for Linux: `rnosuid` | SHOULD | Recursive `nosuid` [^3] `rnosymfollow` | SHOULD | Recursive `nosymfollow` [^3] `ro` | MUST | [^1] - `rprivate` | MUST | [^2] (bind mounts) + `rprivate` | MUST | Bind mount propagation [^2] `rrelatime ` | SHOULD | Recursive `relatime` [^3] `rro` | SHOULD | Recursive `ro` [^3] `rrw` | SHOULD | Recursive `rw` [^3] - `rshared` | MUST | [^2] (bind mounts) - `rslave` | MUST | [^2] (bind mounts) + `rshared` | MUST | Bind mount propagation [^2] + `rslave` | MUST | Bind mount propagation [^2] `rstrictatime` | SHOULD | Recursive `strictatime` [^3] `rsuid` | SHOULD | Recursive `suid` [^3] `rsymfollow` | SHOULD | Recursive `symfollow` [^3] - `runbindable` | MUST | [^2] (bind mounts) + `runbindable` | MUST | Bind mount propagation [^2] `rw` | MUST | [^1] `shared` | MUST | [^1] `silent` | MUST | [^1] - `slave` | MUST | [^2] (bind mounts) + `slave` | MUST | Bind mount propagation [^2] `strictatime` | MUST | [^1] `suid` | MUST | [^1] `symfollow` | SHOULD | Opposite 
of `nosymfollow` `sync` | MUST | [^1] `tmpcopyup` | MAY | copy up the contents to a tmpfs - `unbindable` | MUST | [^2] (bind mounts) + `unbindable` | MUST | Bind mount propagation [^2] + `idmap` | SHOULD | Indicates that the mount MUST have an idmapping applied. This option SHOULD NOT be passed to the underlying [`mount(2)`][mount.2] call. If `uidMappings` or `gidMappings` are specified for the mount, the runtime MUST use those values for the mount's mapping. If they are not specified, the runtime MAY use the container's user namespace mapping, otherwise an [error MUST be returned](runtime.md#errors). If there are no `uidMappings` and `gidMappings` specified and the container isn't using user namespaces, an [error MUST be returned](runtime.md#errors). This SHOULD be implemented using [`mount_setattr(MOUNT_ATTR_IDMAP)`][mount_setattr.2], available since Linux 5.12. + `ridmap` | SHOULD | Indicates that the mount MUST have an idmapping applied, and the mapping is applied recursively [^3]. This option SHOULD NOT be passed to the underlying [`mount(2)`][mount.2] call. If `uidMappings` or `gidMappings` are specified for the mount, the runtime MUST use those values for the mount's mapping. If they are not specified, the runtime MAY use the container's user namespace mapping, otherwise an [error MUST be returned](runtime.md#errors). If there are no `uidMappings` and `gidMappings` specified and the container isn't using user namespaces, an [error MUST be returned](runtime.md#errors). This SHOULD be implemented using [`mount_setattr(MOUNT_ATTR_IDMAP)`][mount_setattr.2], available since Linux 5.12. [^1]: Corresponds to [`mount(8)` (filesystem-independent)][mount.8-filesystem-independent]. -[^2]: Corresponds to [`mount(8)` (filesystem-specific)][mount.8-filesystem-specific]. +[^2]: Corresponds to [bind mounts and shared subtrees][mount-bind]. [^3]: These `AT_RECURSIVE` options need kernel 5.12 or later. 
See [`mount_setattr(2)`][mount_setattr.2] The "MUST" options correspond to [`mount(8)`][mount.8]. @@ -156,7 +163,8 @@ The "MUST" options correspond to [`mount(8)`][mount.8]. Runtimes MAY also implement custom option strings that are not listed in the table above. If a custom option string is already recognized by [`mount(8)`][mount.8], the runtime SHOULD follow the behavior of [`mount(8)`][mount.8]. -Runtimes SHOULD pass unknown options to [`mount(2)`][mount.2] via the fifth argument (`const void *data`). +Runtimes SHOULD treat unknown options as [filesystem-specific ones][mount.8-filesystem-specific] +and pass those as a comma-separated string to the fifth (`const void *data`) argument of [`mount(2)`][mount.2]. ### Example (Windows) @@ -177,10 +185,16 @@ For POSIX platforms the `mounts` structure has the following fields: * **`type`** (string, OPTIONAL) The type of the filesystem to be mounted. * Linux: filesystem types supported by the kernel as listed in */proc/filesystems* (e.g., "minix", "ext2", "ext3", "jfs", "xfs", "reiserfs", "msdos", "proc", "nfs", "iso9660"). For bind mounts (when `options` include either `bind` or `rbind`), the type is a dummy, often "none" (not listed in */proc/filesystems*). * Solaris: corresponds to "type" of the fs resource in [zonecfg(1M)][zonecfg.1m]. -* **`uidMappings`** (array of type LinuxIDMapping, OPTIONAL) The mapping to convert UIDs from the source file system to the destination mount point.\ -The format is the same as [user namespace mappings](config-linux.md#user-namespace-mappings). +* **`uidMappings`** (array of type LinuxIDMapping, OPTIONAL) The mapping to convert UIDs from the source file system to the destination mount point. + This SHOULD be implemented using [`mount_setattr(MOUNT_ATTR_IDMAP)`][mount_setattr.2], available since Linux 5.12.
+ If specified, the `options` field of the `mounts` structure SHOULD contain either `idmap` or `ridmap` to specify whether the mapping should be applied recursively for `rbind` mounts, as well as to ensure that older runtimes will not silently ignore this field. + The format is the same as [user namespace mappings](config-linux.md#user-namespace-mappings). + If specified, it MUST be specified along with `gidMappings`. * **`gidMappings`** (array of type LinuxIDMapping, OPTIONAL) The mapping to convert GIDs from the source file system to the destination mount point. -For more details see `uidMappings`. + This SHOULD be implemented using [`mount_setattr(MOUNT_ATTR_IDMAP)`][mount_setattr.2], available since Linux 5.12. + If specified, the `options` field of the `mounts` structure SHOULD contain either `idmap` or `ridmap` to specify whether the mapping should be applied recursively for `rbind` mounts, as well as to ensure that older runtimes will not silently ignore this field. + For more details see `uidMappings`. + If specified, it MUST be specified along with `uidMappings`. ### Example (Linux) @@ -290,7 +304,7 @@ For Linux-based systems, the `process` object supports the following process-spe If `oomScoreAdj` is not set, the runtime MUST NOT change the value of `oom_score_adj`. This is a per-process setting, where as [`disableOOMKiller`](config-linux.md#memory) is scoped for a memory cgroup. - For more information on how these two settings work together, see [the memory cgroup documentation section 10. OOM Contol][cgroup-v1-memory_2]. + For more information on how these two settings work together, see [the memory cgroup documentation section 10. OOM Control][cgroup-v1-memory_2]. * **`scheduler`** (object, OPTIONAL) is an object describing the scheduler properties for the process. The `scheduler` contains the following properties: * **`policy`** (string, REQUIRED) represents the scheduling policy. 
A valid list of values is: @@ -326,6 +340,24 @@ For Linux-based systems, the `process` object supports the following process-spe * **`class`** (string, REQUIRED) specifies the I/O scheduling class. Possible values are `IOPRIO_CLASS_RT`, `IOPRIO_CLASS_BE`, and `IOPRIO_CLASS_IDLE`. * **`priority`** (int, REQUIRED) specifies the priority level within the class. The value should be an integer ranging from 0 (highest) to 7 (lowest). +* **`execCPUAffinity`** (object, OPTIONAL) specifies CPU affinity used to execute the process. + This setting is not applicable to the container's init process. + The following properties are available: + * **`initial`** (string, OPTIONAL) is a list of CPUs a runtime parent + process to be run on initially, before the transition to container's + cgroup. This is a comma-separated list, with dashes to represent + ranges. For example, `0-3,7` represents CPUs 0,1,2,3, and 7. + * **`final`** (string, OPTIONAL) is a list of CPUs the process will be run + on after the transition to container's cgroup. The format is the same as + for `initial`. If omitted or empty, runtime SHOULD NOT change process' + CPU affinity after the process is moved to container's cgroup, and the + final affinity is determined by the Linux kernel. + +### z/OS Process + +For z/OS-based systems, the `process` object supports the following process-specific properties. + +* **`noNewPrivileges`** (bool, OPTIONAL) setting `noNewPrivileges` to true prevents the process from gaining additional privileges.
### User @@ -402,7 +434,11 @@ _Note: symbolic name for uid and gid, such as uname and gname respectively, are "hard": 1024, "soft": 1024 } - ] + ], + "execCPUAffinity": { + "initial": "7", + "final": "0-3,7" + } } ``` ### Example (Solaris) @@ -617,7 +653,7 @@ See the below table for a summary of hooks and when they are called: | Name | Namespace | When | | ----------------------- | --------- | -----------------------------------------------------------------------------------------------------------------------------------| -| `prestart` (Deprecated) | runtime | After the start operation is called but before the user-specified program command is executed. | +| `prestart` (Deprecated) | runtime | During the create operation, after the runtime environment has been created and before the pivot root or any equivalent operation. | | `createRuntime` | runtime | During the create operation, after the runtime environment has been created and before the pivot root or any equivalent operation. | | `createContainer` | container | During the create operation, after the runtime environment has been created and before the pivot root or any equivalent operation. | | `startContainer` | container | After the start operation is called but before the user-specified program command is executed. | @@ -685,7 +721,21 @@ If there are no annotations then this property MAY either be absent or an empty Keys MUST be strings. Keys MUST NOT be an empty string. Keys SHOULD be named using a reverse domain notation - e.g. `com.example.myKey`. -Keys using the `org.opencontainers` namespace are reserved and MUST NOT be used by subsequent specifications. + +The `org.opencontainers` namespace for keys is reserved for use by this specification, annotations using keys in this namespace MUST be as described in this section. 
+The following keys in the `org.opencontainers` namespaces MAY be used: +| Key | Definition | +| --------------------------------------- | -----------------------------------------------------------------------------------------------------------------------------------| +| `org.opencontainers.image.os` | Indicates the operating system the container image was built to run on. The annotation value MUST have a valid value for the `os` property as defined in [the OCI image specification][oci-image-config-properties]. This annotation SHOULD only be used in accordance with the [OCI image specification's runtime conversion specification][oci-image-conversion]. | +| `org.opencontainers.image.os.version` | Indicates the operating system version targeted by the container image. The annotation value MUST have a valid value for the `os.version` property as defined in [the OCI image specification][oci-image-config-properties]. This annotation SHOULD only be used in accordance with the [OCI image specification's runtime conversion specification][oci-image-conversion]. | +| `org.opencontainers.image.os.features` | Indicates mandatory operating system features required by the container image. The annotation value MUST have a valid value for the `os.features` property as defined in [the OCI image specification][oci-image-config-properties]. This annotation SHOULD only be used in accordance with the [OCI image specification's runtime conversion specification][oci-image-conversion]. | +| `org.opencontainers.image.architecture` | Indicates the architecture that binaries in the container image are built to run on. The annotation value MUST have a valid value for the `architecture` property as defined in [the OCI image specification][oci-image-config-properties]. This annotation SHOULD only be used in accordance with the [OCI image specification's runtime conversion specification][oci-image-conversion]. 
| +| `org.opencontainers.image.variant` | Indicates the variant of the architecture that binaries in the container image are built to run on. The annotation value MUST have a valid value for the `variant` property as defined in [the OCI image specification][oci-image-config-properties]. This annotation SHOULD only be used in accordance with the [OCI image specification's runtime conversion specification][oci-image-conversion]. | +| `org.opencontainers.image.author` | Indicates the author of the container image. The annotation value MUST have a valid value for the `author` property as defined in [the OCI image specification][oci-image-config-properties]. This annotation SHOULD only be used in accordance with the [OCI image specification's runtime conversion specification][oci-image-conversion]. | +| `org.opencontainers.image.created` | Indicates the date and time when the container image was created. The annotation value MUST have a valid value for the `created` property as defined in [the OCI image specification][oci-image-config-properties]. This annotation SHOULD only be used in accordance with the [OCI image specification's runtime conversion specification][oci-image-conversion]. | +| `org.opencontainers.image.stopSignal` | Indicates the signal that SHOULD be sent by the container runtimes to [kill the container](runtime.md#kill). The annotation value MUST have a valid value for the `config.StopSignal` property as defined in [the OCI image specification][oci-image-config-properties]. This annotation SHOULD only be used in accordance with the [OCI image specification's runtime conversion specification][oci-image-conversion]. | + +All other keys in the `org.opencontainers` namespace not specified in the above table are reserved and MUST NOT be used by subsequent specifications. Runtimes MUST handle unknown annotation keys like any other [unknown property](#extensibility). Values MUST be strings.
@@ -1107,23 +1157,26 @@ Here is a full example `config.json` for reference. [apparmor]: https://wiki.ubuntu.com/AppArmor [cgroup-v1-memory_2]: https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt -[selinux]:http://selinuxproject.org/page/Main_Page +[selinux]:https://selinuxproject.org/page/Main_Page [no-new-privs]: https://www.kernel.org/doc/Documentation/prctl/no_new_privs.txt [proc_2]: https://www.kernel.org/doc/Documentation/filesystems/proc.txt -[umask.2]: http://pubs.opengroup.org/onlinepubs/009695399/functions/umask.html -[semver-v2.0.0]: http://semver.org/spec/v2.0.0.html -[ieee-1003.1-2008-xbd-c8.1]: http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap08.html#tag_08_01 -[ieee-1003.1-2008-functions-exec]: http://pubs.opengroup.org/onlinepubs/9699919799/functions/exec.html -[naming-a-volume]: https://aka.ms/nb3hqb - -[capabilities.7]: http://man7.org/linux/man-pages/man7/capabilities.7.html -[mount.2]: http://man7.org/linux/man-pages/man2/mount.2.html -[mount.8]: http://man7.org/linux/man-pages/man8/mount.8.html -[mount.8-filesystem-independent]: http://man7.org/linux/man-pages/man8/mount.8.html#FILESYSTEM-INDEPENDENT_MOUNT_OPTIONS -[mount.8-filesystem-specific]: http://man7.org/linux/man-pages/man8/mount.8.html#FILESYSTEM-SPECIFIC_MOUNT_OPTIONS -[mount_setattr.2]: http://man7.org/linux/man-pages/man2/mount_setattr.2.html -[getrlimit.2]: http://man7.org/linux/man-pages/man2/getrlimit.2.html -[getrlimit.3]: http://pubs.opengroup.org/onlinepubs/9699919799/functions/getrlimit.html -[stdin.3]: http://man7.org/linux/man-pages/man3/stdin.3.html -[uts-namespace.7]: http://man7.org/linux/man-pages/man7/namespaces.7.html -[zonecfg.1m]: http://docs.oracle.com/cd/E86824_01/html/E54764/zonecfg-1m.html +[umask.2]: https://pubs.opengroup.org/onlinepubs/009695399/functions/umask.html +[semver-v2.0.0]: https://semver.org/spec/v2.0.0.html +[ieee-1003.1-2008-xbd-c8.1]: https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap08.html#tag_08_01 
+[ieee-1003.1-2008-functions-exec]: https://pubs.opengroup.org/onlinepubs/9699919799/functions/exec.html +[naming-a-volume]: https://learn.microsoft.com/en-us/windows/win32/fileio/naming-a-volume +[oci-image-config-properties]: https://github.com/opencontainers/image-spec/blob/v1.1.0-rc2/config.md#properties +[oci-image-conversion]: https://github.com/opencontainers/image-spec/blob/v1.1.0-rc2/conversion.md + +[capabilities.7]: https://man7.org/linux/man-pages/man7/capabilities.7.html +[mount.2]: https://man7.org/linux/man-pages/man2/mount.2.html +[mount.8]: https://man7.org/linux/man-pages/man8/mount.8.html +[mount.8-filesystem-independent]: https://man7.org/linux/man-pages/man8/mount.8.html#FILESYSTEM-INDEPENDENT_MOUNT_OPTIONS +[mount.8-filesystem-specific]: https://man7.org/linux/man-pages/man8/mount.8.html#FILESYSTEM-SPECIFIC_MOUNT_OPTIONS +[mount_setattr.2]: https://man7.org/linux/man-pages/man2/mount_setattr.2.html +[mount-bind]: https://docs.kernel.org/filesystems/sharedsubtree.html +[getrlimit.2]: https://man7.org/linux/man-pages/man2/getrlimit.2.html +[getrlimit.3]: https://pubs.opengroup.org/onlinepubs/9699919799/functions/getrlimit.html +[stdin.3]: https://man7.org/linux/man-pages/man3/stdin.3.html +[uts-namespace.7]: https://man7.org/linux/man-pages/man7/namespaces.7.html +[zonecfg.1m]: https://docs.oracle.com/cd/E86824_01/html/E54764/zonecfg-1m.html diff --git a/features-linux.md b/features-linux.md index 452514387..3331266dd 100644 --- a/features-linux.md +++ b/features-linux.md @@ -195,17 +195,87 @@ Irrelevant to the availability of SELinux on the host operating system. } ``` +## MemoryPolicy + +**`memoryPolicy`** (object, OPTIONAL) represents the runtime's implementation status of memoryPolicy. + +* **`modes`** (array of strings, OPTIONAL). Recognized memory policies. Includes policies that may not be supported by the host operating system. 
+ The runtime MUST recognize the elements in this array as the [`mode` of `linux.memoryPolicy` objects in `config.json`](config-linux.md#memory-policy). + +* **`flags`** (array of strings, OPTIONAL). Recognized flags for memory policies. Includes flags that may not be supported by the host operating system. + The runtime MUST recognize the elements in this array in the [`flags` property of the `linux.memoryPolicy` object in `config.json`](config-linux.md#memory-policy). + +### Example + +```json +"memoryPolicy": { + "modes": [ + "MPOL_DEFAULT", + "MPOL_BIND", + "MPOL_INTERLEAVE", + "MPOL_WEIGHTED_INTERLEAVE", + "MPOL_PREFERRED", + "MPOL_PREFERRED_MANY", + "MPOL_LOCAL" + ], + "flags": [ + "MPOL_F_NUMA_BALANCING", + "MPOL_F_RELATIVE_NODES", + "MPOL_F_STATIC_NODES" + ] +} +``` + ## Intel RDT **`intelRdt`** (object, OPTIONAL) represents the runtime's implementation status of Intel RDT. Irrelevant to the availability of Intel RDT on the host operating system. * **`enabled`** (bool, OPTIONAL) represents whether the runtime supports Intel RDT. +* **`schemata`** (bool, OPTIONAL) represents whether the + [`schemata` field of `linux.intelRdt` in `config.json`](config-linux.md#intelrdt) is supported. +* **`monitoring`** (bool, OPTIONAL) represents whether the + [`enableMonitoring` field of `linux.intelRdt` in `config.json`](config-linux.md#intelrdt) is supported. ### Example ```json "intelRdt": { + "enabled": true, + "schemata": true, + "monitoring": true +} +``` + +## MountExtensions + +**`mountExtensions`** (object, OPTIONAL) represents whether the runtime supports certain mount features, irrespective of the availability of the features on the host operating system. + +* **`idmap`** (object, OPTIONAL) represents whether the runtime supports idmap mounts using the `uidMappings` and `gidMappings` properties of the mount. + * **`enabled`** (bool, OPTIONAL) represents whether the runtime parses and attempts to use the `uidMappings` and `gidMappings` properties of mounts if provided.
+ Note that it is possible for runtimes to have partial implementations of id-mapped mounts support (such as only allowing mounts which have mappings matching the container's user namespace, or only allowing the id-mapped bind-mounts). + In such cases, runtimes MUST still set this value to `true`, to indicate that the runtime recognises the `uidMappings` and `gidMappings` properties. + +### Example + +```json +"mountExtensions": { + "idmap":{ + "enabled": true + } +} +``` + +## NetDevices + +**`netDevices`** (object, OPTIONAL) represents the runtime's implementation status of Linux network devices. + +* **`enabled`** (bool, OPTIONAL) represents whether the runtime supports the capability to move Linux network devices into the container's network namespace. + +### Example + +```json +"netDevices": { "enabled": true } ``` diff --git a/features.md b/features.md index f3c8b5b3f..9c729591a 100644 --- a/features.md +++ b/features.md @@ -28,8 +28,8 @@ The `null` value MUST NOT be confused with an empty value such as `0`, `false`, ``` ## Hooks -* **`hooks`** (array of strings, OPTIONAL) The recognized names of the [hooks](config.md#hooks). - The runtime MUST support the elements in this array as the [`hooks` property of `config.json`](config.md#hooks). +* **`hooks`** (array of strings, OPTIONAL) The recognized names of the [hooks](config.md#posix-platform-hooks). + The runtime MUST support the elements in this array as the [`hooks` property of `config.json`](config.md#posix-platform-hooks). ### Example ```json @@ -140,6 +140,24 @@ The current version of the spec do not provide a way to enumerate the possible v } ``` +## Unsafe annotations in `config.json` + +**`potentiallyUnsafeConfigAnnotations`** (array of strings, OPTIONAL) contains values of [`annotations` property of `config.json`](config.md#annotations) +that may potentially change the behavior of the runtime. + +A value that ends with "." is interpreted as a prefix of annotations. 
+ +### Example +```json +"potentiallyUnsafeConfigAnnotations": [ + "com.example.foo.bar", + "org.systemd.property." +] +``` + +The example above matches `com.example.foo.bar`, `org.systemd.property.ExecStartPre`, etc. +The example does not match `com.example.foo.bar.baz`. + # Example Here is a full example for reference. @@ -336,8 +354,26 @@ Here is a full example for reference. "selinux": { "enabled": true }, + "memoryPolicy": { + "modes": [ + "MPOL_DEFAULT", + "MPOL_BIND", + "MPOL_INTERLEAVE", + "MPOL_WEIGHTED_INTERLEAVE", + "MPOL_PREFERRED", + "MPOL_PREFERRED_MANY", + "MPOL_LOCAL" + ], + "flags": [ + "MPOL_F_NUMA_BALANCING", + "MPOL_F_RELATIVE_NODES", + "MPOL_F_STATIC_NODES" + ] + }, "intelRdt": { - "enabled": true + "enabled": true, + "schemata": true, + "monitoring": true } }, "annotations": { diff --git a/glossary.md b/glossary.md index f2f8383ad..845fd130b 100644 --- a/glossary.md +++ b/glossary.md @@ -46,7 +46,7 @@ Runtime callers often execute a runtime via [runc][]-compatible command line int On Linux, the namespaces from which new [container namespaces](#container-namespace) are [created](config-linux.md#namespaces) and from which some configured resources are accessed. 
[JSON]: https://tools.ietf.org/html/rfc8259 -[UTF-8]: http://www.unicode.org/versions/Unicode8.0.0/ch03.pdf +[UTF-8]: https://www.unicode.org/versions/Unicode8.0.0/ch03.pdf [runc]: https://github.com/opencontainers/runc -[namespaces.7]: http://man7.org/linux/man-pages/man7/namespaces.7.html +[namespaces.7]: https://man7.org/linux/man-pages/man7/namespaces.7.html diff --git a/implementations.md b/implementations.md index 8e0037b9b..51af41cb2 100644 --- a/implementations.md +++ b/implementations.md @@ -10,6 +10,7 @@ If you know of any associated projects that are not listed here, please file a p * [containers/youki][youki] - Runtime implementation in Rust * [opencontainers/runc][runc] - Reference implementation of OCI runtime * [projectatomic/bwrap-oci][bwrap-oci] - Convert the OCI spec file to a command line for [bubblewrap][bubblewrap] +* [systemd/systemd][systemd] - Contains [systemd-nspawn][nspawn], runtime implementation in C (via `--oci-bundle` option since systemd v242) ## Runtime (Virtual Machine) @@ -30,11 +31,13 @@ If you know of any associated projects that are not listed here, please file a p [crun]: https://github.com/containers/crun [gvisor]: https://github.com/google/gvisor [kata-runtime]: https://github.com/kata-containers/runtime +[nspawn]: https://www.freedesktop.org/software/systemd/man/latest/systemd-nspawn.html [oct]: https://github.com/huawei-openlab/oct [octool]: https://github.com/kunalkushwaha/octool [runc]: https://github.com/opencontainers/runc [rune]: https://github.com/alibaba/inclavare-containers [runtime-tools]: https://github.com/opencontainers/runtime-tools [runv]: https://github.com/hyperhq/runv +[systemd]: https://github.com/systemd/systemd [virtcontainers]: https://github.com/containers/virtcontainers -[youki]: https://github.com/containers/youki \ No newline at end of file +[youki]: https://github.com/containers/youki diff --git a/principles.md b/principles.md index 6c7696302..a204f905e 100644 --- a/principles.md +++ 
b/principles.md @@ -43,4 +43,4 @@ The process was slow, inefficient and cost a fortune - and was entirely differen Standard Containers make INDUSTRIAL-GRADE DELIVERY of software a reality. Leveraging all of the properties listed above, Standard Containers are enabling large and small enterprises to streamline and automate their software delivery pipelines. -Whether it is in-house devOps flows, or external customer-based software delivery mechanisms, Standard Containers are changing the way the community thinks about software packaging and delivery. +Whether it is in-house DevOps flows, or external customer-based software delivery mechanisms, Standard Containers are changing the way the community thinks about software packaging and delivery. diff --git a/runtime-linux.md b/runtime-linux.md index 16c6dbebb..99c3b992a 100644 --- a/runtime-linux.md +++ b/runtime-linux.md @@ -18,4 +18,4 @@ While creating the container (step 2 in the [lifecycle](runtime.md#lifecycle)), | /proc/self/fd/2 | /dev/stderr | -[socket-activated-containers]: http://0pointer.de/blog/projects/socket-activated-containers.html +[socket-activated-containers]: https://0pointer.de/blog/projects/socket-activated-containers.html diff --git a/runtime.md b/runtime.md index d3aaa9b14..3e111c63b 100644 --- a/runtime.md +++ b/runtime.md @@ -70,7 +70,7 @@ The lifecycle describes the timeline of events that happen from when a container If any `startContainer` hook fails, the runtime MUST [generate an error](#errors), stop the container, and continue the lifecycle at step 12. 8. The runtime MUST run the user-specified program, as specified by [`process`](config.md#process). 9. The [`poststart` hooks](config.md#poststart) MUST be invoked by the runtime. - If any `poststart` hook fails, the runtime MUST [log a warning](#warnings), but the remaining hooks and lifecycle continue as if the hook had succeeded. 
+ If any `poststart` hook fails, the runtime MUST [generate an error](#errors), stop the container, and continue the lifecycle at step 12. 10. The container process exits. This MAY happen due to erroring out, exiting, crashing or the runtime's [`kill`](runtime.md#kill) operation being invoked. 11. Runtime's [`delete`](runtime.md#delete) command is invoked with the unique identifier of the container. diff --git a/schema/README.md b/schema/README.md index f8f7fb739..5ae1df5a4 100644 --- a/schema/README.md +++ b/schema/README.md @@ -2,7 +2,7 @@ ## Overview -This directory contains the [JSON Schema](http://json-schema.org/) for validating JSON covered by this specification. +This directory contains the [JSON Schema](https://json-schema.org) for validating JSON covered by this specification. The layout of the files is as follows: diff --git a/schema/config-linux.json b/schema/config-linux.json index 942679964..778561d89 100644 --- a/schema/config-linux.json +++ b/schema/config-linux.json @@ -9,6 +9,12 @@ "$ref": "defs-linux.json#/definitions/Device" } }, + "netDevices": { + "type": "object", + "additionalProperties": { + "$ref": "defs-linux.json#/definitions/NetDevice" + } + }, "uidMappings": { "type": "array", "items": { @@ -262,6 +268,9 @@ "closID": { "type": "string" }, + "schemata": { + "$ref": "defs.json#/definitions/ArrayOfStrings" + }, "l3CacheSchema": { "type": "string" }, @@ -269,11 +278,25 @@ "type": "string", "pattern": "^MB:[^\\n]*$" }, - "enableCMT": { + "enableMonitoring": { "type": "boolean" + } + } + }, + "memoryPolicy": { + "type": "object", + "properties": { + "mode": { + "$ref": "defs-linux.json#/definitions/MemoryPolicyMode" }, - "enableMBM": { - "type": "boolean" + "nodes": { + "type": "string" + }, + "flags": { + "type": "array", + "items": { + "$ref": "defs-linux.json#/definitions/MemoryPolicyFlag" + } } } }, diff --git a/schema/config-schema.json b/schema/config-schema.json index 8f2bff772..5124def5f 100644 --- a/schema/config-schema.json +++ 
b/schema/config-schema.json @@ -181,11 +181,11 @@ "priority": { "$ref": "defs.json#/definitions/int32" }, - "flags": { - "type": "array", - "items": { - "$ref": "defs-linux.json#/definitions/SchedulerFlag" - } + "flags": { + "type": "array", + "items": { + "$ref": "defs-linux.json#/definitions/SchedulerFlag" + } }, "runtime": { "$ref": "defs.json#/definitions/uint64" @@ -220,6 +220,19 @@ } } } + }, + "execCPUAffinity": { + "type": "object", + "properties": { + "initial": { + "type": "string", + "pattern": "^[0-9, -]*$" + }, + "final": { + "type": "string", + "pattern": "^[0-9, -]*$" + } + } } } }, diff --git a/schema/config-windows.json b/schema/config-windows.json index 68b51e902..7cc1594f0 100644 --- a/schema/config-windows.json +++ b/schema/config-windows.json @@ -38,6 +38,17 @@ }, "maximum": { "$ref": "defs.json#/definitions/uint16" + }, + "affinity": { + "type": "object", + "properties": { + "mask": { + "$ref": "defs.json#/definitions/uint64" + }, + "group": { + "$ref": "defs.json#/definitions/uint32" + } + } } } }, diff --git a/schema/config-zos.json b/schema/config-zos.json index 971056923..13cabfca3 100644 --- a/schema/config-zos.json +++ b/schema/config-zos.json @@ -3,10 +3,14 @@ "description": "z/OS platform-specific configurations", "type": "object", "properties": { - "devices": { + "namespaces": { "type": "array", "items": { - "$ref": "defs-zos.json#/definitions/Device" + "anyOf": [ + { + "$ref": "defs-zos.json#/definitions/NamespaceReference" + } + ] } } } diff --git a/schema/defs-linux.json b/schema/defs-linux.json index ce43ecf96..ec34445e0 100644 --- a/schema/defs-linux.json +++ b/schema/defs-linux.json @@ -35,6 +35,8 @@ "SCMP_ARCH_X32", "SCMP_ARCH_ARM", "SCMP_ARCH_AARCH64", + "SCMP_ARCH_LOONGARCH64", + "SCMP_ARCH_M68K", "SCMP_ARCH_MIPS", "SCMP_ARCH_MIPS64", "SCMP_ARCH_MIPS64N32", @@ -46,6 +48,8 @@ "SCMP_ARCH_PPC64LE", "SCMP_ARCH_S390", "SCMP_ARCH_S390X", + "SCMP_ARCH_SH", + "SCMP_ARCH_SHEB", "SCMP_ARCH_PARISC", "SCMP_ARCH_PARISC64", 
"SCMP_ARCH_RISCV64" @@ -185,6 +189,14 @@ } } }, + "NetDevice": { + "type": "object", + "properties": { + "name": { + "type": "string" + } + } + }, "weight": { "$ref": "defs.json#/definitions/uint16" }, @@ -260,6 +272,26 @@ "allow" ] }, + "MemoryPolicyMode": { + "type": "string", + "enum": [ + "MPOL_DEFAULT", + "MPOL_BIND", + "MPOL_INTERLEAVE", + "MPOL_WEIGHTED_INTERLEAVE", + "MPOL_PREFERRED", + "MPOL_PREFERRED_MANY", + "MPOL_LOCAL" + ] + }, + "MemoryPolicyFlag": { + "type": "string", + "enum": [ + "MPOL_F_NUMA_BALANCING", + "MPOL_F_RELATIVE_NODES", + "MPOL_F_STATIC_NODES" + ] + }, "NetworkInterfacePriority": { "type": "object", "properties": { diff --git a/schema/defs-zos.json b/schema/defs-zos.json index f0deee9c1..e15e281af 100644 --- a/schema/defs-zos.json +++ b/schema/defs-zos.json @@ -1,55 +1,27 @@ { "definitions": { - "Major": { - "description": "major device number", - "$ref": "defs.json#/definitions/int64" - }, - "Minor": { - "description": "minor device number", - "$ref": "defs.json#/definitions/int64" - }, - "FileMode": { - "description": "File permissions mode (typically an octal value)", - "type": "integer", - "minimum": 0, - "maximum": 512 - }, - "FileType": { - "description": "Type of a block or special character device", + "NamespaceType": { "type": "string", - "pattern": "^[cbup]$" + "enum": [ + "mount", + "pid", + "uts", + "ipc" + ] }, - "Device": { + "NamespaceReference": { "type": "object", - "required": [ - "type", - "path", - "major", - "minor" - ], "properties": { - "path": { - "$ref": "defs.json#/definitions/FilePath" - }, "type": { - "$ref": "#/definitions/FileType" + "$ref": "#/definitions/NamespaceType" }, - "major": { - "$ref": "#/definitions/Major" - }, - "minor": { - "$ref": "#/definitions/Minor" - }, - "fileMode": { - "$ref": "#/definitions/FileMode" - }, - "uid": { - "$ref": "defs.json#/definitions/UID" - }, - "gid": { - "$ref": "defs.json#/definitions/GID" + "path": { + "$ref": "defs.json#/definitions/FilePath" } - } + }, + 
"required": [ + "type" + ] } } } diff --git a/schema/features-linux.json b/schema/features-linux.json index 723ee67b8..fcf3df7d6 100644 --- a/schema/features-linux.json +++ b/schema/features-linux.json @@ -97,6 +97,27 @@ "type": "boolean" } } + }, + "mountExtensions": { + "type": "object", + "properties": { + "idmap": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + } + } + } + } + }, + "netDevices": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + } + } } } } diff --git a/schema/features-schema.json b/schema/features-schema.json index 30246fa5b..3ae17a7a5 100644 --- a/schema/features-schema.json +++ b/schema/features-schema.json @@ -18,7 +18,10 @@ "annotations": { "$ref": "defs.json#/definitions/annotations" }, - "linux": { + "potentiallyUnsafeConfigAnnotations": { + "$ref": "defs.json#/definitions/ArrayOfStrings" + }, + "linux": { "$ref": "features-linux.json#/linux" } }, diff --git a/schema/test/config/bad/linux-netdevice.json b/schema/test/config/bad/linux-netdevice.json new file mode 100644 index 000000000..618d88432 --- /dev/null +++ b/schema/test/config/bad/linux-netdevice.json @@ -0,0 +1,13 @@ +{ + "ociVersion": "1.0.0", + "root": { + "path": "rootfs" + }, + "linux": { + "netDevices": { + "eth0": { + "name": 23 + } + } + } +} diff --git a/schema/test/config/good/linux-netdevice.json b/schema/test/config/good/linux-netdevice.json new file mode 100644 index 000000000..cec4d09aa --- /dev/null +++ b/schema/test/config/good/linux-netdevice.json @@ -0,0 +1,15 @@ +{ + "ociVersion": "1.0.0", + "root": { + "path": "rootfs" + }, + "linux": { + "netDevices": { + "eth0": { + "name": "container_eth0" + }, + "ens4": {}, + "ens5": {} + } + } +} diff --git a/schema/test/config/good/spec-example.json b/schema/test/config/good/spec-example.json index fe56e54c8..5b9ad01dc 100644 --- a/schema/test/config/good/spec-example.json +++ b/schema/test/config/good/spec-example.json @@ -159,8 +159,14 @@ "createRuntime": [ { 
"path": "/usr/bin/fix-mounts", - "args": ["fix-mounts", "arg1", "arg2"], - "env": [ "key1=value1"] + "args": [ + "fix-mounts", + "arg1", + "arg2" + ], + "env": [ + "key1=value1" + ] }, { "path": "/usr/bin/setup-network" @@ -169,8 +175,14 @@ "createContainer": [ { "path": "/usr/bin/mount-hook", - "args": ["-mount", "arg1", "arg2"], - "env": [ "key1=value1"] + "args": [ + "-mount", + "arg1", + "arg2" + ], + "env": [ + "key1=value1" + ] } ], "startContainer": [ diff --git a/schema/test/config/good/zos-example.json b/schema/test/config/good/zos-example.json new file mode 100644 index 000000000..cb9cfca61 --- /dev/null +++ b/schema/test/config/good/zos-example.json @@ -0,0 +1,138 @@ +{ + "ociVersion": "0.5.0-dev", + "process": { + "terminal": true, + "user": { + "uid": 1, + "gid": 1, + "additionalGids": [ + 5, + 6 + ] + }, + "args": [ + "sh" + ], + "env": [ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/bin", + "TERM=xterm" + ], + "cwd": "/", + "rlimits": [ + { + "type": "RLIMIT_NOFILE", + "hard": 1024, + "soft": 1024 + } + ], + "noNewPrivileges": true + }, + "root": { + "path": "rootfs" + }, + "hostname": "slartibartfast", + "mounts": [ + { + "destination": "/proc", + "type": "proc", + "source": "proc" + }, + { + "destination": "/dev", + "type": "tfs", + "source": "tmpfs", + "options": [ + "nosuid", + "-p 1755", + "-s 64" + ] + } + ], + "hooks": { + "prestart": [ + { + "path": "/usr/bin/fix-mounts", + "args": [ + "fix-mounts", + "arg1", + "arg2" + ], + "env": [ + "key1=value1" + ] + }, + { + "path": "/usr/bin/setup-network" + } + ], + "createRuntime": [ + { + "path": "/usr/bin/fix-mounts", + "args": [ + "fix-mounts", + "arg1", + "arg2" + ], + "env": [ + "key1=value1" + ] + }, + { + "path": "/usr/bin/setup-network" + } + ], + "createContainer": [ + { + "path": "/usr/bin/mount-hook", + "args": [ + "-mount", + "arg1", + "arg2" + ], + "env": [ + "key1=value1" + ] + } + ], + "startContainer": [ + { + "path": "/usr/bin/refresh-ldcache" + } + ], + "poststart": [ + 
{ + "path": "/usr/bin/notify-start", + "timeout": 5 + } + ], + "poststop": [ + { + "path": "/usr/sbin/cleanup.sh", + "args": [ + "cleanup.sh", + "-f" + ] + } + ] + }, + "zos": { + "namespaces": [ + { + "type": "pid" + }, + { + "type": "ipc" + }, + { + "type": "uts" + }, + { + "type": "mount" + } + ] + }, + "annotations": { + "com.example.key1": "value1", + "com.example.key2": "value2" + } +} diff --git a/schema/test/config/good/zos-minimal.json b/schema/test/config/good/zos-minimal.json index 94d22372a..4647eb1ab 100644 --- a/schema/test/config/good/zos-minimal.json +++ b/schema/test/config/good/zos-minimal.json @@ -3,6 +3,5 @@ "root": { "path": "rootfs" }, - "zos": { - } + "zos": {} } diff --git a/schema/test/features/good/runc.json b/schema/test/features/good/runc.json index 43940a701..fa6de7f97 100644 --- a/schema/test/features/good/runc.json +++ b/schema/test/features/good/runc.json @@ -171,10 +171,10 @@ "SCMP_ARCH_X86_64" ], "knownFlags": [ - "SECCOMP_FILTER_FLAG_LOG" + "SECCOMP_FILTER_FLAG_LOG" ], "supportedFlags": [ - "SECCOMP_FILTER_FLAG_LOG" + "SECCOMP_FILTER_FLAG_LOG" ] }, "apparmor": { @@ -182,6 +182,9 @@ }, "selinux": { "enabled": true + }, + "netDevices": { + "enabled": true } }, "annotations": { diff --git a/spec.md b/spec.md index 709702532..0eb1fa877 100644 --- a/spec.md +++ b/spec.md @@ -47,6 +47,6 @@ An implementation is not compliant for a given CPU architecture if it fails to s An implementation is compliant for a given CPU architecture if it satisfies all the MUST, REQUIRED, and SHALL requirements for the [platforms](#platforms) it implements. 
-[c99-unspecified]: http://www.open-std.org/jtc1/sc22/wg14/www/C99RationaleV5.10.pdf#page=18 -[oci]: http://www.opencontainers.org +[c99-unspecified]: https://www.open-std.org/jtc1/sc22/wg14/www/C99RationaleV5.10.pdf#page=18 +[oci]: https://opencontainers.org [rfc2119]: https://www.rfc-editor.org/rfc/rfc2119.html diff --git a/specs-go/config.go b/specs-go/config.go index 4e7717d53..cab0fd8db 100644 --- a/specs-go/config.go +++ b/specs-go/config.go @@ -83,7 +83,7 @@ type Process struct { // Rlimits specifies rlimit options to apply to the process. Rlimits []POSIXRlimit `json:"rlimits,omitempty" platform:"linux,solaris,zos"` // NoNewPrivileges controls whether additional privileges could be gained by processes in the container. - NoNewPrivileges bool `json:"noNewPrivileges,omitempty" platform:"linux"` + NoNewPrivileges bool `json:"noNewPrivileges,omitempty" platform:"linux,zos"` // ApparmorProfile specifies the apparmor profile for the container. ApparmorProfile string `json:"apparmorProfile,omitempty" platform:"linux"` // Specify an oom_score_adj for the container. @@ -94,10 +94,12 @@ type Process struct { SelinuxLabel string `json:"selinuxLabel,omitempty" platform:"linux"` // IOPriority contains the I/O priority settings for the cgroup. IOPriority *LinuxIOPriority `json:"ioPriority,omitempty" platform:"linux"` + // ExecCPUAffinity specifies CPU affinity for exec processes. + ExecCPUAffinity *CPUAffinity `json:"execCPUAffinity,omitempty" platform:"linux"` } // LinuxCapabilities specifies the list of allowed capabilities that are kept for a process. -// http://man7.org/linux/man-pages/man7/capabilities.7.html +// https://man7.org/linux/man-pages/man7/capabilities.7.html type LinuxCapabilities struct { // Bounding is the set of capabilities checked by the kernel. Bounding []string `json:"bounding,omitempty" platform:"linux"` @@ -127,6 +129,12 @@ const ( IOPRIO_CLASS_IDLE IOPriorityClass = "IOPRIO_CLASS_IDLE" ) +// CPUAffinity specifies process' CPU affinity. 
+type CPUAffinity struct { + Initial string `json:"initial,omitempty"` + Final string `json:"final,omitempty"` +} + // Box specifies dimensions of a rectangle. Used for specifying the size of a console. type Box struct { // Height is the vertical dimension of a box. @@ -187,6 +195,10 @@ type Hook struct { type Hooks struct { // Prestart is Deprecated. Prestart is a list of hooks to be run before the container process is executed. // It is called in the Runtime Namespace + // + // Deprecated: use [Hooks.CreateRuntime], [Hooks.CreateContainer], and + // [Hooks.StartContainer] instead, which allow more granular hook control + // during the create and start phase. Prestart []Hook `json:"prestart,omitempty"` // CreateRuntime is a list of hooks to be run after the container has been created but before pivot_root or any equivalent operation has been called // It is called in the Runtime Namespace @@ -224,6 +236,8 @@ type Linux struct { Namespaces []LinuxNamespace `json:"namespaces,omitempty"` // Devices are a list of device nodes that are created for the container Devices []LinuxDevice `json:"devices,omitempty"` + // NetDevices are key-value pairs, keyed by network device name on the host, moved to the container's network namespace. + NetDevices map[string]LinuxNetDevice `json:"netDevices,omitempty"` // Seccomp specifies the seccomp security settings for the container. Seccomp *LinuxSeccomp `json:"seccomp,omitempty"` // RootfsPropagation is the rootfs mount propagation mode for the container. @@ -237,6 +251,8 @@ type Linux struct { // IntelRdt contains Intel Resource Director Technology (RDT) information for // handling resource constraints and monitoring metrics (e.g., L3 cache, memory bandwidth) for the container IntelRdt *LinuxIntelRdt `json:"intelRdt,omitempty"` + // MemoryPolicy contains NUMA memory policy for the container. 
+ MemoryPolicy *LinuxMemoryPolicy `json:"memoryPolicy,omitempty"` // Personality contains configuration for the Linux personality syscall Personality *LinuxPersonality `json:"personality,omitempty"` // TimeOffsets specifies the offset for supporting time namespaces. @@ -371,6 +387,12 @@ type LinuxMemory struct { // Total memory limit (memory + swap). Swap *int64 `json:"swap,omitempty"` // Kernel memory limit (in bytes). + // + // Deprecated: kernel-memory limits are not supported in cgroups v2, and + // were obsoleted in [kernel v5.4]. This field should no longer be used, + // as it may be ignored by runtimes. + // + // [kernel v5.4]: https://github.com/torvalds/linux/commit/0158115f702b0ba208ab0 Kernel *int64 `json:"kernel,omitempty"` // Kernel memory limit for tcp (in bytes) KernelTCP *int64 `json:"kernelTCP,omitempty"` @@ -412,7 +434,7 @@ type LinuxCPU struct { // LinuxPids for Linux cgroup 'pids' resource management (Linux 4.3) type LinuxPids struct { // Maximum number of PIDs. Default is "no limit". - Limit int64 `json:"limit"` + Limit *int64 `json:"limit,omitempty"` } // LinuxNetwork identification and priority configuration @@ -473,6 +495,12 @@ type LinuxDevice struct { GID *uint32 `json:"gid,omitempty"` } +// LinuxNetDevice represents a single network device to be added to the container's network namespace +type LinuxNetDevice struct { + // Name of the device in the container namespace + Name string `json:"name,omitempty"` +} + // LinuxDeviceCgroup represents a device rule for the devices specified to // the device controller type LinuxDeviceCgroup struct { @@ -617,6 +645,17 @@ type WindowsCPUResources struct { // cycles per 10,000 cycles. Set processor `maximum` to a percentage times // 100. Maximum *uint16 `json:"maximum,omitempty"` + // Set of CPUs to affinitize for this container. 
+ Affinity []WindowsCPUGroupAffinity `json:"affinity,omitempty"` +} + +// Similar to _GROUP_AFFINITY struct defined in +// https://learn.microsoft.com/en-us/windows-hardware/drivers/ddi/miniport/ns-miniport-_group_affinity +type WindowsCPUGroupAffinity struct { + // CPU mask relative to this CPU group. + Mask uint64 `json:"mask,omitempty"` + // Processor group the mask refers to, as returned by GetLogicalProcessorInformationEx. + Group uint32 `json:"group,omitempty"` } // WindowsStorageResources contains storage resource management settings. @@ -741,6 +780,10 @@ const ( ArchPARISC Arch = "SCMP_ARCH_PARISC" ArchPARISC64 Arch = "SCMP_ARCH_PARISC64" ArchRISCV64 Arch = "SCMP_ARCH_RISCV64" + ArchLOONGARCH64 Arch = "SCMP_ARCH_LOONGARCH64" + ArchM68K Arch = "SCMP_ARCH_M68K" + ArchSH Arch = "SCMP_ARCH_SH" + ArchSHEB Arch = "SCMP_ARCH_SHEB" ) // LinuxSeccompAction taken upon Seccomp rule match @@ -795,49 +838,92 @@ type LinuxSyscall struct { type LinuxIntelRdt struct { // The identity for RDT Class of Service ClosID string `json:"closID,omitempty"` + + // Schemata specifies the complete schemata to be written as is to the + // schemata file in resctrl fs. Each element represents a single line in the schemata file. + // NOTE: This will overwrite schemas specified in the L3CacheSchema and/or + // MemBwSchema fields. + Schemata []string `json:"schemata,omitempty"` + // The schema for L3 cache id and capacity bitmask (CBM) // Format: "L3:=;=;..." + // NOTE: Should not be specified if Schemata is non-empty. L3CacheSchema string `json:"l3CacheSchema,omitempty"` // The schema of memory bandwidth per L3 cache id // Format: "MB:=bandwidth0;=bandwidth1;..." // The unit of memory bandwidth is specified in "percentages" by // default, and in "MBps" if MBA Software Controller is enabled. + // NOTE: Should not be specified if Schemata is non-empty. MemBwSchema string `json:"memBwSchema,omitempty"` - // EnableCMT is the flag to indicate if the Intel RDT CMT is enabled. 
CMT (Cache Monitoring Technology) supports monitoring of - // the last-level cache (LLC) occupancy for the container. - EnableCMT bool `json:"enableCMT,omitempty"` + // EnableMonitoring enables resctrl monitoring for the container. This will + // create a dedicated resctrl monitoring group for the container. + EnableMonitoring bool `json:"enableMonitoring,omitempty"` +} + +// LinuxMemoryPolicy represents input for the set_mempolicy syscall. +type LinuxMemoryPolicy struct { + // Mode for the set_mempolicy syscall. + Mode MemoryPolicyModeType `json:"mode"` + + // Nodes representing the nodemask for the set_mempolicy syscall in comma separated ranges format. + // Format: "-,,-,..." + Nodes string `json:"nodes"` - // EnableMBM is the flag to indicate if the Intel RDT MBM is enabled. MBM (Memory Bandwidth Monitoring) supports monitoring of - // total and local memory bandwidth for the container. - EnableMBM bool `json:"enableMBM,omitempty"` + // Flags for the set_mempolicy syscall. + Flags []MemoryPolicyFlagType `json:"flags,omitempty"` } // ZOS contains platform-specific configuration for z/OS based containers. type ZOS struct { - // Devices are a list of device nodes that are created for the container - Devices []ZOSDevice `json:"devices,omitempty"` + // Namespaces contains the namespaces that are created and/or joined by the container + Namespaces []ZOSNamespace `json:"namespaces,omitempty"` } -// ZOSDevice represents the mknod information for a z/OS special device file -type ZOSDevice struct { - // Path to the device. - Path string `json:"path"` - // Device type, block, char, etc. - Type string `json:"type"` - // Major is the device's major number. - Major int64 `json:"major"` - // Minor is the device's minor number. - Minor int64 `json:"minor"` - // FileMode permission bits for the device. - FileMode *os.FileMode `json:"fileMode,omitempty"` - // UID of the device. - UID *uint32 `json:"uid,omitempty"` - // Gid of the device. 
- GID *uint32 `json:"gid,omitempty"` +// ZOSNamespace is the configuration for a z/OS namespace +type ZOSNamespace struct { + // Type is the type of namespace + Type ZOSNamespaceType `json:"type"` + // Path is a path to an existing namespace persisted on disk that can be joined + // and is of the same type + Path string `json:"path,omitempty"` } +// ZOSNamespaceType is one of the z/OS namespaces +type ZOSNamespaceType string + +const ( + // ZOSPIDNamespace for isolating process IDs + ZOSPIDNamespace ZOSNamespaceType = "pid" + // ZOSMountNamespace for isolating mount points + ZOSMountNamespace ZOSNamespaceType = "mount" + // ZOSIPCNamespace for isolating System V IPC, POSIX message queues + ZOSIPCNamespace ZOSNamespaceType = "ipc" + // ZOSUTSNamespace for isolating hostname and NIS domain name + ZOSUTSNamespace ZOSNamespaceType = "uts" +) + +type MemoryPolicyModeType string + +const ( + MpolDefault MemoryPolicyModeType = "MPOL_DEFAULT" + MpolBind MemoryPolicyModeType = "MPOL_BIND" + MpolInterleave MemoryPolicyModeType = "MPOL_INTERLEAVE" + MpolWeightedInterleave MemoryPolicyModeType = "MPOL_WEIGHTED_INTERLEAVE" + MpolPreferred MemoryPolicyModeType = "MPOL_PREFERRED" + MpolPreferredMany MemoryPolicyModeType = "MPOL_PREFERRED_MANY" + MpolLocal MemoryPolicyModeType = "MPOL_LOCAL" +) + +type MemoryPolicyFlagType string + +const ( + MpolFNumaBalancing MemoryPolicyFlagType = "MPOL_F_NUMA_BALANCING" + MpolFRelativeNodes MemoryPolicyFlagType = "MPOL_F_RELATIVE_NODES" + MpolFStaticNodes MemoryPolicyFlagType = "MPOL_F_STATIC_NODES" +) + // LinuxSchedulerPolicy represents different scheduling policies used with the Linux Scheduler type LinuxSchedulerPolicy string diff --git a/specs-go/features/features.go b/specs-go/features/features.go index 230e88f56..7b4c40640 100644 --- a/specs-go/features/features.go +++ b/specs-go/features/features.go @@ -24,6 +24,12 @@ type Features struct { // Annotations contains implementation-specific annotation strings, // such as the implementation version, 
and third-party extensions. Annotations map[string]string `json:"annotations,omitempty"` + + // PotentiallyUnsafeConfigAnnotations is the list of potentially unsafe annotations + // that may appear in `config.json`. + // + // A value that ends with "." is interpreted as a prefix of annotations. + PotentiallyUnsafeConfigAnnotations []string `json:"potentiallyUnsafeConfigAnnotations,omitempty"` } // Linux is specific to Linux. @@ -36,11 +42,14 @@ type Linux struct { // Nil value means "unknown", not "no support for any capability". Capabilities []string `json:"capabilities,omitempty"` - Cgroup *Cgroup `json:"cgroup,omitempty"` - Seccomp *Seccomp `json:"seccomp,omitempty"` - Apparmor *Apparmor `json:"apparmor,omitempty"` - Selinux *Selinux `json:"selinux,omitempty"` - IntelRdt *IntelRdt `json:"intelRdt,omitempty"` + Cgroup *Cgroup `json:"cgroup,omitempty"` + Seccomp *Seccomp `json:"seccomp,omitempty"` + Apparmor *Apparmor `json:"apparmor,omitempty"` + Selinux *Selinux `json:"selinux,omitempty"` + IntelRdt *IntelRdt `json:"intelRdt,omitempty"` + MemoryPolicy *MemoryPolicy `json:"memoryPolicy,omitempty"` + MountExtensions *MountExtensions `json:"mountExtensions,omitempty"` + NetDevices *NetDevices `json:"netDevices,omitempty"` } // Cgroup represents the "cgroup" field. @@ -122,4 +131,39 @@ type IntelRdt struct { // Unrelated to whether the host supports Intel RDT or not. // Nil value means "unknown", not "false". Enabled *bool `json:"enabled,omitempty"` + // Schemata is true if the "linux.intelRdt.schemata" field of the + // spec is implemented. + Schemata *bool `json:"schemata,omitempty"` + // Monitoring is true if the "linux.intelRdt.enableMonitoring" field of the + // spec is implemented. + // Nil value means "unknown", not "false". + Monitoring *bool `json:"monitoring,omitempty"` +} + +// MemoryPolicy represents the "memoryPolicy" field. +type MemoryPolicy struct { + // modes is the list of known memory policy modes, e.g., "MPOL_INTERLEAVE".
+ Modes []string `json:"modes,omitempty"` + // flags is the list of known memory policy mode flags, e.g., "MPOL_F_STATIC_NODES". + Flags []string `json:"flags,omitempty"` +} + +// MountExtensions represents the "mountExtensions" field. +type MountExtensions struct { + // IDMap represents the status of idmap mounts support. + IDMap *IDMap `json:"idmap,omitempty"` +} + +type IDMap struct { + // Enabled represents whether idmap mounts support is compiled in. + // Unrelated to whether the host supports it or not. + // Nil value means "unknown", not "false". + Enabled *bool `json:"enabled,omitempty"` +} + +// NetDevices represents the "netDevices" field. +type NetDevices struct { + // Enabled is true if network devices support is compiled in. + // Nil value means "unknown", not "false". + Enabled *bool `json:"enabled,omitempty"` } diff --git a/specs-go/version.go b/specs-go/version.go index b3fca349c..b0a00466b 100644 --- a/specs-go/version.go +++ b/specs-go/version.go @@ -6,12 +6,12 @@ const ( // VersionMajor is for an API incompatible changes VersionMajor = 1 // VersionMinor is for functionality in a backwards-compatible manner - VersionMinor = 1 + VersionMinor = 2 // VersionPatch is for backwards-compatible bug fixes - VersionPatch = 0 + VersionPatch = 1 // VersionDev indicates development branch. Releases will be empty string. - VersionDev = "" + VersionDev = "+dev" ) // Version is the specification version that the package types support.