diff --git a/go.mod b/go.mod index dd18d1fa7a7..5a16982dd6b 100644 --- a/go.mod +++ b/go.mod @@ -12,7 +12,7 @@ require ( github.com/godbus/dbus/v5 v5.1.0 github.com/moby/sys/mountinfo v0.6.2 github.com/mrunalp/fileutils v0.5.0 - github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417 + github.com/opencontainers/runtime-spec v1.0.3-0.20220718201635-a8106e99982b github.com/opencontainers/selinux v1.10.1 github.com/seccomp/libseccomp-golang v0.10.0 github.com/sirupsen/logrus v1.8.1 diff --git a/go.sum b/go.sum index 3c81d97bcf9..4f2794ae064 100644 --- a/go.sum +++ b/go.sum @@ -36,8 +36,8 @@ github.com/moby/sys/mountinfo v0.6.2 h1:BzJjoreD5BMFNmD9Rus6gdd1pLuecOFPt8wC+Vyg github.com/moby/sys/mountinfo v0.6.2/go.mod h1:IJb6JQeOklcdMU9F5xQ8ZALD+CUr5VlGpwtX+VE0rpI= github.com/mrunalp/fileutils v0.5.0 h1:NKzVxiH7eSk+OQ4M+ZYW1K6h27RUV3MI6NUTsHhU6Z4= github.com/mrunalp/fileutils v0.5.0/go.mod h1:M1WthSahJixYnrXQl/DFQuteStB1weuxD2QJNHXfbSQ= -github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417 h1:3snG66yBm59tKhhSPQrQ/0bCrv1LQbKt40LnUPiUxdc= -github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= +github.com/opencontainers/runtime-spec v1.0.3-0.20220718201635-a8106e99982b h1:udwtfS44rxYE/ViMLchHQBjfE60GZSB1arY7BFbyxLs= +github.com/opencontainers/runtime-spec v1.0.3-0.20220718201635-a8106e99982b/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/opencontainers/selinux v1.10.1 h1:09LIPVRP3uuZGQvgR+SgMSNBd1Eb3vlRbGqQpoHsF8w= github.com/opencontainers/selinux v1.10.1/go.mod h1:2i0OySw99QjzBBQByd1Gr9gSjvuho1lHsJxIJ3gGbJI= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= diff --git a/libcontainer/configs/config.go b/libcontainer/configs/config.go index 7cf2fb65751..18955cf95c9 100644 --- a/libcontainer/configs/config.go +++ b/libcontainer/configs/config.go @@ -31,12 +31,13 @@ type IDMap struct { // for syscalls. Additional architectures can be added by specifying them in // Architectures. type Seccomp struct { - DefaultAction Action `json:"default_action"` - Architectures []string `json:"architectures"` - Syscalls []*Syscall `json:"syscalls"` - DefaultErrnoRet *uint `json:"default_errno_ret"` - ListenerPath string `json:"listener_path,omitempty"` - ListenerMetadata string `json:"listener_metadata,omitempty"` + DefaultAction Action `json:"default_action"` + Architectures []string `json:"architectures"` + Flags []specs.LinuxSeccompFlag `json:"flags"` + Syscalls []*Syscall `json:"syscalls"` + DefaultErrnoRet *uint `json:"default_errno_ret"` + ListenerPath string `json:"listener_path,omitempty"` + ListenerMetadata string `json:"listener_metadata,omitempty"` } // Action is taken upon rule match in Seccomp diff --git a/libcontainer/seccomp/seccomp_linux.go b/libcontainer/seccomp/seccomp_linux.go index 8c12af72be9..ffbe537e7e3 100644 --- a/libcontainer/seccomp/seccomp_linux.go +++ b/libcontainer/seccomp/seccomp_linux.go @@ -13,6 +13,7 @@ import ( "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/seccomp/patchbpf" + "github.com/opencontainers/runtime-spec/specs-go" ) var ( @@ -86,6 +87,29 @@ func InitSeccomp(config *configs.Seccomp) (int, error) { } } + // Add extra flags + for _, flag := range config.Flags { + switch flag { + case "SECCOMP_FILTER_FLAG_TSYNC": + // libseccomp-golang always use filterAttrTsync when + // possible so all goroutines will receive the same + // rules, so there is nothing to do. It does not make + // sense to apply the seccomp filter on only one + // thread; other threads will be terminated after exec + // anyway. + case specs.LinuxSeccompFlagLog: + if err := filter.SetLogBit(true); err != nil { + return -1, fmt.Errorf("error adding log flag to seccomp filter: %w", err) + } + case specs.LinuxSeccompFlagSpecAllow: + if err := filter.SetSSB(true); err != nil { + return -1, fmt.Errorf("error adding SSB flag to seccomp filter: %w", err) + } + default: + return -1, fmt.Errorf("seccomp flags %q not yet supported by runc", flag) + } + } + // Unset no new privs bit if err := filter.SetNoNewPrivsBit(false); err != nil { return -1, fmt.Errorf("error setting no new privileges: %w", err) diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go index 7ff9f098d6a..55ccabeef08 100644 --- a/libcontainer/specconv/spec_linux.go +++ b/libcontainer/specconv/spec_linux.go @@ -1016,14 +1016,22 @@ func SetupSeccomp(config *specs.LinuxSeccomp) (*configs.Seccomp, error) { return nil, nil } - // We don't currently support seccomp flags. - if len(config.Flags) != 0 { - return nil, errors.New("seccomp flags are not yet supported by runc") - } - newConfig := new(configs.Seccomp) newConfig.Syscalls = []*configs.Syscall{} + // The list of flags defined in runtime-spec is a subset of the flags + // in the seccomp() syscall + for _, flag := range config.Flags { + switch flag { + case "SECCOMP_FILTER_FLAG_TSYNC": + // Tsync can be silently ignored + case specs.LinuxSeccompFlagLog, specs.LinuxSeccompFlagSpecAllow: + newConfig.Flags = append(newConfig.Flags, flag) + default: + return nil, fmt.Errorf("seccomp flag %q not yet supported by runc", flag) + } + } + if len(config.Architectures) > 0 { newConfig.Architectures = []string{} for _, arch := range config.Architectures { diff --git a/tests/integration/seccomp.bats b/tests/integration/seccomp.bats index c24eeb26a0f..55e6dc817ab 100644 --- a/tests/integration/seccomp.bats +++ b/tests/integration/seccomp.bats @@ -66,6 +66,38 @@ function teardown() { [[ "$output" == *"Network is down"* ]] } +@test "runc run [seccomp] (SECCOMP_FILTER_FLAG_*)" { + # Linux 4.14: SECCOMP_FILTER_FLAG_LOG + # Linux 4.17: SECCOMP_FILTER_FLAG_SPEC_ALLOW + requires_kernel 4.17 + SECCOMP_FILTER_FLAGS=( + '' # no flag + '"SECCOMP_FILTER_FLAG_LOG"' + '"SECCOMP_FILTER_FLAG_SPEC_ALLOW"' + '"SECCOMP_FILTER_FLAG_TSYNC"' + '"SECCOMP_FILTER_FLAG_LOG","SECCOMP_FILTER_FLAG_SPEC_ALLOW"' + '"SECCOMP_FILTER_FLAG_LOG","SECCOMP_FILTER_FLAG_TSYNC"' + '"SECCOMP_FILTER_FLAG_SPEC_ALLOW","SECCOMP_FILTER_FLAG_TSYNC"' + '"SECCOMP_FILTER_FLAG_LOG","SECCOMP_FILTER_FLAG_SPEC_ALLOW","SECCOMP_FILTER_FLAG_TSYNC"' + ) + for flags in "${SECCOMP_FILTER_FLAGS[@]}"; do + update_config ' .process.args = ["/bin/sh", "-c", "mkdir /dev/shm/foo"] + | .process.noNewPrivileges = false + | .linux.seccomp = { + "defaultAction":"SCMP_ACT_ALLOW", + "architectures":["SCMP_ARCH_X86","SCMP_ARCH_X32","SCMP_ARCH_X86_64","SCMP_ARCH_AARCH64","SCMP_ARCH_ARM"], + "flags":['"${flags}"'], + "syscalls":[{"names":["mkdir"], "action":"SCMP_ACT_ERRNO"}] + }' + + # This test checks that the flags are accepted without errors but does + # not check they are effectively applied + runc run test_busybox + [ "$status" -ne 0 ] + [[ "$output" == *"mkdir:"*"/dev/shm/foo"*"Operation not permitted"* ]] + done +} + @test "runc run [seccomp] (SCMP_ACT_KILL)" { update_config ' .process.args = ["/bin/sh", "-c", "mkdir /dev/shm/foo"] | .process.noNewPrivileges = false diff --git a/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go b/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go index 6a7a91e5596..cf1b338c81a 100644 --- a/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go +++ b/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go @@ -15,7 +15,7 @@ type Spec struct { // Mounts configures additional mounts (on top of Root). Mounts []Mount `json:"mounts,omitempty"` // Hooks configures callbacks for container lifecycle events. - Hooks *Hooks `json:"hooks,omitempty" platform:"linux,solaris"` + Hooks *Hooks `json:"hooks,omitempty" platform:"linux,solaris,zos"` // Annotations contains arbitrary metadata for the container. Annotations map[string]string `json:"annotations,omitempty"` @@ -27,6 +27,8 @@ type Spec struct { Windows *Windows `json:"windows,omitempty" platform:"windows"` // VM specifies configuration for virtual-machine-based containers. VM *VM `json:"vm,omitempty" platform:"vm"` + // ZOS is platform-specific configuration for z/OS based containers. + ZOS *ZOS `json:"zos,omitempty" platform:"zos"` } // Process contains information to start a specific application inside the container. @@ -49,7 +51,7 @@ type Process struct { // Capabilities are Linux capabilities that are kept for the process. Capabilities *LinuxCapabilities `json:"capabilities,omitempty" platform:"linux"` // Rlimits specifies rlimit options to apply to the process. - Rlimits []POSIXRlimit `json:"rlimits,omitempty" platform:"linux,solaris"` + Rlimits []POSIXRlimit `json:"rlimits,omitempty" platform:"linux,solaris,zos"` // NoNewPrivileges controls whether additional privileges could be gained by processes in the container. NoNewPrivileges bool `json:"noNewPrivileges,omitempty" platform:"linux"` // ApparmorProfile specifies the apparmor profile for the container. @@ -86,11 +88,11 @@ type Box struct { // User specifies specific user (and group) information for the container process. type User struct { // UID is the user id. - UID uint32 `json:"uid" platform:"linux,solaris"` + UID uint32 `json:"uid" platform:"linux,solaris,zos"` // GID is the group id. - GID uint32 `json:"gid" platform:"linux,solaris"` + GID uint32 `json:"gid" platform:"linux,solaris,zos"` // Umask is the umask for the init process. - Umask *uint32 `json:"umask,omitempty" platform:"linux,solaris"` + Umask *uint32 `json:"umask,omitempty" platform:"linux,solaris,zos"` // AdditionalGids are additional group ids set for the container's process. AdditionalGids []uint32 `json:"additionalGids,omitempty" platform:"linux,solaris"` // Username is the user name. @@ -110,11 +112,16 @@ type Mount struct { // Destination is the absolute path where the mount will be placed in the container. Destination string `json:"destination"` // Type specifies the mount kind. - Type string `json:"type,omitempty" platform:"linux,solaris"` + Type string `json:"type,omitempty" platform:"linux,solaris,zos"` // Source specifies the source path of the mount. Source string `json:"source,omitempty"` // Options are fstab style mount options. Options []string `json:"options,omitempty"` + + // UID/GID mappings used for changing file owners w/o calling chown, fs should support it. + // Every mount point could have its own mapping. + UIDMappings []LinuxIDMapping `json:"uidMappings,omitempty" platform:"linux"` + GIDMappings []LinuxIDMapping `json:"gidMappings,omitempty" platform:"linux"` } // Hook specifies a command that is run at a particular event in the lifecycle of a container @@ -178,7 +185,7 @@ type Linux struct { // MountLabel specifies the selinux context for the mounts in the container. MountLabel string `json:"mountLabel,omitempty"` // IntelRdt contains Intel Resource Director Technology (RDT) information for - // handling resource constraints (e.g., L3 cache, memory bandwidth) for the container + // handling resource constraints and monitoring metrics (e.g., L3 cache, memory bandwidth) for the container IntelRdt *LinuxIntelRdt `json:"intelRdt,omitempty"` // Personality contains configuration for the Linux personality syscall Personality *LinuxPersonality `json:"personality,omitempty"` @@ -250,8 +257,8 @@ type LinuxInterfacePriority struct { Priority uint32 `json:"priority"` } -// linuxBlockIODevice holds major:minor format supported in blkio cgroup -type linuxBlockIODevice struct { +// LinuxBlockIODevice holds major:minor format supported in blkio cgroup +type LinuxBlockIODevice struct { // Major is the device's major number. Major int64 `json:"major"` // Minor is the device's minor number. @@ -260,7 +267,7 @@ type linuxBlockIODevice struct { // LinuxWeightDevice struct holds a `major:minor weight` pair for weightDevice type LinuxWeightDevice struct { - linuxBlockIODevice + LinuxBlockIODevice // Weight is the bandwidth rate for the device. Weight *uint16 `json:"weight,omitempty"` // LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, CFQ scheduler only @@ -269,7 +276,7 @@ type LinuxWeightDevice struct { // LinuxThrottleDevice struct holds a `major:minor rate_per_second` pair type LinuxThrottleDevice struct { - linuxBlockIODevice + LinuxBlockIODevice // Rate is the IO rate limit per cgroup per device Rate uint64 `json:"rate"` } @@ -328,6 +335,8 @@ type LinuxCPU struct { Cpus string `json:"cpus,omitempty"` // List of memory nodes in the cpuset. Default is to use any available memory node. Mems string `json:"mems,omitempty"` + // cgroups are configured with minimum weight, 0: default behavior, 1: SCHED_IDLE. + Idle *int64 `json:"idle,omitempty"` } // LinuxPids for Linux cgroup 'pids' resource management (Linux 4.3) @@ -522,11 +531,21 @@ type WindowsMemoryResources struct { // WindowsCPUResources contains CPU resource management settings. type WindowsCPUResources struct { - // Number of CPUs available to the container. + // Count is the number of CPUs available to the container. It represents the + // fraction of the configured processor `count` in a container in relation + // to the processors available in the host. The fraction ultimately + // determines the portion of processor cycles that the threads in a + // container can use during each scheduling interval, as the number of + // cycles per 10,000 cycles. Count *uint64 `json:"count,omitempty"` - // CPU shares (relative weight to other containers with cpu shares). + // Shares limits the share of processor time given to the container relative + // to other workloads on the processor. The processor `shares` (`weight` at + // the platform level) is a value between 0 and 10000. Shares *uint16 `json:"shares,omitempty"` - // Specifies the portion of processor cycles that this container can use as a percentage times 100. + // Maximum determines the portion of processor cycles that the threads in a + // container can use during each scheduling interval, as the number of + // cycles per 10,000 cycles. Set processor `maximum` to a percentage times + // 100. Maximum *uint16 `json:"maximum,omitempty"` } @@ -613,6 +632,19 @@ type Arch string // LinuxSeccompFlag is a flag to pass to seccomp(2). type LinuxSeccompFlag string +const ( + // LinuxSeccompFlagLog is a seccomp flag to request all returned + // actions except SECCOMP_RET_ALLOW to be logged. An administrator may + // override this filter flag by preventing specific actions from being + // logged via the /proc/sys/kernel/seccomp/actions_logged file. (since + // Linux 4.14) + LinuxSeccompFlagLog LinuxSeccompFlag = "SECCOMP_FILTER_FLAG_LOG" + + // LinuxSeccompFlagSpecAllow can be used to disable Speculative Store + // Bypass mitigation. (since Linux 4.17) + LinuxSeccompFlagSpecAllow LinuxSeccompFlag = "SECCOMP_FILTER_FLAG_SPEC_ALLOW" +) + // Additional architectures permitted to be used for system calls // By default only the native architecture of the kernel is permitted const ( @@ -683,8 +715,9 @@ type LinuxSyscall struct { Args []LinuxSeccompArg `json:"args,omitempty"` } -// LinuxIntelRdt has container runtime resource constraints for Intel RDT -// CAT and MBA features which introduced in Linux 4.10 and 4.12 kernel +// LinuxIntelRdt has container runtime resource constraints for Intel RDT CAT and MBA +// features and flags enabling Intel RDT CMT and MBM features. +// Intel RDT features are available in Linux 4.14 and newer kernel versions. type LinuxIntelRdt struct { // The identity for RDT Class of Service ClosID string `json:"closID,omitempty"` @@ -697,4 +730,36 @@ type LinuxIntelRdt struct { // The unit of memory bandwidth is specified in "percentages" by // default, and in "MBps" if MBA Software Controller is enabled. MemBwSchema string `json:"memBwSchema,omitempty"` + + // EnableCMT is the flag to indicate if the Intel RDT CMT is enabled. CMT (Cache Monitoring Technology) supports monitoring of + // the last-level cache (LLC) occupancy for the container. + EnableCMT bool `json:"enableCMT,omitempty"` + + // EnableMBM is the flag to indicate if the Intel RDT MBM is enabled. MBM (Memory Bandwidth Monitoring) supports monitoring of + // total and local memory bandwidth for the container. + EnableMBM bool `json:"enableMBM,omitempty"` +} + +// ZOS contains platform-specific configuration for z/OS based containers. +type ZOS struct { + // Devices are a list of device nodes that are created for the container + Devices []ZOSDevice `json:"devices,omitempty"` +} + +// ZOSDevice represents the mknod information for a z/OS special device file +type ZOSDevice struct { + // Path to the device. + Path string `json:"path"` + // Device type, block, char, etc. + Type string `json:"type"` + // Major is the device's major number. + Major int64 `json:"major"` + // Minor is the device's minor number. + Minor int64 `json:"minor"` + // FileMode permission bits for the device. + FileMode *os.FileMode `json:"fileMode,omitempty"` + // UID of the device. + UID *uint32 `json:"uid,omitempty"` + // Gid of the device. + GID *uint32 `json:"gid,omitempty"` } diff --git a/vendor/modules.txt b/vendor/modules.txt index 60fb5857e2e..28c13ef36a7 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -36,7 +36,7 @@ github.com/moby/sys/mountinfo # github.com/mrunalp/fileutils v0.5.0 ## explicit; go 1.13 github.com/mrunalp/fileutils -# github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417 +# github.com/opencontainers/runtime-spec v1.0.3-0.20220718201635-a8106e99982b ## explicit github.com/opencontainers/runtime-spec/specs-go # github.com/opencontainers/selinux v1.10.1