From afc23e33971b657c4a09c54b16c6139651171aad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Clerget?= Date: Fri, 30 Jun 2023 15:49:47 +0200 Subject: [PATCH] Set temporary single CPU affinity before cgroup cpuset transition. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This handles a corner case when joining a container having all the processes running exclusively on isolated CPU cores to force the kernel to schedule runc process on the first CPU core within the cgroups cpuset. The introduction of the kernel commit 46a87b3851f0d6eb05e6d83d5c5a30df0eca8f76 has affected this deterministic scheduling behavior by distributing tasks across CPU cores within the cgroups cpuset. Some intensive real-time application are relying on this deterministic behavior and use the first CPU core to run a slow thread while other CPU cores are fully used by real-time threads with SCHED_FIFO policy. Such applications prevents runc process from joining a container when the runc process is randomly scheduled on a CPU core owned by a real-time thread. Introduces isolated CPU affinity transition OCI runtime annotation org.opencontainers.runc.exec.isolated-cpu-affinity-transition to restore the behavior during runc exec. Fix issue with kernel >= 6.2 not resetting CPU affinity for container processes. Signed-off-by: Cédric Clerget --- docs/isolated-cpu-affinity-transition.md | 125 ++++++++ features.go | 1 + libcontainer/cgroups/cgroups.go | 4 + libcontainer/cgroups/fs/fs.go | 27 ++ libcontainer/cgroups/fs2/fs2.go | 28 ++ libcontainer/cgroups/systemd/v1.go | 4 + libcontainer/cgroups/systemd/v2.go | 4 + libcontainer/container_linux_test.go | 4 + libcontainer/process_linux.go | 269 +++++++++++++++++- libcontainer/process_linux_test.go | 232 +++++++++++++++ libcontainer/system/kernelparam/lookup.go | 41 +++ .../system/kernelparam/lookup_test.go | 60 ++++ tests/integration/exec.bats | 136 +++++++++ tests/integration/helpers.bash | 21 ++ 14 files changed, 954 insertions(+), 2 deletions(-) create mode 100644 docs/isolated-cpu-affinity-transition.md create mode 100644 libcontainer/process_linux_test.go create mode 100644 libcontainer/system/kernelparam/lookup.go create mode 100644 libcontainer/system/kernelparam/lookup_test.go diff --git a/docs/isolated-cpu-affinity-transition.md b/docs/isolated-cpu-affinity-transition.md new file mode 100644 index 00000000000..d2f3b12e899 --- /dev/null +++ b/docs/isolated-cpu-affinity-transition.md @@ -0,0 +1,125 @@ +## Isolated CPU affinity transition + +The introduction of the kernel commit 46a87b3851f0d6eb05e6d83d5c5a30df0eca8f76 +in 5.7 has affected a deterministic scheduling behavior by distributing tasks +across CPU cores within a cgroups cpuset. It means that `runc exec` might be +impacted under some circumstances, by example when a container has been +created within a cgroup cpuset entirely composed of isolated CPU cores +usually sets either with `nohz_full` and/or `isolcpus` kernel boot parameters. + +Some containerized real-time applications are relying on this deterministic +behavior and uses the first CPU core to run a slow thread while other CPU +cores are fully used by the real-time threads with SCHED_FIFO policy. +Such applications can prevent runc process from joining a container when the +runc process is randomly scheduled on a CPU core owned by a real-time thread. + +Runc introduces a way to restore this behavior by adding the following +annotation to the container runtime spec (`config.json`): + +`org.opencontainers.runc.exec.isolated-cpu-affinity-transition` + +This annotation can take one of those values: + +* `temporary` to temporarily set the runc process CPU affinity to the first +isolated CPU core of the container cgroup cpuset. +* `definitive`: to definitively set the runc process CPU affinity to the first +isolated CPU core of the container cgroup cpuset. + +For example: + +```json + "annotations": { + "org.opencontainers.runc.exec.isolated-cpu-affinity-transition": "temporary" + } +``` + +__WARNING:__ `definitive` requires a kernel >= 6.2, also works with RHEL 9 and +above. + +### How it works? + +When enabled and during `runc exec`, runc is looking for the `nohz_full` kernel +boot parameter value and considers the CPUs in the list as isolated, it doesn't +look for `isolcpus` boot parameter, it just assumes that `isolcpus` value is +identical to `nohz_full` when specified. If `nohz_full` parameter is not found, +runc also attempts to read the list from `/sys/devices/system/cpu/nohz_full`. + +Once it gets the isolated CPU list, it returns an eligible CPU core within the +container cgroup cpuset based on those heuristics: + +* when there is not cpuset cores: no eligible CPU +* when there is not isolated cores: no eligible CPU +* when cpuset cores are not in isolated core list: no eligible CPU +* when cpuset cores are all isolated cores: return the first CPU of the cpuset +* when cpuset cores are mixed between housekeeping/isolated cores: return the + first housekeeping CPU not in isolated CPUs. + +The returned CPU core is then used to set the `runc init` CPU affinity before +the container cgroup cpuset transition. + +#### Transition example + +`nohz_full` has the isolated cores `4-7`. A container has been created with +the cgroup cpuset `4-7` to only run on the isolated CPU cores 4 to 7. +`runc exec` is called by a process with CPU affinity set to `0-3` + +* with `temporary` transition: + + runc exec (affinity 0-3) -> runc init (affinity 4) -> container process (affinity 4-7) + +* with `definitive` transition: + + runc exec (affinity 0-3) -> runc init (affinity 4) -> container process (affinity 4) + +The difference between `temporary` and `definitive` is the container process +affinity, `definitive` will constraint the container process to run on the +first isolated CPU core of the cgroup cpuset, while `temporary` restore the +CPU affinity to match the container cgroup cpuset. + +`definitive` transition might be helpful when `nohz_full` is used without +`isolcpus` to avoid runc and container process to be a noisy neighbour for +real-time applications. + +### How to use it with Kubernetes? + +Kubernetes doesn't manage container directly, instead it uses the Container Runtime +Interface (CRI) to communicate with a software implementing this interface and responsible +to manage the lifecycle of containers. There are popular CRI implementations like Containerd +and CRI-O. Those implementations allows to pass pod annotations to the container runtime +via the container runtime spec. Currently runc is the runtime used by default for both. + +#### Containerd configuration + +Containerd CRI uses runc by default but requires an extra step to pass the annotation to runc. +You have to whitelist `org.opencontainers.runc.exec.isolated-cpu-affinity-transition` as a pod +annotation allowed to be passed to the container runtime in `/etc/containerd/config.toml`: + +```toml +[plugins."io.containerd.grpc.v1.cri".containerd] + default_runtime_name = "runc" + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes] + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc] + runtime_type = "io.containerd.runc.v2" + base_runtime_spec = "/etc/containerd/cri-base.json" + pod_annotations = ["org.opencontainers.runc.exec.isolated-cpu-affinity-transition"] +``` + +#### CRI-O configuration + +CRI-O doesn't require any extra step, however some annotations could be excluded by +configuration. + +#### Pod deployment example + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: demo-pod + annotations: + org.opencontainers.runc.exec.isolated-cpu-affinity-transition: "temporary" +spec: + containers: + - name: demo + image: registry.com/demo:latest +``` diff --git a/features.go b/features.go index eff04c1b2d2..190b7d10422 100644 --- a/features.go +++ b/features.go @@ -68,6 +68,7 @@ var featuresCommand = cli.Command{ "bundle", "org.systemd.property.", // prefix form "org.criu.config", + "org.opencontainers.runc.exec.isolated-cpu-affinity-transition", }, } diff --git a/libcontainer/cgroups/cgroups.go b/libcontainer/cgroups/cgroups.go index b9ba889b7a0..d97c874eab6 100644 --- a/libcontainer/cgroups/cgroups.go +++ b/libcontainer/cgroups/cgroups.go @@ -71,4 +71,8 @@ type Manager interface { // OOMKillCount reports OOM kill count for the cgroup. OOMKillCount() (uint64, error) + + // GetEffectiveCPUs returns the effective CPUs of the cgroup, an empty + // value means that the cgroups cpuset subsystem/controller is not enabled. + GetEffectiveCPUs() string } diff --git a/libcontainer/cgroups/fs/fs.go b/libcontainer/cgroups/fs/fs.go index d2decb127ca..723d18b7637 100644 --- a/libcontainer/cgroups/fs/fs.go +++ b/libcontainer/cgroups/fs/fs.go @@ -4,6 +4,8 @@ import ( "errors" "fmt" "os" + "path/filepath" + "strings" "sync" "golang.org/x/sys/unix" @@ -263,3 +265,28 @@ func (m *Manager) OOMKillCount() (uint64, error) { return c, err } + +func (m *Manager) GetEffectiveCPUs() string { + return GetEffectiveCPUs(m.Path("cpuset"), m.cgroups) +} + +func GetEffectiveCPUs(cpusetPath string, cgroups *configs.Cgroup) string { + // Fast path. + if cgroups.CpusetCpus != "" { + return cgroups.CpusetCpus + } else if !strings.HasPrefix(cpusetPath, defaultCgroupRoot) { + return "" + } + + // Iterates until it goes to the cgroup root path. + // It's required for containers in which cpuset controller + // is not enabled, in this case a parent cgroup is used. + for path := cpusetPath; path != defaultCgroupRoot; path = filepath.Dir(path) { + cpus, err := fscommon.GetCgroupParamString(path, "cpuset.effective_cpus") + if err == nil { + return cpus + } + } + + return "" +} diff --git a/libcontainer/cgroups/fs2/fs2.go b/libcontainer/cgroups/fs2/fs2.go index 0760be74b97..0a579b23e84 100644 --- a/libcontainer/cgroups/fs2/fs2.go +++ b/libcontainer/cgroups/fs2/fs2.go @@ -4,11 +4,13 @@ import ( "errors" "fmt" "os" + "path/filepath" "strings" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/utils" ) type parseError = fscommon.ParseError @@ -32,6 +34,9 @@ func NewManager(config *configs.Cgroup, dirPath string) (*Manager, error) { if err != nil { return nil, err } + } else { + // Clean path for safety. + dirPath = utils.CleanPath(dirPath) } m := &Manager{ @@ -316,3 +321,26 @@ func CheckMemoryUsage(dirPath string, r *configs.Resources) error { return nil } + +func (m *Manager) GetEffectiveCPUs() string { + // Fast path. + if m.config.CpusetCpus != "" { + return m.config.CpusetCpus + } else if !strings.HasPrefix(m.dirPath, UnifiedMountpoint) { + return "" + } + + // Iterates until it goes outside of the cgroup root path. + // It's required for containers in which cpuset controller + // is not enabled, in this case a parent cgroup is used. + outsidePath := filepath.Dir(UnifiedMountpoint) + + for path := m.dirPath; path != outsidePath; path = filepath.Dir(path) { + cpus, err := fscommon.GetCgroupParamString(path, "cpuset.cpus.effective") + if err == nil { + return cpus + } + } + + return "" +} diff --git a/libcontainer/cgroups/systemd/v1.go b/libcontainer/cgroups/systemd/v1.go index 8c64a5887a9..e04e35682cd 100644 --- a/libcontainer/cgroups/systemd/v1.go +++ b/libcontainer/cgroups/systemd/v1.go @@ -411,3 +411,7 @@ func (m *LegacyManager) Exists() bool { func (m *LegacyManager) OOMKillCount() (uint64, error) { return fs.OOMKillCount(m.Path("memory")) } + +func (m *LegacyManager) GetEffectiveCPUs() string { + return fs.GetEffectiveCPUs(m.Path("cpuset"), m.cgroups) +} diff --git a/libcontainer/cgroups/systemd/v2.go b/libcontainer/cgroups/systemd/v2.go index b28ec6b22f2..2c4a8c4c3ac 100644 --- a/libcontainer/cgroups/systemd/v2.go +++ b/libcontainer/cgroups/systemd/v2.go @@ -514,3 +514,7 @@ func (m *UnifiedManager) Exists() bool { func (m *UnifiedManager) OOMKillCount() (uint64, error) { return m.fsMgr.OOMKillCount() } + +func (m *UnifiedManager) GetEffectiveCPUs() string { + return m.fsMgr.GetEffectiveCPUs() +} diff --git a/libcontainer/container_linux_test.go b/libcontainer/container_linux_test.go index 99908376562..c76178d808a 100644 --- a/libcontainer/container_linux_test.go +++ b/libcontainer/container_linux_test.go @@ -69,6 +69,10 @@ func (m *mockCgroupManager) GetFreezerState() (configs.FreezerState, error) { return configs.Thawed, nil } +func (m *mockCgroupManager) GetEffectiveCPUs() string { + return "" +} + type mockProcess struct { _pid int started uint64 diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 6d51eada2b3..93c0e313e00 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -1,11 +1,13 @@ package libcontainer import ( + "bytes" "context" "encoding/json" "errors" "fmt" "io" + "io/fs" "net" "os" "os/exec" @@ -21,10 +23,12 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fs2" + "github.com/opencontainers/runc/libcontainer/cgroups/systemd" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/intelrdt" "github.com/opencontainers/runc/libcontainer/logs" "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/system/kernelparam" "github.com/opencontainers/runc/libcontainer/userns" "github.com/opencontainers/runc/libcontainer/utils" ) @@ -133,8 +137,58 @@ func (p *setnsProcess) start() (retErr error) { // get the "before" value of oom kill count oom, _ := p.manager.OOMKillCount() - err := p.cmd.Start() - // close the child-side of the pipes (controlled by child) + + // When greater or equal to zero, it will set a temporary single CPU + // affinity before cgroup cpuset transition, this handles a corner + // case when joining a container having all the processes running + // exclusively on isolated CPU cores to force the kernel to schedule + // runc process on the first CPU core within the cgroups cpuset. + // The introduction of the kernel commit 46a87b3851f0d6eb05e6d83d5c5a30df0eca8f76 + // in 5.7 has affected this deterministic scheduling behavior by + // distributing tasks across CPU cores within the cgroups cpuset. + // Some intensive real-time application are relying on this + // deterministic behavior and use the first CPU core to run a slow + // thread while other CPU cores are fully used by real-time threads + // with SCHED_FIFO policy. Such applications prevent runc process + // from joining a container when the runc process is randomly + // scheduled on a CPU core owned by a real-time thread. + cpuAffinity := -1 + resetCPUAffinity := true + + if len(p.manager.GetPaths()) > 0 { + // Get the target container cgroup. + if cg, err := p.manager.GetCgroups(); err != nil { + // Close the pipe to not be blocked in the parent. + p.comm.closeChild() + return fmt.Errorf("getting container cgroups: %w", err) + } else if cg.CpusetCpus != "" { + definitive := false + + _, annotations := utils.Annotations(p.config.Config.Labels) + cpuAffinity, definitive, err = isolatedCPUAffinityTransition( + os.DirFS("/"), + cg.CpusetCpus, + annotations, + ) + if err != nil { + // Close the pipe to not be blocked in the parent. + p.comm.closeChild() + return fmt.Errorf("getting CPU affinity: %w", err) + } else if definitive { + resetCPUAffinity = false + } + } + } + + var err error + + if cpuAffinity < 0 { + err = p.cmd.Start() + } else { + err = startCommandWithCPUAffinity(p.cmd, cpuAffinity) + } + + // Close the write-side of the pipes (controlled by child). p.comm.closeChild() if err != nil { return fmt.Errorf("error starting setns process: %w", err) @@ -193,6 +247,18 @@ func (p *setnsProcess) start() (retErr error) { } } } + + if resetCPUAffinity { + // Fix the container process CPU affinity to match container cgroup cpuset, + // since kernel 6.2, the runc CPU affinity might affect the container process + // CPU affinity after cgroup cpuset transition, by example if runc is running + // with CPU affinity 0-1 and container process has cpuset.cpus set to 1-2, the + // resulting container process CPU affinity will be 1 instead of 1-2. + if err := fixProcessCPUAffinity(p.pid(), p.manager); err != nil { + return fmt.Errorf("error resetting container process CPU affinity: %w", err) + } + } + if p.intelRdtPath != "" { // if Intel RDT "resource control" filesystem path exists _, err := os.Stat(p.intelRdtPath) @@ -744,6 +810,14 @@ func (p *initProcess) start() (retErr error) { if err := p.manager.Set(p.config.Config.Cgroups.Resources); err != nil { return fmt.Errorf("error setting cgroup config for procHooks process: %w", err) } + // Reset container process CPU affinity to match container cgroup cpuset, + // since kernel 6.2, the runc CPU affinity might affect the container process + // CPU affinity after cgroup cpuset transition, by example if runc is running + // with CPU affinity 0-1 and container process has cpuset.cpus set to 1-2, the + // resulting container process CPU affinity will be 1 instead of 1-2. + if err := fixProcessCPUAffinity(p.pid(), p.manager); err != nil { + return fmt.Errorf("error resetting container process CPU affinity: %w", err) + } if p.intelRdtManager != nil { if err := p.intelRdtManager.Set(p.config.Config); err != nil { return fmt.Errorf("error setting Intel RDT config for procHooks process: %w", err) @@ -995,5 +1069,196 @@ func setIOPriority(ioprio *configs.IOPriority) error { if errno != 0 { return fmt.Errorf("failed to set io priority: %w", errno) } + + return nil +} + +// isolatedCPUAffinityTransition returns a CPU affinity if necessary based on heuristics +// and org.opencontainers.runc.exec.isolated-cpu-affinity-transition annotation value. +func isolatedCPUAffinityTransition(rootFS fs.FS, cpusetList string, annotations map[string]string) (int, bool, error) { + const ( + isolatedCPUAffinityTransitionAnnotation = "org.opencontainers.runc.exec.isolated-cpu-affinity-transition" + nohzFullParam = "nohz_full" + ) + + definitive := false + + transition := annotations[isolatedCPUAffinityTransitionAnnotation] + switch transition { + case "temporary": + case "definitive": + definitive = true + default: + if transition != "" { + return -1, false, fmt.Errorf( + "unknown transition value %q for annotation %s", + transition, isolatedCPUAffinityTransitionAnnotation, + ) + } + return -1, false, nil + } + + kernelParams, err := kernelparam.LookupKernelBootParameters( + rootFS, + nohzFullParam, + ) + if err != nil { + // If /proc/cmdline does not exist or isn't readable, continue to read + // nohz_full from sysfs below. + if !errors.Is(err, os.ErrNotExist) && !errors.Is(err, os.ErrPermission) { + return -1, false, err + } + } + + // First get nohz_full value from kernel boot params, if not + // present, get the value from sysfs, to cover the case where + // CONFIG_NO_HZ_FULL_ALL is set, it also makes the integration + // tests not dependent on /sys/devices/system/cpu/nohz_full. + isolatedList := kernelParams[nohzFullParam] + if isolatedList == "" { + // Get the isolated CPU list, the error is not checked here because + // no matter what the error is, it returns without error the same way + // as with empty data. + isolatedData, _ := fs.ReadFile(rootFS, "sys/devices/system/cpu/nohz_full") + isolatedList = string(bytes.TrimSpace(isolatedData)) + if isolatedList == "" || isolatedList == "(null)" { + return -1, false, nil + } + } + + cpu, err := getEligibleCPU(cpusetList, isolatedList) + if err != nil { + return -1, false, fmt.Errorf("getting eligible cpu: %w", err) + } else if cpu == -1 { + definitive = false + } + + return cpu, definitive, nil +} + +// getEligibleCPU returns the first eligible CPU for CPU affinity before +// entering in a cgroup cpuset: +// - when there is not cpuset cores: no eligible CPU (-1) +// - when there is not isolated cores: no eligible CPU (-1) +// - when cpuset cores are not in isolated cores: no eligible CPU (-1) +// - when cpuset cores are all isolated cores: return the first CPU of the cpuset +// - when cpuset cores are mixed between housekeeping/isolated cores: return the +// first housekeeping CPU not in isolated CPUs. +func getEligibleCPU(cpusetList, isolatedList string) (int, error) { + if isolatedList == "" || cpusetList == "" { + return -1, nil + } + + // The target container has a cgroup cpuset, get the bit range. + cpusetBits, err := systemd.RangeToBits(cpusetList) + if err != nil { + return -1, fmt.Errorf("parsing cpuset cpus list %s: %w", cpusetList, err) + } + + isolatedBits, err := systemd.RangeToBits(isolatedList) + if err != nil { + return -1, fmt.Errorf("parsing isolated cpus list %s: %w", isolatedList, err) + } + + eligibleCore := -1 + isolatedCores := 0 + + // Start from cpu core #0. + currentCore := 0 + // Handle mixed sets. + mixed := false + + // CPU core start from the first slice element and bits are read + // from the least to the most significant bit. + for byteRange := 0; byteRange < len(cpusetBits); byteRange++ { + if byteRange >= len(isolatedBits) { + // No more isolated cores. + break + } + for bit := 0; bit < 8; bit++ { + if cpusetBits[byteRange]&(1< 0 { + return eligibleCore, nil + } + } + currentCore++ + } + } + + // We have an eligible CPU if there is at least one isolated CPU in the cpuset. + if isolatedCores == 0 { + return -1, nil + } + + return eligibleCore, nil +} + +// startCommandWithCPUAffinity starts a command on a specific CPU if set. +func startCommandWithCPUAffinity(cmd *exec.Cmd, cpuAffinity int) error { + errCh := make(chan error) + defer close(errCh) + + // Use a goroutine to dedicate an OS thread. + go func() { + cpuSet := new(unix.CPUSet) + cpuSet.Zero() + cpuSet.Set(cpuAffinity) + + // Don't call runtime.UnlockOSThread to terminate the OS thread + // when goroutine exits. + runtime.LockOSThread() + + // Command inherits the CPU affinity. + if err := unix.SchedSetaffinity(unix.Gettid(), cpuSet); err != nil { + errCh <- fmt.Errorf("setting os thread CPU affinity: %w", err) + return + } + + errCh <- cmd.Start() + }() + + return <-errCh +} + +// fixProcessCPUAffinity sets the CPU affinity of a container process +// to all CPUs allowed by container cgroup cpuset. +func fixProcessCPUAffinity(pid int, manager cgroups.Manager) error { + cpusetList := manager.GetEffectiveCPUs() + if cpusetList == "" { + // If the cgroup cpuset is not present, the container will inherit + // this process CPU affinity, so it can return without further actions. + return nil + } + + cpusetBits, err := systemd.RangeToBits(cpusetList) + if err != nil { + return fmt.Errorf("parsing cpuset cpus list %s: %w", cpusetList, err) + } + + processCPUSet := new(unix.CPUSet) + + for byteRange := 0; byteRange < len(cpusetBits); byteRange++ { + for bit := 0; bit < 8; bit++ { + processCPUSet.Set(byteRange*8 + bit) + } + } + + if err := unix.SchedSetaffinity(pid, processCPUSet); err != nil { + return fmt.Errorf("setting process PID %d CPU affinity: %w", pid, err) + } + return nil } diff --git a/libcontainer/process_linux_test.go b/libcontainer/process_linux_test.go new file mode 100644 index 00000000000..8303643967b --- /dev/null +++ b/libcontainer/process_linux_test.go @@ -0,0 +1,232 @@ +package libcontainer + +import ( + "io/fs" + "testing" + "testing/fstest" +) + +func TestIsolatedCPUAffinityTransition(t *testing.T) { + const isolatedCPUAffinityTransitionAnnotation = "org.opencontainers.runc.exec.isolated-cpu-affinity-transition" + + noAffinity := -1 + temporaryTransition := "temporary" + definitiveTransition := "definitive" + + tests := []struct { + name string + testFS fs.FS + cpuset string + expectedErr bool + expectedAffinityCore int + expectedDefinitiveTransition bool + annotations map[string]string + }{ + { + name: "no affinity", + cpuset: "0-15", + testFS: fstest.MapFS{ + "sys/devices/system/cpu/nohz_full": &fstest.MapFile{Data: []byte("0-4\n")}, + }, + expectedAffinityCore: noAffinity, + expectedDefinitiveTransition: false, + }, + { + name: "affinity match with temporary transition", + cpuset: "3-4", + testFS: fstest.MapFS{ + "sys/devices/system/cpu/nohz_full": &fstest.MapFile{Data: []byte("0-4\n")}, + }, + expectedAffinityCore: 3, + expectedDefinitiveTransition: false, + annotations: map[string]string{ + isolatedCPUAffinityTransitionAnnotation: temporaryTransition, + }, + }, + { + name: "affinity match with temporary transition and nohz_full boot param", + cpuset: "3-4", + testFS: fstest.MapFS{ + "proc/cmdline": &fstest.MapFile{Data: []byte("nohz_full=0-4\n")}, + }, + expectedAffinityCore: 3, + expectedDefinitiveTransition: false, + annotations: map[string]string{ + isolatedCPUAffinityTransitionAnnotation: temporaryTransition, + }, + }, + { + name: "affinity match with definitive transition", + cpuset: "3-4", + testFS: fstest.MapFS{ + "sys/devices/system/cpu/nohz_full": &fstest.MapFile{Data: []byte("0-4\n")}, + }, + expectedAffinityCore: 3, + expectedDefinitiveTransition: true, + annotations: map[string]string{ + isolatedCPUAffinityTransitionAnnotation: definitiveTransition, + }, + }, + { + name: "affinity match with definitive transition and nohz_full boot param", + cpuset: "3-4", + testFS: fstest.MapFS{ + "proc/cmdline": &fstest.MapFile{Data: []byte("nohz_full=0-4\n")}, + }, + expectedAffinityCore: 3, + expectedDefinitiveTransition: true, + annotations: map[string]string{ + isolatedCPUAffinityTransitionAnnotation: definitiveTransition, + }, + }, + { + name: "affinity error with bad isolated set", + cpuset: "0-15", + testFS: fstest.MapFS{ + "sys/devices/system/cpu/nohz_full": &fstest.MapFile{Data: []byte("bad_isolated_set\n")}, + }, + expectedErr: true, + expectedAffinityCore: noAffinity, + annotations: map[string]string{ + isolatedCPUAffinityTransitionAnnotation: temporaryTransition, + }, + }, + { + name: "affinity error with bad isolated set for nohz_full boot param", + cpuset: "0-15", + testFS: fstest.MapFS{ + "proc/cmdline": &fstest.MapFile{Data: []byte("nohz_full=bad_isolated_set\n")}, + }, + expectedErr: true, + expectedAffinityCore: noAffinity, + annotations: map[string]string{ + isolatedCPUAffinityTransitionAnnotation: temporaryTransition, + }, + }, + { + name: "no affinity with null isolated set value", + cpuset: "0-15", + testFS: fstest.MapFS{ + "sys/devices/system/cpu/nohz_full": &fstest.MapFile{Data: []byte("(null)\n")}, + }, + expectedAffinityCore: noAffinity, + expectedDefinitiveTransition: false, + annotations: map[string]string{ + isolatedCPUAffinityTransitionAnnotation: temporaryTransition, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + affinityCore, definitive, err := isolatedCPUAffinityTransition(tt.testFS, tt.cpuset, tt.annotations) + if err != nil && !tt.expectedErr { + t.Fatalf("unexpected error: %s", err) + } else if err == nil && tt.expectedErr { + t.Fatalf("unexpected success") + } else if tt.expectedDefinitiveTransition != definitive { + t.Fatalf("expected reset affinity %t: got %t instead", tt.expectedDefinitiveTransition, definitive) + } else if tt.expectedAffinityCore != affinityCore { + t.Fatalf("expected affinity core %d: got %d instead", tt.expectedAffinityCore, affinityCore) + } + }) + } +} + +func TestGetEligibleCPU(t *testing.T) { + tests := []struct { + name string + cpuset string + isolset string + expectedErr bool + expectedAffinityCore int + expectedEligible bool + }{ + { + name: "no cpuset", + isolset: "2-15,18-31,34-47", + expectedEligible: false, + }, + { + name: "no isolated set", + cpuset: "0-15", + expectedEligible: false, + }, + { + name: "bad cpuset format", + cpuset: "core0 to core15", + isolset: "2-15,18-31,34-47", + expectedErr: true, + }, + { + name: "bad isolated set format", + cpuset: "0-15", + isolset: "core0 to core15", + expectedErr: true, + }, + { + name: "no eligible core", + cpuset: "0-1,16-17,32-33", + isolset: "2-15,18-31,34-47", + expectedEligible: false, + }, + { + name: "no eligible core inverted", + cpuset: "2-15,18-31,34-47", + isolset: "0-1,16-17,32-33", + expectedEligible: false, + }, + { + name: "eligible core mixed", + cpuset: "8-31", + isolset: "2-15,18-31,34-47", + expectedEligible: true, + expectedAffinityCore: 16, + }, + { + name: "eligible core #4", + cpuset: "4-7", + isolset: "2-15,18-31,34-47", + expectedEligible: true, + expectedAffinityCore: 4, + }, + { + name: "eligible core #40", + cpuset: "40-47", + isolset: "2-15,18-31,34-47", + expectedEligible: true, + expectedAffinityCore: 40, + }, + { + name: "eligible core #24", + cpuset: "24-31", + isolset: "2-15,18-31,34-47", + expectedEligible: true, + expectedAffinityCore: 24, + }, + { + name: "no eligible core small isolated set", + cpuset: "60-63", + isolset: "0-1", + expectedEligible: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + affinityCore, err := getEligibleCPU(tt.cpuset, tt.isolset) + eligible := affinityCore >= 0 + if err != nil && !tt.expectedErr { + t.Fatalf("unexpected error: %s", err) + } else if err == nil && tt.expectedErr { + t.Fatalf("unexpected success") + } else if tt.expectedEligible && !eligible { + t.Fatalf("was expecting eligible core but no eligible core returned") + } else if !tt.expectedEligible && eligible { + t.Fatalf("was not expecting eligible core but got eligible core") + } else if tt.expectedEligible && tt.expectedAffinityCore != affinityCore { + t.Fatalf("expected affinity core %d: got %d instead", tt.expectedAffinityCore, affinityCore) + } + }) + } +} diff --git a/libcontainer/system/kernelparam/lookup.go b/libcontainer/system/kernelparam/lookup.go new file mode 100644 index 00000000000..4cf452412ff --- /dev/null +++ b/libcontainer/system/kernelparam/lookup.go @@ -0,0 +1,41 @@ +package kernelparam + +import ( + "io/fs" + "strings" +) + +func runeFilter(c rune) bool { + return c < '!' || c > '~' +} + +// LookupKernelBootParameters returns the selected kernel parameters specified +// in the kernel command line. The parameters are returned as a map of key-value pairs. +func LookupKernelBootParameters(rootFS fs.FS, lookupParameters ...string) (map[string]string, error) { + cmdline, err := fs.ReadFile(rootFS, "proc/cmdline") + if err != nil { + return nil, err + } + + kernelParameters := make(map[string]string) + remaining := len(lookupParameters) + + for _, parameter := range strings.FieldsFunc(string(cmdline), runeFilter) { + if remaining == 0 { + break + } + idx := strings.IndexByte(parameter, '=') + if idx == -1 { + continue + } + for _, lookupParam := range lookupParameters { + if lookupParam == parameter[:idx] { + kernelParameters[lookupParam] = parameter[idx+1:] + remaining-- + break + } + } + } + + return kernelParameters, nil +} diff --git a/libcontainer/system/kernelparam/lookup_test.go b/libcontainer/system/kernelparam/lookup_test.go new file mode 100644 index 00000000000..9d906301eb4 --- /dev/null +++ b/libcontainer/system/kernelparam/lookup_test.go @@ -0,0 +1,60 @@ +package kernelparam + +import ( + "testing" + "testing/fstest" +) + +func TestLookupKernelBootParameters(t *testing.T) { + for _, test := range []struct { + cmdline string + lookupParameters []string + expectedKernelParameters map[string]string + }{ + { + cmdline: "root=/dev/sda1 ro console=ttyS0 console=tty0", + lookupParameters: []string{"root"}, + expectedKernelParameters: map[string]string{ + "root": "/dev/sda1", + }, + }, + { + cmdline: "ro runc.kernel_parameter=a_value console=ttyS0 console=tty0", + lookupParameters: []string{"runc.kernel_parameter"}, + expectedKernelParameters: map[string]string{ + "runc.kernel_parameter": "a_value", + }, + }, + { + cmdline: "ro runc.kernel_parameter_a=value_a runc.kernel_parameter_b=value_a:value_b", + lookupParameters: []string{ + "runc.kernel_parameter_a", + "runc.kernel_parameter_b", + }, + expectedKernelParameters: map[string]string{ + "runc.kernel_parameter_a": "value_a", + "runc.kernel_parameter_b": "value_a:value_b", + }, + }, + { + cmdline: "root=/dev/sda1 ro console=ttyS0 console=tty0", + lookupParameters: []string{"runc.kernel_parameter_a"}, + expectedKernelParameters: map[string]string{}, + }, + } { + params, err := LookupKernelBootParameters(fstest.MapFS{ + "proc/cmdline": &fstest.MapFile{Data: []byte(test.cmdline + "\n")}, + }, test.lookupParameters...) + if err != nil { + t.Fatalf("unexpected error: %s", err) + } + if len(params) != len(test.expectedKernelParameters) { + t.Fatalf("expected %d parameters, got %d", len(test.expectedKernelParameters), len(params)) + } + for k, v := range test.expectedKernelParameters { + if params[k] != v { + t.Fatalf("expected parameter %s to be %s, got %s", k, v, params[k]) + } + } + } +} diff --git a/tests/integration/exec.bats b/tests/integration/exec.bats index a92a237809d..c0ea7f5dce6 100644 --- a/tests/integration/exec.bats +++ b/tests/integration/exec.bats @@ -340,3 +340,139 @@ EOF [ ${#lines[@]} -eq 1 ] [[ ${lines[0]} = *"exec /run.sh: no such file or directory"* ]] } + +@test "runc exec with isolated cpus affinity temporary transition [cgroup cpuset]" { + requires root cgroups_cpuset + + tmp=$(mktemp -d "$BATS_RUN_TMPDIR/runc.XXXXXX") + + set_cgroup_cpuset_all_cpus + local all_cpus + all_cpus="$(get_all_online_cpus)" + + # set temporary isolated CPU affinity transition + update_config '.annotations += {"org.opencontainers.runc.exec.isolated-cpu-affinity-transition": "temporary"}' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_isolated_temporary_transition + [ "$status" -eq 0 ] + + # set all online cpus as isolated + echo "nohz_full=$all_cpus" >"$tmp/cmdline" + + mount --bind "$tmp/cmdline" /proc/cmdline + + runc exec test_isolated_temporary_transition grep "Cpus_allowed_list:" /proc/self/status + + umount /proc/cmdline + + [ "$status" -eq 0 ] + [[ "${lines[0]}" == "Cpus_allowed_list: $all_cpus" ]] +} + +@test "runc exec with isolated cpus affinity definitive transition [cgroup cpuset]" { + requires root cgroups_cpuset + + tmp=$(mktemp -d "$BATS_RUN_TMPDIR/runc.XXXXXX") + + set_cgroup_cpuset_all_cpus + local all_cpus + all_cpus="$(get_all_online_cpus)" + + # set definitive isolated CPU affinity transition + update_config '.annotations += {"org.opencontainers.runc.exec.isolated-cpu-affinity-transition": "definitive"}' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_isolated_definitive_transition + [ "$status" -eq 0 ] + + # set all online cpus as isolated + echo "nohz_full=$all_cpus" >"$tmp/cmdline" + + mount --bind "$tmp/cmdline" /proc/cmdline + + runc exec test_isolated_definitive_transition grep "Cpus_allowed_list:" /proc/self/status + + umount /proc/cmdline + + [ "$status" -eq 0 ] + + load /etc/os-release + + # fix unbound variable in condition below + VERSION_ID=${VERSION_ID:-} + + allowed_cpus=$all_cpus + # use first cpu on systems with RHEL >= 9 or systems with kernel >= 6.2 + if [[ "${ID_LIKE:-}" =~ "rhel" && "${VERSION_ID%%.*}" -ge "9" ]] || is_kernel_gte 6.2; then + allowed_cpus="$(get_first_online_cpu)" + fi + + [[ "${lines[0]}" == "Cpus_allowed_list: $allowed_cpus" ]] +} + +@test "runc exec with isolated cpus affinity bad transition [cgroup cpuset]" { + requires root cgroups_cpuset + + tmp=$(mktemp -d "$BATS_RUN_TMPDIR/runc.XXXXXX") + + set_cgroup_cpuset_all_cpus + local all_cpus + all_cpus="$(get_all_online_cpus)" + + # set a bad isolated CPU affinity transition + update_config '.annotations += {"org.opencontainers.runc.exec.isolated-cpu-affinity-transition": "bad"}' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_isolated_bad_transition + [ "$status" -eq 0 ] + + # set all online cpus as isolated + echo "nohz_full=$all_cpus" >"$tmp/cmdline" + + mount --bind "$tmp/cmdline" /proc/cmdline + + runc exec test_isolated_bad_transition true + + umount /proc/cmdline + + [ "$status" -eq 255 ] +} + +@test "runc exec with taskset affinity [cgroup cpuset]" { + requires root cgroups_cpuset + + set_cgroup_cpuset_all_cpus + local all_cpus + all_cpus="$(get_all_online_cpus)" + + taskset -p -c "$(get_first_online_cpu)" $$ + + runc run -d --console-socket "$CONSOLE_SOCKET" test_with_taskset + [ "$status" -eq 0 ] + + runc exec test_with_taskset grep "Cpus_allowed_list:" /proc/1/status + [ "$status" -eq 0 ] + [[ "${lines[0]}" == "Cpus_allowed_list: $all_cpus" ]] + + runc exec test_with_taskset grep "Cpus_allowed_list:" /proc/self/status + [ "$status" -eq 0 ] + [[ "${lines[0]}" == "Cpus_allowed_list: $all_cpus" ]] +} + +@test "runc exec with taskset affinity [rootless cgroups_v2]" { + requires rootless cgroups_v2 + + local all_cpus + all_cpus="$(get_all_online_cpus)" + + taskset -p -c "$(get_first_online_cpu)" $$ + + runc run -d --console-socket "$CONSOLE_SOCKET" test_with_taskset + [ "$status" -eq 0 ] + + runc exec test_with_taskset grep "Cpus_allowed_list:" /proc/1/status + [ "$status" -eq 0 ] + [[ "${lines[0]}" == "Cpus_allowed_list: $all_cpus" ]] + + runc exec test_with_taskset grep "Cpus_allowed_list:" /proc/self/status + [ "$status" -eq 0 ] + [[ "${lines[0]}" == "Cpus_allowed_list: $all_cpus" ]] +} diff --git a/tests/integration/helpers.bash b/tests/integration/helpers.bash index e8a2894b915..6b836f15d41 100755 --- a/tests/integration/helpers.bash +++ b/tests/integration/helpers.bash @@ -354,6 +354,27 @@ function set_cgroup_mount_writable() { update_config '.mounts |= map((select(.type == "cgroup") | .options -= ["ro"]) // .)' } +# Helper function to get all online cpus. +function get_all_online_cpus() { + cat /sys/devices/system/cpu/online +} + +# Helper function to get the first online cpu. +function get_first_online_cpu() { + [[ $(get_all_online_cpus) =~ [^0-9]*([0-9]+)([-,][0-9]+)? ]] && echo "${BASH_REMATCH[1]}" +} + +# Helper function to set all cpus/mems in container cgroup cpuset. +function set_cgroup_cpuset_all_cpus() { + update_config ".linux.resources.cpu.cpus = \"$(get_all_online_cpus)\"" + + local mems + mems="$(cat /sys/devices/system/node/online 2>/dev/null || true)" + if [[ -n $mems ]]; then + update_config ".linux.resources.cpu.mems = \"$mems\"" + fi +} + # Fails the current test, providing the error given. function fail() { echo "$@" >&2