From a7dc8ff132d82b4d17ed52d90b0b9fe1b69a4b38 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Fri, 17 May 2024 17:21:15 -0700 Subject: [PATCH] Revert "Set temporary single CPU affinity..." There's too much logic here figuring out which CPUs to use. Runc is a low level tool and is not supposed to be that "smart". What's worse, this logic is executed on every exec, making it slower. Some of the logic in (*setnsProcess).start is executed even if no annotation is set, thus making ALL execs slow. Also, this should be a property of a process, rather than annotation. The plan is to rework this. This reverts commit afc23e33971b657c4a09c54b16c6139651171aad. Signed-off-by: Kir Kolyshkin --- docs/isolated-cpu-affinity-transition.md | 125 -------- features.go | 1 - libcontainer/cgroups/cgroups.go | 4 - libcontainer/cgroups/fs/fs.go | 27 -- libcontainer/cgroups/fs2/fs2.go | 28 -- libcontainer/cgroups/systemd/v1.go | 4 - libcontainer/cgroups/systemd/v2.go | 4 - libcontainer/cgroups/utils.go | 21 +- libcontainer/container_linux_test.go | 4 - libcontainer/process_linux.go | 269 +----------------- libcontainer/process_linux_test.go | 232 --------------- libcontainer/system/kernelparam/lookup.go | 41 --- .../system/kernelparam/lookup_test.go | 60 ---- tests/integration/exec.bats | 136 --------- tests/integration/helpers.bash | 21 -- 15 files changed, 16 insertions(+), 961 deletions(-) delete mode 100644 docs/isolated-cpu-affinity-transition.md delete mode 100644 libcontainer/process_linux_test.go delete mode 100644 libcontainer/system/kernelparam/lookup.go delete mode 100644 libcontainer/system/kernelparam/lookup_test.go diff --git a/docs/isolated-cpu-affinity-transition.md b/docs/isolated-cpu-affinity-transition.md deleted file mode 100644 index d2f3b12e899..00000000000 --- a/docs/isolated-cpu-affinity-transition.md +++ /dev/null @@ -1,125 +0,0 @@ -## Isolated CPU affinity transition - -The introduction of the kernel commit 46a87b3851f0d6eb05e6d83d5c5a30df0eca8f76 -in 5.7 has affected a deterministic scheduling behavior by distributing tasks -across CPU cores within a cgroups cpuset. It means that `runc exec` might be -impacted under some circumstances, by example when a container has been -created within a cgroup cpuset entirely composed of isolated CPU cores -usually sets either with `nohz_full` and/or `isolcpus` kernel boot parameters. - -Some containerized real-time applications are relying on this deterministic -behavior and uses the first CPU core to run a slow thread while other CPU -cores are fully used by the real-time threads with SCHED_FIFO policy. -Such applications can prevent runc process from joining a container when the -runc process is randomly scheduled on a CPU core owned by a real-time thread. - -Runc introduces a way to restore this behavior by adding the following -annotation to the container runtime spec (`config.json`): - -`org.opencontainers.runc.exec.isolated-cpu-affinity-transition` - -This annotation can take one of those values: - -* `temporary` to temporarily set the runc process CPU affinity to the first -isolated CPU core of the container cgroup cpuset. -* `definitive`: to definitively set the runc process CPU affinity to the first -isolated CPU core of the container cgroup cpuset. - -For example: - -```json - "annotations": { - "org.opencontainers.runc.exec.isolated-cpu-affinity-transition": "temporary" - } -``` - -__WARNING:__ `definitive` requires a kernel >= 6.2, also works with RHEL 9 and -above. - -### How it works? - -When enabled and during `runc exec`, runc is looking for the `nohz_full` kernel -boot parameter value and considers the CPUs in the list as isolated, it doesn't -look for `isolcpus` boot parameter, it just assumes that `isolcpus` value is -identical to `nohz_full` when specified. If `nohz_full` parameter is not found, -runc also attempts to read the list from `/sys/devices/system/cpu/nohz_full`. - -Once it gets the isolated CPU list, it returns an eligible CPU core within the -container cgroup cpuset based on those heuristics: - -* when there is not cpuset cores: no eligible CPU -* when there is not isolated cores: no eligible CPU -* when cpuset cores are not in isolated core list: no eligible CPU -* when cpuset cores are all isolated cores: return the first CPU of the cpuset -* when cpuset cores are mixed between housekeeping/isolated cores: return the - first housekeeping CPU not in isolated CPUs. - -The returned CPU core is then used to set the `runc init` CPU affinity before -the container cgroup cpuset transition. - -#### Transition example - -`nohz_full` has the isolated cores `4-7`. A container has been created with -the cgroup cpuset `4-7` to only run on the isolated CPU cores 4 to 7. -`runc exec` is called by a process with CPU affinity set to `0-3` - -* with `temporary` transition: - - runc exec (affinity 0-3) -> runc init (affinity 4) -> container process (affinity 4-7) - -* with `definitive` transition: - - runc exec (affinity 0-3) -> runc init (affinity 4) -> container process (affinity 4) - -The difference between `temporary` and `definitive` is the container process -affinity, `definitive` will constraint the container process to run on the -first isolated CPU core of the cgroup cpuset, while `temporary` restore the -CPU affinity to match the container cgroup cpuset. - -`definitive` transition might be helpful when `nohz_full` is used without -`isolcpus` to avoid runc and container process to be a noisy neighbour for -real-time applications. - -### How to use it with Kubernetes? - -Kubernetes doesn't manage container directly, instead it uses the Container Runtime -Interface (CRI) to communicate with a software implementing this interface and responsible -to manage the lifecycle of containers. There are popular CRI implementations like Containerd -and CRI-O. Those implementations allows to pass pod annotations to the container runtime -via the container runtime spec. Currently runc is the runtime used by default for both. - -#### Containerd configuration - -Containerd CRI uses runc by default but requires an extra step to pass the annotation to runc. -You have to whitelist `org.opencontainers.runc.exec.isolated-cpu-affinity-transition` as a pod -annotation allowed to be passed to the container runtime in `/etc/containerd/config.toml`: - -```toml -[plugins."io.containerd.grpc.v1.cri".containerd] - default_runtime_name = "runc" - [plugins."io.containerd.grpc.v1.cri".containerd.runtimes] - [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc] - runtime_type = "io.containerd.runc.v2" - base_runtime_spec = "/etc/containerd/cri-base.json" - pod_annotations = ["org.opencontainers.runc.exec.isolated-cpu-affinity-transition"] -``` - -#### CRI-O configuration - -CRI-O doesn't require any extra step, however some annotations could be excluded by -configuration. - -#### Pod deployment example - -```yaml -apiVersion: v1 -kind: Pod -metadata: - name: demo-pod - annotations: - org.opencontainers.runc.exec.isolated-cpu-affinity-transition: "temporary" -spec: - containers: - - name: demo - image: registry.com/demo:latest -``` diff --git a/features.go b/features.go index 190b7d10422..eff04c1b2d2 100644 --- a/features.go +++ b/features.go @@ -68,7 +68,6 @@ var featuresCommand = cli.Command{ "bundle", "org.systemd.property.", // prefix form "org.criu.config", - "org.opencontainers.runc.exec.isolated-cpu-affinity-transition", }, } diff --git a/libcontainer/cgroups/cgroups.go b/libcontainer/cgroups/cgroups.go index fa72f2eac2d..811f2d26e00 100644 --- a/libcontainer/cgroups/cgroups.go +++ b/libcontainer/cgroups/cgroups.go @@ -72,8 +72,4 @@ type Manager interface { // OOMKillCount reports OOM kill count for the cgroup. OOMKillCount() (uint64, error) - - // GetEffectiveCPUs returns the effective CPUs of the cgroup, an empty - // value means that the cgroups cpuset subsystem/controller is not enabled. - GetEffectiveCPUs() string } diff --git a/libcontainer/cgroups/fs/fs.go b/libcontainer/cgroups/fs/fs.go index 723d18b7637..d2decb127ca 100644 --- a/libcontainer/cgroups/fs/fs.go +++ b/libcontainer/cgroups/fs/fs.go @@ -4,8 +4,6 @@ import ( "errors" "fmt" "os" - "path/filepath" - "strings" "sync" "golang.org/x/sys/unix" @@ -265,28 +263,3 @@ func (m *Manager) OOMKillCount() (uint64, error) { return c, err } - -func (m *Manager) GetEffectiveCPUs() string { - return GetEffectiveCPUs(m.Path("cpuset"), m.cgroups) -} - -func GetEffectiveCPUs(cpusetPath string, cgroups *configs.Cgroup) string { - // Fast path. - if cgroups.CpusetCpus != "" { - return cgroups.CpusetCpus - } else if !strings.HasPrefix(cpusetPath, defaultCgroupRoot) { - return "" - } - - // Iterates until it goes to the cgroup root path. - // It's required for containers in which cpuset controller - // is not enabled, in this case a parent cgroup is used. - for path := cpusetPath; path != defaultCgroupRoot; path = filepath.Dir(path) { - cpus, err := fscommon.GetCgroupParamString(path, "cpuset.effective_cpus") - if err == nil { - return cpus - } - } - - return "" -} diff --git a/libcontainer/cgroups/fs2/fs2.go b/libcontainer/cgroups/fs2/fs2.go index 0a579b23e84..0760be74b97 100644 --- a/libcontainer/cgroups/fs2/fs2.go +++ b/libcontainer/cgroups/fs2/fs2.go @@ -4,13 +4,11 @@ import ( "errors" "fmt" "os" - "path/filepath" "strings" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" "github.com/opencontainers/runc/libcontainer/configs" - "github.com/opencontainers/runc/libcontainer/utils" ) type parseError = fscommon.ParseError @@ -34,9 +32,6 @@ func NewManager(config *configs.Cgroup, dirPath string) (*Manager, error) { if err != nil { return nil, err } - } else { - // Clean path for safety. - dirPath = utils.CleanPath(dirPath) } m := &Manager{ @@ -321,26 +316,3 @@ func CheckMemoryUsage(dirPath string, r *configs.Resources) error { return nil } - -func (m *Manager) GetEffectiveCPUs() string { - // Fast path. - if m.config.CpusetCpus != "" { - return m.config.CpusetCpus - } else if !strings.HasPrefix(m.dirPath, UnifiedMountpoint) { - return "" - } - - // Iterates until it goes outside of the cgroup root path. - // It's required for containers in which cpuset controller - // is not enabled, in this case a parent cgroup is used. - outsidePath := filepath.Dir(UnifiedMountpoint) - - for path := m.dirPath; path != outsidePath; path = filepath.Dir(path) { - cpus, err := fscommon.GetCgroupParamString(path, "cpuset.cpus.effective") - if err == nil { - return cpus - } - } - - return "" -} diff --git a/libcontainer/cgroups/systemd/v1.go b/libcontainer/cgroups/systemd/v1.go index e04e35682cd..8c64a5887a9 100644 --- a/libcontainer/cgroups/systemd/v1.go +++ b/libcontainer/cgroups/systemd/v1.go @@ -411,7 +411,3 @@ func (m *LegacyManager) Exists() bool { func (m *LegacyManager) OOMKillCount() (uint64, error) { return fs.OOMKillCount(m.Path("memory")) } - -func (m *LegacyManager) GetEffectiveCPUs() string { - return fs.GetEffectiveCPUs(m.Path("cpuset"), m.cgroups) -} diff --git a/libcontainer/cgroups/systemd/v2.go b/libcontainer/cgroups/systemd/v2.go index 2c4a8c4c3ac..b28ec6b22f2 100644 --- a/libcontainer/cgroups/systemd/v2.go +++ b/libcontainer/cgroups/systemd/v2.go @@ -514,7 +514,3 @@ func (m *UnifiedManager) Exists() bool { func (m *UnifiedManager) OOMKillCount() (uint64, error) { return m.fsMgr.OOMKillCount() } - -func (m *UnifiedManager) GetEffectiveCPUs() string { - return m.fsMgr.GetEffectiveCPUs() -} diff --git a/libcontainer/cgroups/utils.go b/libcontainer/cgroups/utils.go index 186cbc6413f..bf2ddd2e6b8 100644 --- a/libcontainer/cgroups/utils.go +++ b/libcontainer/cgroups/utils.go @@ -136,18 +136,18 @@ func GetAllSubsystems() ([]string, error) { return subsystems, nil } -func readProcsFile(dir string) ([]int, error) { - f, err := OpenFile(dir, CgroupProcesses, os.O_RDONLY) +func readProcsFile(dir string) (out []int, _ error) { + file := CgroupProcesses + retry := true + +again: + f, err := OpenFile(dir, file, os.O_RDONLY) if err != nil { return nil, err } defer f.Close() - var ( - s = bufio.NewScanner(f) - out = []int{} - ) - + s := bufio.NewScanner(f) for s.Scan() { if t := s.Text(); t != "" { pid, err := strconv.Atoi(t) @@ -157,6 +157,13 @@ func readProcsFile(dir string) ([]int, error) { out = append(out, pid) } } + if errors.Is(s.Err(), unix.ENOTSUP) && retry { + // For a threaded cgroup, read returns ENOTSUP, and we should + // read from cgroup.threads instead. + file = "cgroup.threads" + retry = false + goto again + } return out, s.Err() } diff --git a/libcontainer/container_linux_test.go b/libcontainer/container_linux_test.go index c76178d808a..99908376562 100644 --- a/libcontainer/container_linux_test.go +++ b/libcontainer/container_linux_test.go @@ -69,10 +69,6 @@ func (m *mockCgroupManager) GetFreezerState() (configs.FreezerState, error) { return configs.Thawed, nil } -func (m *mockCgroupManager) GetEffectiveCPUs() string { - return "" -} - type mockProcess struct { _pid int started uint64 diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index be824962640..3c3e797661a 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -1,13 +1,11 @@ package libcontainer import ( - "bytes" "context" "encoding/json" "errors" "fmt" "io" - "io/fs" "net" "os" "os/exec" @@ -23,12 +21,10 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fs2" - "github.com/opencontainers/runc/libcontainer/cgroups/systemd" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/intelrdt" "github.com/opencontainers/runc/libcontainer/logs" "github.com/opencontainers/runc/libcontainer/system" - "github.com/opencontainers/runc/libcontainer/system/kernelparam" "github.com/opencontainers/runc/libcontainer/userns" "github.com/opencontainers/runc/libcontainer/utils" ) @@ -137,58 +133,8 @@ func (p *setnsProcess) start() (retErr error) { // get the "before" value of oom kill count oom, _ := p.manager.OOMKillCount() - - // When greater or equal to zero, it will set a temporary single CPU - // affinity before cgroup cpuset transition, this handles a corner - // case when joining a container having all the processes running - // exclusively on isolated CPU cores to force the kernel to schedule - // runc process on the first CPU core within the cgroups cpuset. - // The introduction of the kernel commit 46a87b3851f0d6eb05e6d83d5c5a30df0eca8f76 - // in 5.7 has affected this deterministic scheduling behavior by - // distributing tasks across CPU cores within the cgroups cpuset. - // Some intensive real-time application are relying on this - // deterministic behavior and use the first CPU core to run a slow - // thread while other CPU cores are fully used by real-time threads - // with SCHED_FIFO policy. Such applications prevent runc process - // from joining a container when the runc process is randomly - // scheduled on a CPU core owned by a real-time thread. - cpuAffinity := -1 - resetCPUAffinity := true - - if len(p.manager.GetPaths()) > 0 { - // Get the target container cgroup. - if cg, err := p.manager.GetCgroups(); err != nil { - // Close the pipe to not be blocked in the parent. - p.comm.closeChild() - return fmt.Errorf("getting container cgroups: %w", err) - } else if cg.CpusetCpus != "" { - definitive := false - - _, annotations := utils.Annotations(p.config.Config.Labels) - cpuAffinity, definitive, err = isolatedCPUAffinityTransition( - os.DirFS("/"), - cg.CpusetCpus, - annotations, - ) - if err != nil { - // Close the pipe to not be blocked in the parent. - p.comm.closeChild() - return fmt.Errorf("getting CPU affinity: %w", err) - } else if definitive { - resetCPUAffinity = false - } - } - } - - var err error - - if cpuAffinity < 0 { - err = p.cmd.Start() - } else { - err = startCommandWithCPUAffinity(p.cmd, cpuAffinity) - } - - // Close the write-side of the pipes (controlled by child). + err := p.cmd.Start() + // close the child-side of the pipes (controlled by child) p.comm.closeChild() if err != nil { return fmt.Errorf("error starting setns process: %w", err) @@ -247,18 +193,6 @@ func (p *setnsProcess) start() (retErr error) { } } } - - if resetCPUAffinity { - // Fix the container process CPU affinity to match container cgroup cpuset, - // since kernel 6.2, the runc CPU affinity might affect the container process - // CPU affinity after cgroup cpuset transition, by example if runc is running - // with CPU affinity 0-1 and container process has cpuset.cpus set to 1-2, the - // resulting container process CPU affinity will be 1 instead of 1-2. - if err := fixProcessCPUAffinity(p.pid(), p.manager); err != nil { - return fmt.Errorf("error resetting container process CPU affinity: %w", err) - } - } - if p.intelRdtPath != "" { // if Intel RDT "resource control" filesystem path exists _, err := os.Stat(p.intelRdtPath) @@ -819,14 +753,6 @@ func (p *initProcess) start() (retErr error) { if err := p.manager.Set(p.config.Config.Cgroups.Resources); err != nil { return fmt.Errorf("error setting cgroup config for procHooks process: %w", err) } - // Reset container process CPU affinity to match container cgroup cpuset, - // since kernel 6.2, the runc CPU affinity might affect the container process - // CPU affinity after cgroup cpuset transition, by example if runc is running - // with CPU affinity 0-1 and container process has cpuset.cpus set to 1-2, the - // resulting container process CPU affinity will be 1 instead of 1-2. - if err := fixProcessCPUAffinity(p.pid(), p.manager); err != nil { - return fmt.Errorf("error resetting container process CPU affinity: %w", err) - } if p.intelRdtManager != nil { if err := p.intelRdtManager.Set(p.config.Config); err != nil { return fmt.Errorf("error setting Intel RDT config for procHooks process: %w", err) @@ -1078,196 +1004,5 @@ func setIOPriority(ioprio *configs.IOPriority) error { if errno != 0 { return fmt.Errorf("failed to set io priority: %w", errno) } - - return nil -} - -// isolatedCPUAffinityTransition returns a CPU affinity if necessary based on heuristics -// and org.opencontainers.runc.exec.isolated-cpu-affinity-transition annotation value. -func isolatedCPUAffinityTransition(rootFS fs.FS, cpusetList string, annotations map[string]string) (int, bool, error) { - const ( - isolatedCPUAffinityTransitionAnnotation = "org.opencontainers.runc.exec.isolated-cpu-affinity-transition" - nohzFullParam = "nohz_full" - ) - - definitive := false - - transition := annotations[isolatedCPUAffinityTransitionAnnotation] - switch transition { - case "temporary": - case "definitive": - definitive = true - default: - if transition != "" { - return -1, false, fmt.Errorf( - "unknown transition value %q for annotation %s", - transition, isolatedCPUAffinityTransitionAnnotation, - ) - } - return -1, false, nil - } - - kernelParams, err := kernelparam.LookupKernelBootParameters( - rootFS, - nohzFullParam, - ) - if err != nil { - // If /proc/cmdline does not exist or isn't readable, continue to read - // nohz_full from sysfs below. - if !errors.Is(err, os.ErrNotExist) && !errors.Is(err, os.ErrPermission) { - return -1, false, err - } - } - - // First get nohz_full value from kernel boot params, if not - // present, get the value from sysfs, to cover the case where - // CONFIG_NO_HZ_FULL_ALL is set, it also makes the integration - // tests not dependent on /sys/devices/system/cpu/nohz_full. - isolatedList := kernelParams[nohzFullParam] - if isolatedList == "" { - // Get the isolated CPU list, the error is not checked here because - // no matter what the error is, it returns without error the same way - // as with empty data. - isolatedData, _ := fs.ReadFile(rootFS, "sys/devices/system/cpu/nohz_full") - isolatedList = string(bytes.TrimSpace(isolatedData)) - if isolatedList == "" || isolatedList == "(null)" { - return -1, false, nil - } - } - - cpu, err := getEligibleCPU(cpusetList, isolatedList) - if err != nil { - return -1, false, fmt.Errorf("getting eligible cpu: %w", err) - } else if cpu == -1 { - definitive = false - } - - return cpu, definitive, nil -} - -// getEligibleCPU returns the first eligible CPU for CPU affinity before -// entering in a cgroup cpuset: -// - when there is not cpuset cores: no eligible CPU (-1) -// - when there is not isolated cores: no eligible CPU (-1) -// - when cpuset cores are not in isolated cores: no eligible CPU (-1) -// - when cpuset cores are all isolated cores: return the first CPU of the cpuset -// - when cpuset cores are mixed between housekeeping/isolated cores: return the -// first housekeeping CPU not in isolated CPUs. -func getEligibleCPU(cpusetList, isolatedList string) (int, error) { - if isolatedList == "" || cpusetList == "" { - return -1, nil - } - - // The target container has a cgroup cpuset, get the bit range. - cpusetBits, err := systemd.RangeToBits(cpusetList) - if err != nil { - return -1, fmt.Errorf("parsing cpuset cpus list %s: %w", cpusetList, err) - } - - isolatedBits, err := systemd.RangeToBits(isolatedList) - if err != nil { - return -1, fmt.Errorf("parsing isolated cpus list %s: %w", isolatedList, err) - } - - eligibleCore := -1 - isolatedCores := 0 - - // Start from cpu core #0. - currentCore := 0 - // Handle mixed sets. - mixed := false - - // CPU core start from the first slice element and bits are read - // from the least to the most significant bit. - for byteRange := 0; byteRange < len(cpusetBits); byteRange++ { - if byteRange >= len(isolatedBits) { - // No more isolated cores. - break - } - for bit := 0; bit < 8; bit++ { - if cpusetBits[byteRange]&(1< 0 { - return eligibleCore, nil - } - } - currentCore++ - } - } - - // We have an eligible CPU if there is at least one isolated CPU in the cpuset. - if isolatedCores == 0 { - return -1, nil - } - - return eligibleCore, nil -} - -// startCommandWithCPUAffinity starts a command on a specific CPU if set. -func startCommandWithCPUAffinity(cmd *exec.Cmd, cpuAffinity int) error { - errCh := make(chan error) - defer close(errCh) - - // Use a goroutine to dedicate an OS thread. - go func() { - cpuSet := new(unix.CPUSet) - cpuSet.Zero() - cpuSet.Set(cpuAffinity) - - // Don't call runtime.UnlockOSThread to terminate the OS thread - // when goroutine exits. - runtime.LockOSThread() - - // Command inherits the CPU affinity. - if err := unix.SchedSetaffinity(unix.Gettid(), cpuSet); err != nil { - errCh <- fmt.Errorf("setting os thread CPU affinity: %w", err) - return - } - - errCh <- cmd.Start() - }() - - return <-errCh -} - -// fixProcessCPUAffinity sets the CPU affinity of a container process -// to all CPUs allowed by container cgroup cpuset. -func fixProcessCPUAffinity(pid int, manager cgroups.Manager) error { - cpusetList := manager.GetEffectiveCPUs() - if cpusetList == "" { - // If the cgroup cpuset is not present, the container will inherit - // this process CPU affinity, so it can return without further actions. - return nil - } - - cpusetBits, err := systemd.RangeToBits(cpusetList) - if err != nil { - return fmt.Errorf("parsing cpuset cpus list %s: %w", cpusetList, err) - } - - processCPUSet := new(unix.CPUSet) - - for byteRange := 0; byteRange < len(cpusetBits); byteRange++ { - for bit := 0; bit < 8; bit++ { - processCPUSet.Set(byteRange*8 + bit) - } - } - - if err := unix.SchedSetaffinity(pid, processCPUSet); err != nil { - return fmt.Errorf("setting process PID %d CPU affinity: %w", pid, err) - } - return nil } diff --git a/libcontainer/process_linux_test.go b/libcontainer/process_linux_test.go deleted file mode 100644 index 8303643967b..00000000000 --- a/libcontainer/process_linux_test.go +++ /dev/null @@ -1,232 +0,0 @@ -package libcontainer - -import ( - "io/fs" - "testing" - "testing/fstest" -) - -func TestIsolatedCPUAffinityTransition(t *testing.T) { - const isolatedCPUAffinityTransitionAnnotation = "org.opencontainers.runc.exec.isolated-cpu-affinity-transition" - - noAffinity := -1 - temporaryTransition := "temporary" - definitiveTransition := "definitive" - - tests := []struct { - name string - testFS fs.FS - cpuset string - expectedErr bool - expectedAffinityCore int - expectedDefinitiveTransition bool - annotations map[string]string - }{ - { - name: "no affinity", - cpuset: "0-15", - testFS: fstest.MapFS{ - "sys/devices/system/cpu/nohz_full": &fstest.MapFile{Data: []byte("0-4\n")}, - }, - expectedAffinityCore: noAffinity, - expectedDefinitiveTransition: false, - }, - { - name: "affinity match with temporary transition", - cpuset: "3-4", - testFS: fstest.MapFS{ - "sys/devices/system/cpu/nohz_full": &fstest.MapFile{Data: []byte("0-4\n")}, - }, - expectedAffinityCore: 3, - expectedDefinitiveTransition: false, - annotations: map[string]string{ - isolatedCPUAffinityTransitionAnnotation: temporaryTransition, - }, - }, - { - name: "affinity match with temporary transition and nohz_full boot param", - cpuset: "3-4", - testFS: fstest.MapFS{ - "proc/cmdline": &fstest.MapFile{Data: []byte("nohz_full=0-4\n")}, - }, - expectedAffinityCore: 3, - expectedDefinitiveTransition: false, - annotations: map[string]string{ - isolatedCPUAffinityTransitionAnnotation: temporaryTransition, - }, - }, - { - name: "affinity match with definitive transition", - cpuset: "3-4", - testFS: fstest.MapFS{ - "sys/devices/system/cpu/nohz_full": &fstest.MapFile{Data: []byte("0-4\n")}, - }, - expectedAffinityCore: 3, - expectedDefinitiveTransition: true, - annotations: map[string]string{ - isolatedCPUAffinityTransitionAnnotation: definitiveTransition, - }, - }, - { - name: "affinity match with definitive transition and nohz_full boot param", - cpuset: "3-4", - testFS: fstest.MapFS{ - "proc/cmdline": &fstest.MapFile{Data: []byte("nohz_full=0-4\n")}, - }, - expectedAffinityCore: 3, - expectedDefinitiveTransition: true, - annotations: map[string]string{ - isolatedCPUAffinityTransitionAnnotation: definitiveTransition, - }, - }, - { - name: "affinity error with bad isolated set", - cpuset: "0-15", - testFS: fstest.MapFS{ - "sys/devices/system/cpu/nohz_full": &fstest.MapFile{Data: []byte("bad_isolated_set\n")}, - }, - expectedErr: true, - expectedAffinityCore: noAffinity, - annotations: map[string]string{ - isolatedCPUAffinityTransitionAnnotation: temporaryTransition, - }, - }, - { - name: "affinity error with bad isolated set for nohz_full boot param", - cpuset: "0-15", - testFS: fstest.MapFS{ - "proc/cmdline": &fstest.MapFile{Data: []byte("nohz_full=bad_isolated_set\n")}, - }, - expectedErr: true, - expectedAffinityCore: noAffinity, - annotations: map[string]string{ - isolatedCPUAffinityTransitionAnnotation: temporaryTransition, - }, - }, - { - name: "no affinity with null isolated set value", - cpuset: "0-15", - testFS: fstest.MapFS{ - "sys/devices/system/cpu/nohz_full": &fstest.MapFile{Data: []byte("(null)\n")}, - }, - expectedAffinityCore: noAffinity, - expectedDefinitiveTransition: false, - annotations: map[string]string{ - isolatedCPUAffinityTransitionAnnotation: temporaryTransition, - }, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - affinityCore, definitive, err := isolatedCPUAffinityTransition(tt.testFS, tt.cpuset, tt.annotations) - if err != nil && !tt.expectedErr { - t.Fatalf("unexpected error: %s", err) - } else if err == nil && tt.expectedErr { - t.Fatalf("unexpected success") - } else if tt.expectedDefinitiveTransition != definitive { - t.Fatalf("expected reset affinity %t: got %t instead", tt.expectedDefinitiveTransition, definitive) - } else if tt.expectedAffinityCore != affinityCore { - t.Fatalf("expected affinity core %d: got %d instead", tt.expectedAffinityCore, affinityCore) - } - }) - } -} - -func TestGetEligibleCPU(t *testing.T) { - tests := []struct { - name string - cpuset string - isolset string - expectedErr bool - expectedAffinityCore int - expectedEligible bool - }{ - { - name: "no cpuset", - isolset: "2-15,18-31,34-47", - expectedEligible: false, - }, - { - name: "no isolated set", - cpuset: "0-15", - expectedEligible: false, - }, - { - name: "bad cpuset format", - cpuset: "core0 to core15", - isolset: "2-15,18-31,34-47", - expectedErr: true, - }, - { - name: "bad isolated set format", - cpuset: "0-15", - isolset: "core0 to core15", - expectedErr: true, - }, - { - name: "no eligible core", - cpuset: "0-1,16-17,32-33", - isolset: "2-15,18-31,34-47", - expectedEligible: false, - }, - { - name: "no eligible core inverted", - cpuset: "2-15,18-31,34-47", - isolset: "0-1,16-17,32-33", - expectedEligible: false, - }, - { - name: "eligible core mixed", - cpuset: "8-31", - isolset: "2-15,18-31,34-47", - expectedEligible: true, - expectedAffinityCore: 16, - }, - { - name: "eligible core #4", - cpuset: "4-7", - isolset: "2-15,18-31,34-47", - expectedEligible: true, - expectedAffinityCore: 4, - }, - { - name: "eligible core #40", - cpuset: "40-47", - isolset: "2-15,18-31,34-47", - expectedEligible: true, - expectedAffinityCore: 40, - }, - { - name: "eligible core #24", - cpuset: "24-31", - isolset: "2-15,18-31,34-47", - expectedEligible: true, - expectedAffinityCore: 24, - }, - { - name: "no eligible core small isolated set", - cpuset: "60-63", - isolset: "0-1", - expectedEligible: false, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - affinityCore, err := getEligibleCPU(tt.cpuset, tt.isolset) - eligible := affinityCore >= 0 - if err != nil && !tt.expectedErr { - t.Fatalf("unexpected error: %s", err) - } else if err == nil && tt.expectedErr { - t.Fatalf("unexpected success") - } else if tt.expectedEligible && !eligible { - t.Fatalf("was expecting eligible core but no eligible core returned") - } else if !tt.expectedEligible && eligible { - t.Fatalf("was not expecting eligible core but got eligible core") - } else if tt.expectedEligible && tt.expectedAffinityCore != affinityCore { - t.Fatalf("expected affinity core %d: got %d instead", tt.expectedAffinityCore, affinityCore) - } - }) - } -} diff --git a/libcontainer/system/kernelparam/lookup.go b/libcontainer/system/kernelparam/lookup.go deleted file mode 100644 index 4cf452412ff..00000000000 --- a/libcontainer/system/kernelparam/lookup.go +++ /dev/null @@ -1,41 +0,0 @@ -package kernelparam - -import ( - "io/fs" - "strings" -) - -func runeFilter(c rune) bool { - return c < '!' || c > '~' -} - -// LookupKernelBootParameters returns the selected kernel parameters specified -// in the kernel command line. The parameters are returned as a map of key-value pairs. -func LookupKernelBootParameters(rootFS fs.FS, lookupParameters ...string) (map[string]string, error) { - cmdline, err := fs.ReadFile(rootFS, "proc/cmdline") - if err != nil { - return nil, err - } - - kernelParameters := make(map[string]string) - remaining := len(lookupParameters) - - for _, parameter := range strings.FieldsFunc(string(cmdline), runeFilter) { - if remaining == 0 { - break - } - idx := strings.IndexByte(parameter, '=') - if idx == -1 { - continue - } - for _, lookupParam := range lookupParameters { - if lookupParam == parameter[:idx] { - kernelParameters[lookupParam] = parameter[idx+1:] - remaining-- - break - } - } - } - - return kernelParameters, nil -} diff --git a/libcontainer/system/kernelparam/lookup_test.go b/libcontainer/system/kernelparam/lookup_test.go deleted file mode 100644 index 9d906301eb4..00000000000 --- a/libcontainer/system/kernelparam/lookup_test.go +++ /dev/null @@ -1,60 +0,0 @@ -package kernelparam - -import ( - "testing" - "testing/fstest" -) - -func TestLookupKernelBootParameters(t *testing.T) { - for _, test := range []struct { - cmdline string - lookupParameters []string - expectedKernelParameters map[string]string - }{ - { - cmdline: "root=/dev/sda1 ro console=ttyS0 console=tty0", - lookupParameters: []string{"root"}, - expectedKernelParameters: map[string]string{ - "root": "/dev/sda1", - }, - }, - { - cmdline: "ro runc.kernel_parameter=a_value console=ttyS0 console=tty0", - lookupParameters: []string{"runc.kernel_parameter"}, - expectedKernelParameters: map[string]string{ - "runc.kernel_parameter": "a_value", - }, - }, - { - cmdline: "ro runc.kernel_parameter_a=value_a runc.kernel_parameter_b=value_a:value_b", - lookupParameters: []string{ - "runc.kernel_parameter_a", - "runc.kernel_parameter_b", - }, - expectedKernelParameters: map[string]string{ - "runc.kernel_parameter_a": "value_a", - "runc.kernel_parameter_b": "value_a:value_b", - }, - }, - { - cmdline: "root=/dev/sda1 ro console=ttyS0 console=tty0", - lookupParameters: []string{"runc.kernel_parameter_a"}, - expectedKernelParameters: map[string]string{}, - }, - } { - params, err := LookupKernelBootParameters(fstest.MapFS{ - "proc/cmdline": &fstest.MapFile{Data: []byte(test.cmdline + "\n")}, - }, test.lookupParameters...) - if err != nil { - t.Fatalf("unexpected error: %s", err) - } - if len(params) != len(test.expectedKernelParameters) { - t.Fatalf("expected %d parameters, got %d", len(test.expectedKernelParameters), len(params)) - } - for k, v := range test.expectedKernelParameters { - if params[k] != v { - t.Fatalf("expected parameter %s to be %s, got %s", k, v, params[k]) - } - } - } -} diff --git a/tests/integration/exec.bats b/tests/integration/exec.bats index c0ea7f5dce6..a92a237809d 100644 --- a/tests/integration/exec.bats +++ b/tests/integration/exec.bats @@ -340,139 +340,3 @@ EOF [ ${#lines[@]} -eq 1 ] [[ ${lines[0]} = *"exec /run.sh: no such file or directory"* ]] } - -@test "runc exec with isolated cpus affinity temporary transition [cgroup cpuset]" { - requires root cgroups_cpuset - - tmp=$(mktemp -d "$BATS_RUN_TMPDIR/runc.XXXXXX") - - set_cgroup_cpuset_all_cpus - local all_cpus - all_cpus="$(get_all_online_cpus)" - - # set temporary isolated CPU affinity transition - update_config '.annotations += {"org.opencontainers.runc.exec.isolated-cpu-affinity-transition": "temporary"}' - - runc run -d --console-socket "$CONSOLE_SOCKET" test_isolated_temporary_transition - [ "$status" -eq 0 ] - - # set all online cpus as isolated - echo "nohz_full=$all_cpus" >"$tmp/cmdline" - - mount --bind "$tmp/cmdline" /proc/cmdline - - runc exec test_isolated_temporary_transition grep "Cpus_allowed_list:" /proc/self/status - - umount /proc/cmdline - - [ "$status" -eq 0 ] - [[ "${lines[0]}" == "Cpus_allowed_list: $all_cpus" ]] -} - -@test "runc exec with isolated cpus affinity definitive transition [cgroup cpuset]" { - requires root cgroups_cpuset - - tmp=$(mktemp -d "$BATS_RUN_TMPDIR/runc.XXXXXX") - - set_cgroup_cpuset_all_cpus - local all_cpus - all_cpus="$(get_all_online_cpus)" - - # set definitive isolated CPU affinity transition - update_config '.annotations += {"org.opencontainers.runc.exec.isolated-cpu-affinity-transition": "definitive"}' - - runc run -d --console-socket "$CONSOLE_SOCKET" test_isolated_definitive_transition - [ "$status" -eq 0 ] - - # set all online cpus as isolated - echo "nohz_full=$all_cpus" >"$tmp/cmdline" - - mount --bind "$tmp/cmdline" /proc/cmdline - - runc exec test_isolated_definitive_transition grep "Cpus_allowed_list:" /proc/self/status - - umount /proc/cmdline - - [ "$status" -eq 0 ] - - load /etc/os-release - - # fix unbound variable in condition below - VERSION_ID=${VERSION_ID:-} - - allowed_cpus=$all_cpus - # use first cpu on systems with RHEL >= 9 or systems with kernel >= 6.2 - if [[ "${ID_LIKE:-}" =~ "rhel" && "${VERSION_ID%%.*}" -ge "9" ]] || is_kernel_gte 6.2; then - allowed_cpus="$(get_first_online_cpu)" - fi - - [[ "${lines[0]}" == "Cpus_allowed_list: $allowed_cpus" ]] -} - -@test "runc exec with isolated cpus affinity bad transition [cgroup cpuset]" { - requires root cgroups_cpuset - - tmp=$(mktemp -d "$BATS_RUN_TMPDIR/runc.XXXXXX") - - set_cgroup_cpuset_all_cpus - local all_cpus - all_cpus="$(get_all_online_cpus)" - - # set a bad isolated CPU affinity transition - update_config '.annotations += {"org.opencontainers.runc.exec.isolated-cpu-affinity-transition": "bad"}' - - runc run -d --console-socket "$CONSOLE_SOCKET" test_isolated_bad_transition - [ "$status" -eq 0 ] - - # set all online cpus as isolated - echo "nohz_full=$all_cpus" >"$tmp/cmdline" - - mount --bind "$tmp/cmdline" /proc/cmdline - - runc exec test_isolated_bad_transition true - - umount /proc/cmdline - - [ "$status" -eq 255 ] -} - -@test "runc exec with taskset affinity [cgroup cpuset]" { - requires root cgroups_cpuset - - set_cgroup_cpuset_all_cpus - local all_cpus - all_cpus="$(get_all_online_cpus)" - - taskset -p -c "$(get_first_online_cpu)" $$ - - runc run -d --console-socket "$CONSOLE_SOCKET" test_with_taskset - [ "$status" -eq 0 ] - - runc exec test_with_taskset grep "Cpus_allowed_list:" /proc/1/status - [ "$status" -eq 0 ] - [[ "${lines[0]}" == "Cpus_allowed_list: $all_cpus" ]] - - runc exec test_with_taskset grep "Cpus_allowed_list:" /proc/self/status - [ "$status" -eq 0 ] - [[ "${lines[0]}" == "Cpus_allowed_list: $all_cpus" ]] -} - -@test "runc exec with taskset affinity [rootless cgroups_v2]" { - requires rootless cgroups_v2 - - local all_cpus - all_cpus="$(get_all_online_cpus)" - - taskset -p -c "$(get_first_online_cpu)" $$ - - runc run -d --console-socket "$CONSOLE_SOCKET" test_with_taskset - [ "$status" -eq 0 ] - - runc exec test_with_taskset grep "Cpus_allowed_list:" /proc/1/status - [ "$status" -eq 0 ] - [[ "${lines[0]}" == "Cpus_allowed_list: $all_cpus" ]] - - runc exec test_with_taskset grep "Cpus_allowed_list:" /proc/self/status - [ "$status" -eq 0 ] - [[ "${lines[0]}" == "Cpus_allowed_list: $all_cpus" ]] -} diff --git a/tests/integration/helpers.bash b/tests/integration/helpers.bash index e1d2a80af84..8e9c37f4b5b 100755 --- a/tests/integration/helpers.bash +++ b/tests/integration/helpers.bash @@ -354,27 +354,6 @@ function set_cgroup_mount_writable() { update_config '.mounts |= map((select(.type == "cgroup") | .options -= ["ro"]) // .)' } -# Helper function to get all online cpus. -function get_all_online_cpus() { - cat /sys/devices/system/cpu/online -} - -# Helper function to get the first online cpu. -function get_first_online_cpu() { - [[ $(get_all_online_cpus) =~ [^0-9]*([0-9]+)([-,][0-9]+)? ]] && echo "${BASH_REMATCH[1]}" -} - -# Helper function to set all cpus/mems in container cgroup cpuset. -function set_cgroup_cpuset_all_cpus() { - update_config ".linux.resources.cpu.cpus = \"$(get_all_online_cpus)\"" - - local mems - mems="$(cat /sys/devices/system/node/online 2>/dev/null || true)" - if [[ -n $mems ]]; then - update_config ".linux.resources.cpu.mems = \"$mems\"" - fi -} - # Fails the current test, providing the error given. function fail() { echo "$@" >&2