From 52fd5d82fe5051062f862634b4198c528b178fa7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Clerget?= Date: Fri, 30 Jun 2023 15:49:47 +0200 Subject: [PATCH] Set temporary single CPU affinity before cgroup cpuset transition. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This handles a corner case when joining a container having all the processes running exclusively on isolated CPU cores to force the kernel to schedule the runc process on the first CPU core within the cgroups cpuset. The introduction of the kernel commit 46a87b3851f0d6eb05e6d83d5c5a30df0eca8f76 has affected this deterministic scheduling behavior by distributing tasks across CPU cores within the cgroups cpuset. Some intensive real-time applications rely on this deterministic behavior and use the first CPU core to run a slow thread while other CPU cores are fully used by real-time threads with SCHED_FIFO policy. Such applications prevent the runc process from joining a container when the runc process is randomly scheduled on a CPU core owned by a real-time thread. 
Signed-off-by: Cédric Clerget --- libcontainer/container_linux.go | 83 +++++++++++++++++++++++++++ libcontainer/container_linux_test.go | 84 ++++++++++++++++++++++++++++ libcontainer/message_linux.go | 1 + libcontainer/nsenter/nsexec.c | 33 +++++++++++ 4 files changed, 201 insertions(+) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index eac17027af8..a7a38bbe404 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -28,6 +28,7 @@ import ( "google.golang.org/protobuf/proto" "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/systemd" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/intelrdt" "github.com/opencontainers/runc/libcontainer/system" @@ -2246,6 +2247,34 @@ func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Namespa }) } + // set CPU affinity + if it == initSetns && len(c.cgroupManager.GetPaths()) > 0 { + // get the target container cgroup + if cg, err := c.cgroupManager.GetCgroups(); err != nil { + return nil, fmt.Errorf("getting container cgroups: %w", err) + } else if cg.CpusetCpus != "" { + // get the isolated CPU list + d, err := os.ReadFile("/sys/devices/system/cpu/isolated") + // The above file may not be available in some environment + // due to /sys not being mounted, if not present we don't + // try to set CPU affinity and ignore the error. + // When an empty set is returned, the data length is equal + // to 1 (newline char), when set its length is greater than 1 + // which means we may need to adjust CPU affinity shortly. 
+ if err == nil && len(d) > 1 { + cpu, eligible, err := getEligibleCPU(cg.CpusetCpus, string(bytes.TrimSpace(d))) + if err != nil { + return nil, fmt.Errorf("getting eligible cpu: %w", err) + } else if eligible { + r.AddData(&Int32msg{ + Type: CPUAffinityAttr, + Value: uint32(cpu), + }) + } + } + } + } + return bytes.NewReader(r.Serialize()), nil } @@ -2280,3 +2309,57 @@ func requiresRootOrMappingTool(c *configs.Config) bool { } return !reflect.DeepEqual(c.GidMappings, gidMap) } + +// getEligibleCPU returns the first eligible CPU for CPU affinity before +// entering in a cgroup cpuset. +// - when there is not cpuset cores: no eligible CPU +// - when there is not isolated cores: no eligible CPU +// - when cpuset cores are all isolated cores: return first CPU of the cpuset +// - when cpuset cores are mixed between housekeeping/isolated cores: no eligible CPU. +func getEligibleCPU(cpusetList, isolatedList string) (int, bool, error) { + if isolatedList == "" || cpusetList == "" { + return 0, false, nil + } + + // The target container has a cgroup cpuset, get the bit range. + cpusetBits, err := systemd.RangeToBits(cpusetList) + if err != nil { + return 0, false, fmt.Errorf("parsing cpuset cpus list %s: %w", cpusetList, err) + } + + isolatedBits, err := systemd.RangeToBits(isolatedList) + if err != nil { + return 0, false, fmt.Errorf("parsing isolated cpus list %s: %w", isolatedList, err) + } + + affinityCore := 0 + isolatedCores := 0 + cpusetCores := 0 + + // start from cpu core #0 + currentCore := 0 + // CPU core start from the first slice element and bits are read + // from the least to the most significant bit. 
+ for byteRange := 0; byteRange < len(cpusetBits); byteRange++ { + for bit := 0; bit < 8; bit++ { + if cpusetBits[byteRange]&(1< 0 && isolatedCores == cpusetCores, nil +} diff --git a/libcontainer/container_linux_test.go b/libcontainer/container_linux_test.go index 6551de8085f..a5d25d62b6b 100644 --- a/libcontainer/container_linux_test.go +++ b/libcontainer/container_linux_test.go @@ -286,3 +286,87 @@ func TestGetContainerStateAfterUpdate(t *testing.T) { t.Fatalf("expected Memory to be 2048 but received %q", state.Config.Cgroups.Memory) } } + +func TestGetEligibleCPU(t *testing.T) { + tests := []struct { + name string + cpuset string + isolset string + expectedErr bool + expectedAffinityCore int + expectedEligible bool + }{ + { + name: "no cpuset", + isolset: "2-15,18-31,34-47", + expectedEligible: false, + }, + { + name: "no isolated set", + cpuset: "0-15", + expectedEligible: false, + }, + { + name: "bad cpuset format", + cpuset: "core0 to core15", + isolset: "2-15,18-31,34-47", + expectedErr: true, + }, + { + name: "bad isolated set format", + cpuset: "0-15", + isolset: "core0 to core15", + expectedErr: true, + }, + { + name: "no eligible core", + cpuset: "0-1,16-17,32-33", + isolset: "2-15,18-31,34-47", + expectedEligible: false, + }, + { + name: "no eligible core mixed", + cpuset: "0-31", + isolset: "2-15,18-31,34-47", + expectedEligible: false, + }, + { + name: "eligible core #4", + cpuset: "4-7", + isolset: "2-15,18-31,34-47", + expectedEligible: true, + expectedAffinityCore: 4, + }, + { + name: "eligible core #40", + cpuset: "40-47", + isolset: "2-15,18-31,34-47", + expectedEligible: true, + expectedAffinityCore: 40, + }, + { + name: "eligible core #24", + cpuset: "24-31", + isolset: "2-15,18-31,34-47", + expectedEligible: true, + expectedAffinityCore: 24, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + affinityCore, eligible, err := getEligibleCPU(tt.cpuset, tt.isolset) + if err != nil && !tt.expectedErr { + 
t.Fatalf("unexpected error: %s", err) + } else if err == nil && tt.expectedErr { + t.Fatalf("unexpected success") + } else if tt.expectedEligible && !eligible { + t.Fatalf("was expecting eligible core but no eligible core returned") + } else if !tt.expectedEligible && eligible { + t.Fatalf("was not expecting eligible core but got eligible core") + } else if tt.expectedEligible && tt.expectedAffinityCore != affinityCore { + t.Fatalf("expected affinity core %d: got %d instead", tt.expectedAffinityCore, affinityCore) + } + }) + } +} diff --git a/libcontainer/message_linux.go b/libcontainer/message_linux.go index 6d1107e875d..4cdd329191b 100644 --- a/libcontainer/message_linux.go +++ b/libcontainer/message_linux.go @@ -22,6 +22,7 @@ const ( UidmapPathAttr uint16 = 27288 GidmapPathAttr uint16 = 27289 MountSourcesAttr uint16 = 27290 + CPUAffinityAttr uint16 = 27291 ) type Int32msg struct { diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index 748791d6b82..577d8e746c2 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -95,6 +95,9 @@ struct nlconfig_t { /* Mount sources opened outside the container userns. */ char *mountsources; size_t mountsources_len; + + /* Temporary CPU affinity before cgroup cpuset transition. */ + uint32_t cpu_affinity; }; /* @@ -112,6 +115,7 @@ struct nlconfig_t { #define UIDMAPPATH_ATTR 27288 #define GIDMAPPATH_ATTR 27289 #define MOUNT_SOURCES_ATTR 27290 +#define CPU_AFFINITY_ATTR 27291 /* * Use the raw syscall for versions of glibc which don't include a function for @@ -383,6 +387,9 @@ static void nl_parse(int fd, struct nlconfig_t *config) if (len != size) bail("failed to read netlink payload, %zu != %zu", len, size); + /* No cpu affinity by default: int32(-1) */ + config->cpu_affinity = ~0; + /* Parse the netlink payload. 
*/ config->data = data; while (current < data + size) { @@ -431,6 +438,9 @@ static void nl_parse(int fd, struct nlconfig_t *config) config->mountsources = current; config->mountsources_len = payload_len; break; + case CPU_AFFINITY_ATTR: + config->cpu_affinity = readint32(current); + break; default: bail("unknown netlink message type %d", nlattr->nla_type); } @@ -1053,6 +1063,29 @@ void nsexec(void) bail("failed to sync with parent: SYNC_MOUNTSOURCES_ACK: got %u", s); } + /* + * Set temporary single CPU affinity before cgroup cpuset transition, + * this handles a corner case when joining a container having all + * the processes running exclusively on isolated CPU cores to force + * the kernel to schedule runc process on the first CPU core within the + * cgroups cpuset. The introduction of the kernel commit + * 46a87b3851f0d6eb05e6d83d5c5a30df0eca8f76 has affected this deterministic + * scheduling behavior by distributing tasks across CPU cores within the + * cgroups cpuset. Some intensive real-time application are relying on this + * deterministic behavior and use the first CPU core to run a slow thread + * while other CPU cores are fully used by real-time threads with SCHED_FIFO + * policy. Such applications prevent runc process from joining a container + * when the runc process is randomly scheduled on a CPU core owned by a + * real-time thread. + */ + if ((int32_t) config.cpu_affinity >= 0) { + cpu_set_t set; + CPU_ZERO(&set); + CPU_SET(config.cpu_affinity, &set); + if (sched_setaffinity(0, sizeof(set), &set) == -1) + bail("sched_setaffinity failed"); + } + /* * TODO: What about non-namespace clone flags that we're dropping here? *