diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index eac17027af8..a7a38bbe404 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -28,6 +28,7 @@ import ( "google.golang.org/protobuf/proto" "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/systemd" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/intelrdt" "github.com/opencontainers/runc/libcontainer/system" @@ -2246,6 +2247,34 @@ func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Namespa }) } + // Set CPU affinity (setns init only, when the cgroup manager reports paths). + if it == initSetns && len(c.cgroupManager.GetPaths()) > 0 { + // Get the target container cgroup. + if cg, err := c.cgroupManager.GetCgroups(); err != nil { + return nil, fmt.Errorf("getting container cgroups: %w", err) + } else if cg.CpusetCpus != "" { + // Get the isolated CPU list. + d, err := os.ReadFile("/sys/devices/system/cpu/isolated") + // The above file may not be available in some environments + // due to /sys not being mounted; if it is not present we don't + // try to set CPU affinity and ignore the error. + // When an empty set is returned, the data length is equal + // to 1 (newline char); when it is set, its length is greater than 1, + // which means we may need to adjust CPU affinity shortly. + if err == nil && len(d) > 1 { + cpu, eligible, err := getEligibleCPU(cg.CpusetCpus, string(bytes.TrimSpace(d))) + if err != nil { + return nil, fmt.Errorf("getting eligible cpu: %w", err) + } else if eligible { + r.AddData(&Int32msg{ + Type: CPUAffinityAttr, + Value: uint32(cpu), + }) + } + } + } + } + return bytes.NewReader(r.Serialize()), nil } @@ -2280,3 +2309,57 @@ func requiresRootOrMappingTool(c *configs.Config) bool { } return !reflect.DeepEqual(c.GidMappings, gidMap) } + +// getEligibleCPU returns the first eligible CPU for CPU affinity before +// entering a cgroup cpuset. 
+// - when there are no cpuset cores: no eligible CPU +// - when there are no isolated cores: no eligible CPU +// - when cpuset cores are all isolated cores: return the first CPU of the cpuset +// - when cpuset cores are mixed between housekeeping/isolated cores: no eligible CPU. +func getEligibleCPU(cpusetList, isolatedList string) (int, bool, error) { + if isolatedList == "" || cpusetList == "" { + return 0, false, nil + } + + // The target container has a cgroup cpuset; get the bit range. + cpusetBits, err := systemd.RangeToBits(cpusetList) + if err != nil { + return 0, false, fmt.Errorf("parsing cpuset cpus list %s: %w", cpusetList, err) + } + + isolatedBits, err := systemd.RangeToBits(isolatedList) + if err != nil { + return 0, false, fmt.Errorf("parsing isolated cpus list %s: %w", isolatedList, err) + } + + affinityCore := 0 + isolatedCores := 0 + cpusetCores := 0 + + // Start from CPU core #0. + currentCore := 0 + // CPU cores start from the first slice element and bits are read + // from the least to the most significant bit. 
+ for byteRange := 0; byteRange < len(cpusetBits); byteRange++ { + for bit := 0; bit < 8; bit++ { + if cpusetBits[byteRange]&(1< 0 && isolatedCores == cpusetCores, nil +} diff --git a/libcontainer/container_linux_test.go b/libcontainer/container_linux_test.go index 6551de8085f..a5d25d62b6b 100644 --- a/libcontainer/container_linux_test.go +++ b/libcontainer/container_linux_test.go @@ -286,3 +286,87 @@ func TestGetContainerStateAfterUpdate(t *testing.T) { t.Fatalf("expected Memory to be 2048 but received %q", state.Config.Cgroups.Memory) } } + +func TestGetEligibleCPU(t *testing.T) { + tests := []struct { + name string + cpuset string + isolset string + expectedErr bool + expectedAffinityCore int + expectedEligible bool + }{ + { + name: "no cpuset", + isolset: "2-15,18-31,34-47", + expectedEligible: false, + }, + { + name: "no isolated set", + cpuset: "0-15", + expectedEligible: false, + }, + { + name: "bad cpuset format", + cpuset: "core0 to core15", + isolset: "2-15,18-31,34-47", + expectedErr: true, + }, + { + name: "bad isolated set format", + cpuset: "0-15", + isolset: "core0 to core15", + expectedErr: true, + }, + { + name: "no eligible core", + cpuset: "0-1,16-17,32-33", + isolset: "2-15,18-31,34-47", + expectedEligible: false, + }, + { + name: "no eligible core mixed", + cpuset: "0-31", + isolset: "2-15,18-31,34-47", + expectedEligible: false, + }, + { + name: "eligible core #4", + cpuset: "4-7", + isolset: "2-15,18-31,34-47", + expectedEligible: true, + expectedAffinityCore: 4, + }, + { + name: "eligible core #40", + cpuset: "40-47", + isolset: "2-15,18-31,34-47", + expectedEligible: true, + expectedAffinityCore: 40, + }, + { + name: "eligible core #24", + cpuset: "24-31", + isolset: "2-15,18-31,34-47", + expectedEligible: true, + expectedAffinityCore: 24, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + affinityCore, eligible, err := getEligibleCPU(tt.cpuset, tt.isolset) + if err != nil && !tt.expectedErr { + 
t.Fatalf("unexpected error: %s", err) + } else if err == nil && tt.expectedErr { + t.Fatalf("unexpected success") + } else if tt.expectedEligible && !eligible { + t.Fatalf("was expecting eligible core but no eligible core returned") + } else if !tt.expectedEligible && eligible { + t.Fatalf("was not expecting eligible core but got eligible core") + } else if tt.expectedEligible && tt.expectedAffinityCore != affinityCore { + t.Fatalf("expected affinity core %d: got %d instead", tt.expectedAffinityCore, affinityCore) + } + }) + } +} diff --git a/libcontainer/message_linux.go b/libcontainer/message_linux.go index 6d1107e875d..4cdd329191b 100644 --- a/libcontainer/message_linux.go +++ b/libcontainer/message_linux.go @@ -22,6 +22,7 @@ const ( UidmapPathAttr uint16 = 27288 GidmapPathAttr uint16 = 27289 MountSourcesAttr uint16 = 27290 + CPUAffinityAttr uint16 = 27291 ) type Int32msg struct { diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index 748791d6b82..577d8e746c2 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -95,6 +95,9 @@ struct nlconfig_t { /* Mount sources opened outside the container userns. */ char *mountsources; size_t mountsources_len; + + /* Temporary CPU affinity (CPU number) set before the cgroup cpuset transition. */ + uint32_t cpu_affinity; }; /* @@ -112,6 +115,7 @@ struct nlconfig_t { #define UIDMAPPATH_ATTR 27288 #define GIDMAPPATH_ATTR 27289 #define MOUNT_SOURCES_ATTR 27290 +#define CPU_AFFINITY_ATTR 27291 /* * Use the raw syscall for versions of glibc which don't include a function for @@ -383,6 +387,9 @@ static void nl_parse(int fd, struct nlconfig_t *config) if (len != size) bail("failed to read netlink payload, %zu != %zu", len, size); + /* No CPU affinity by default: all bits set, i.e. (int32_t)-1. */ + config->cpu_affinity = ~0; + /* Parse the netlink payload. 
*/ config->data = data; while (current < data + size) { @@ -431,6 +438,9 @@ static void nl_parse(int fd, struct nlconfig_t *config) config->mountsources = current; config->mountsources_len = payload_len; break; + case CPU_AFFINITY_ATTR: + config->cpu_affinity = readint32(current); + break; default: bail("unknown netlink message type %d", nlattr->nla_type); } @@ -1053,6 +1063,29 @@ void nsexec(void) bail("failed to sync with parent: SYNC_MOUNTSOURCES_ACK: got %u", s); } + /* + * Set a temporary single-CPU affinity before the cgroup cpuset transition. + * This handles a corner case when joining a container having all + * the processes running exclusively on isolated CPU cores: it forces + * the kernel to schedule the runc process on the first CPU core within the + * cgroup's cpuset. The introduction of the kernel commit + * 46a87b3851f0d6eb05e6d83d5c5a30df0eca8f76 has affected this deterministic + * scheduling behavior by distributing tasks across CPU cores within the + * cgroup's cpuset. Some intensive real-time applications rely on this + * deterministic behavior and use the first CPU core to run a slow thread + * while other CPU cores are fully used by real-time threads with SCHED_FIFO + * policy. Such applications prevent the runc process from joining a container + * when the runc process is randomly scheduled on a CPU core owned by a + * real-time thread. + */ + if ((int32_t) config.cpu_affinity >= 0) { + cpu_set_t set; + CPU_ZERO(&set); + CPU_SET(config.cpu_affinity, &set); + if (sched_setaffinity(0, sizeof(set), &set) == -1) + bail("sched_setaffinity failed"); + } + /* * TODO: What about non-namespace clone flags that we're dropping here? *