Skip to content

Commit

Permalink
Set temporary single CPU affinity before cgroup cpuset transition.
Browse files Browse the repository at this point in the history
This handles a corner case when joining a container having all
the processes running exclusively on isolated CPU cores to force
the kernel to schedule runc process on the first CPU core within the
cgroups cpuset.

The introduction of the kernel commit
46a87b3851f0d6eb05e6d83d5c5a30df0eca8f76 has affected this deterministic
scheduling behavior by distributing tasks across CPU cores within the
cgroups cpuset. Some intensive real-time applications are relying on this
deterministic behavior and use the first CPU core to run a slow thread
while other CPU cores are fully used by real-time threads with SCHED_FIFO
policy. Such applications prevent the runc process from joining a container
when the runc process is randomly scheduled on a CPU core owned by a
real-time thread.

Signed-off-by: Cédric Clerget <[email protected]>
  • Loading branch information
cclerget committed Jul 3, 2023
1 parent 164e4bc commit 85f2d35
Show file tree
Hide file tree
Showing 4 changed files with 207 additions and 0 deletions.
83 changes: 83 additions & 0 deletions libcontainer/container_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
"google.golang.org/protobuf/proto"

"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/intelrdt"
"github.com/opencontainers/runc/libcontainer/system"
Expand Down Expand Up @@ -2246,6 +2247,34 @@ func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Namespa
})
}

// set CPU affinity
if it == initSetns && len(c.cgroupManager.GetPaths()) > 0 {
// get the target container cgroup
if cg, err := c.cgroupManager.GetCgroups(); err != nil {
return nil, fmt.Errorf("getting container cgroups: %w", err)
} else if cg.CpusetCpus != "" {
// get the isolated CPU list
d, err := os.ReadFile("/sys/devices/system/cpu/isolated")
// The above file may not be available in some environment
// due to /sys not being mounted, if not present we don't
// try to set CPU affinity and ignore the error.
// When an empty set is returned, the data length is equal
// to 1 (newline char), when set its length is greater than 1
// which means we may need to adjust CPU affinity shortly.
if err == nil && len(d) > 1 {
cpu, eligible, err := getEligibleCPU(cg.CpusetCpus, string(bytes.TrimSpace(d)))
if err != nil {
return nil, fmt.Errorf("getting eligible cpu: %w", err)
} else if eligible {
r.AddData(&Int32msg{
Type: CPUAffinityAttr,
Value: uint32(cpu),
})
}
}
}
}

return bytes.NewReader(r.Serialize()), nil
}

Expand Down Expand Up @@ -2280,3 +2309,57 @@ func requiresRootOrMappingTool(c *configs.Config) bool {
}
return !reflect.DeepEqual(c.GidMappings, gidMap)
}

// getEligibleCPU picks the CPU to pin to temporarily before moving into a
// cgroup cpuset. Both arguments are CPU lists in range notation such as
// "2-15,18-31".
//
// The boolean result reports whether pinning is warranted:
//   - either list is empty: not eligible
//   - every core of the cpuset is also an isolated core: eligible, and the
//     returned int is the lowest-numbered core of the cpuset
//   - the cpuset contains at least one non-isolated (housekeeping) core:
//     not eligible
//
// An error is returned only when one of the lists cannot be parsed.
func getEligibleCPU(cpusetList, isolatedList string) (int, bool, error) {
	if cpusetList == "" || isolatedList == "" {
		return 0, false, nil
	}

	// Convert both lists to bit masks; the cpuset list is parsed first so
	// that its parse error takes precedence.
	cpusetMask, err := systemd.RangeToBits(cpusetList)
	if err != nil {
		return 0, false, fmt.Errorf("parsing cpuset cpus list %s: %w", cpusetList, err)
	}

	isolatedMask, err := systemd.RangeToBits(isolatedList)
	if err != nil {
		return 0, false, fmt.Errorf("parsing isolated cpus list %s: %w", isolatedList, err)
	}

	// firstCore stays negative until a core belonging to the cpuset is seen.
	firstCore := -1
	allIsolated := true

	// Core #0 is the least significant bit of the first byte; each
	// following byte covers the next eight cores.
	for idx, cpusetByte := range cpusetMask {
		// Bytes past the end of the isolated mask hold no isolated cores.
		var isolatedByte byte
		if idx < len(isolatedMask) {
			isolatedByte = isolatedMask[idx]
		}

		for bit := 0; bit < 8; bit++ {
			mask := byte(1) << bit
			if cpusetByte&mask == 0 {
				continue
			}
			// Remember the first core of the cpuset as the affinity target.
			if firstCore < 0 {
				firstCore = idx*8 + bit
			}
			// A single housekeeping core makes the whole cpuset ineligible.
			if isolatedByte&mask == 0 {
				allIsolated = false
			}
		}
	}

	if firstCore < 0 {
		// The cpuset mask had no bit set at all.
		return 0, false, nil
	}

	// Eligible only when the cpuset is made exclusively of isolated cores.
	return firstCore, allIsolated, nil
}
90 changes: 90 additions & 0 deletions libcontainer/container_linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -286,3 +286,93 @@ func TestGetContainerStateAfterUpdate(t *testing.T) {
t.Fatalf("expected Memory to be 2048 but received %q", state.Config.Cgroups.Memory)
}
}

// TestGetEligibleCPU runs getEligibleCPU against a table of cpuset/isolated
// CPU list pairs and checks the error status, the eligibility flag and, for
// eligible cases, the returned affinity core.
func TestGetEligibleCPU(t *testing.T) {
	type testCase struct {
		name         string
		cpuset       string
		isolated     string
		wantErr      bool
		wantCore     int
		wantEligible bool
	}

	// Fields left out keep their zero value: empty list, no error expected,
	// not eligible.
	cases := []testCase{
		{
			name:     "no cpuset",
			isolated: "2-15,18-31,34-47",
		},
		{
			name:   "no isolated set",
			cpuset: "0-15",
		},
		{
			name:     "bad cpuset format",
			cpuset:   "core0 to core15",
			isolated: "2-15,18-31,34-47",
			wantErr:  true,
		},
		{
			name:     "bad isolated set format",
			cpuset:   "0-15",
			isolated: "core0 to core15",
			wantErr:  true,
		},
		{
			name:     "no eligible core",
			cpuset:   "0-1,16-17,32-33",
			isolated: "2-15,18-31,34-47",
		},
		{
			name:     "no eligible core mixed",
			cpuset:   "0-31",
			isolated: "2-15,18-31,34-47",
		},
		{
			name:         "eligible core #4",
			cpuset:       "4-7",
			isolated:     "2-15,18-31,34-47",
			wantEligible: true,
			wantCore:     4,
		},
		{
			name:         "eligible core #40",
			cpuset:       "40-47",
			isolated:     "2-15,18-31,34-47",
			wantEligible: true,
			wantCore:     40,
		},
		{
			name:         "eligible core #24",
			cpuset:       "24-31",
			isolated:     "2-15,18-31,34-47",
			wantEligible: true,
			wantCore:     24,
		},
		{
			// Disjoint single cores: none of the cpuset cores is isolated.
			name:     "debug test",
			cpuset:   "7,14,20,25",
			isolated: "6,10,18,30",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			core, eligible, err := getEligibleCPU(tc.cpuset, tc.isolated)

			// Only the first mismatch is reported; the core number is
			// checked solely for cases expected to be eligible.
			switch {
			case err != nil && !tc.wantErr:
				t.Fatalf("unexpected error: %s", err)
			case err == nil && tc.wantErr:
				t.Fatalf("unexpected success")
			case tc.wantEligible && !eligible:
				t.Fatalf("was expecting eligible core but no eligible core returned")
			case !tc.wantEligible && eligible:
				t.Fatalf("was not expecting eligible core but got eligible core")
			case tc.wantEligible && tc.wantCore != core:
				t.Fatalf("expected affinity core %d: got %d instead", tc.wantCore, core)
			}
		})
	}
}
1 change: 1 addition & 0 deletions libcontainer/message_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ const (
UidmapPathAttr uint16 = 27288
GidmapPathAttr uint16 = 27289
MountSourcesAttr uint16 = 27290
CPUAffinityAttr uint16 = 27291
)

type Int32msg struct {
Expand Down
33 changes: 33 additions & 0 deletions libcontainer/nsenter/nsexec.c
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,9 @@ struct nlconfig_t {
/* Mount sources opened outside the container userns. */
char *mountsources;
size_t mountsources_len;

/* Temporary CPU affinity before cgroup cpuset transition. */
uint32_t cpu_affinity;
};

/*
Expand Down Expand Up @@ -127,6 +130,7 @@ static int loglevel = DEBUG;
#define UIDMAPPATH_ATTR 27288
#define GIDMAPPATH_ATTR 27289
#define MOUNT_SOURCES_ATTR 27290
#define CPU_AFFINITY_ATTR 27291

/*
* Use the raw syscall for versions of glibc which don't include a function for
Expand Down Expand Up @@ -506,6 +510,9 @@ static void nl_parse(int fd, struct nlconfig_t *config)
if (len != size)
bail("failed to read netlink payload, %zu != %zu", len, size);

/* No cpu affinity by default: int32(-1) */
config->cpu_affinity = ~0;

/* Parse the netlink payload. */
config->data = data;
while (current < data + size) {
Expand Down Expand Up @@ -554,6 +561,9 @@ static void nl_parse(int fd, struct nlconfig_t *config)
config->mountsources = current;
config->mountsources_len = payload_len;
break;
case CPU_AFFINITY_ATTR:
config->cpu_affinity = readint32(current);
break;
default:
bail("unknown netlink message type %d", nlattr->nla_type);
}
Expand Down Expand Up @@ -1264,6 +1274,29 @@ void nsexec(void)
bail("failed to sync with parent: SYNC_MOUNTSOURCES_ACK: got %u", s);
}

/*
* Set temporary single CPU affinity before cgroup cpuset transition,
* this handles a corner case when joining a container having all
* the processes running exclusively on isolated CPU cores to force
* the kernel to schedule runc process on the first CPU core within the
* cgroups cpuset. The introduction of the kernel commit
* 46a87b3851f0d6eb05e6d83d5c5a30df0eca8f76 has affected this deterministic
* scheduling behavior by distributing tasks across CPU cores within the
* cgroups cpuset. Some intensive real-time applications are relying on this
* deterministic behavior and use the first CPU core to run a slow thread
* while other CPU cores are fully used by real-time threads with SCHED_FIFO
* policy. Such applications prevent runc process from joining a container
* when the runc process is randomly scheduled on a CPU core owned by a
* real-time thread.
*/
if ((int32_t) config.cpu_affinity >= 0) {
cpu_set_t set;
CPU_ZERO(&set);
CPU_SET(config.cpu_affinity, &set);
if (sched_setaffinity(0, sizeof(set), &set) == -1)
bail("sched_setaffinity failed");
}

/*
* TODO: What about non-namespace clone flags that we're dropping here?
*
Expand Down

0 comments on commit 85f2d35

Please sign in to comment.