From 6bc2a312f710f8c9a91dea22469ba85a25e8fb96 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Mon, 30 Oct 2023 18:09:07 -0700 Subject: [PATCH] runc kill: fix sending KILL to non-pidns container Commit f8ad20f made it impossible to kill leftover processes in a stopped container that does not have its own PID namespace. In other words, if a container init is gone, it is no longer possible to use `runc kill` to kill the leftover processes. Fix this by moving the check if container init exists to after the special case of handling the container without own PID namespace. While at it, fix the minor issue introduced by commit 9583b3d: if signalAllProcesses is used, there is no need to thaw the container (as freeze/thaw is either done in signalAllProcesses already, or not needed at all). Also, make signalAllProcesses return an error early if the container cgroup does not exist (as it relies on it to do its job). This way, the error message returned is more generic and easier to understand ("container not running" instead of "can't open file"). Finally, add a test case. Fixes: f8ad20f Fixes: 9583b3d Co-authored-by: lifubang Signed-off-by: Kir Kolyshkin --- libcontainer/container_linux.go | 21 +++++++------- libcontainer/init_linux.go | 3 ++ tests/integration/kill.bats | 49 +++++++++++++++++++++++++++++---- 3 files changed, 58 insertions(+), 15 deletions(-) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index f203576d358..5eea7cfa140 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -364,10 +364,6 @@ func (c *Container) start(process *Process) (retErr error) { func (c *Container) Signal(s os.Signal) error { c.m.Lock() defer c.m.Unlock() - // To avoid a PID reuse attack, don't kill non-running container. - if !c.hasInit() { - return ErrNotRunning - } // When a container has its own PID namespace, inside it the init PID // is 1, and thus it is handled specially by the kernel. In particular, @@ -375,14 +371,19 @@ func (c *Container) Signal(s os.Signal) error { // all other processes in that PID namespace (see pid_namespaces(7)). // // OTOH, if PID namespace is shared, we should kill all pids to avoid - // leftover processes. - var err error + // leftover processes. Handle this special case here. if s == unix.SIGKILL && !c.config.Namespaces.IsPrivate(configs.NEWPID) { - err = signalAllProcesses(c.cgroupManager, unix.SIGKILL) - } else { - err = c.initProcess.signal(s) + if err := signalAllProcesses(c.cgroupManager, unix.SIGKILL); err != nil { + return fmt.Errorf("unable to kill all processes: %w", err) + } + return nil } - if err != nil { + + // To avoid a PID reuse attack, don't kill non-running container. + if !c.hasInit() { + return ErrNotRunning + } + if err := c.initProcess.signal(s); err != nil { return fmt.Errorf("unable to signal init: %w", err) } if s == unix.SIGKILL { diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index 624e0199e9a..293f7e64115 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -661,6 +661,9 @@ func setupPersonality(config *configs.Config) error { // signalAllProcesses freezes then iterates over all the processes inside the // manager's cgroups sending the signal s to them. func signalAllProcesses(m cgroups.Manager, s unix.Signal) error { + if !m.Exists() { + return ErrNotRunning + } // Use cgroup.kill, if available. if s == unix.SIGKILL { if p := m.Path(""); p != "" { // Either cgroup v2 or hybrid. diff --git a/tests/integration/kill.bats b/tests/integration/kill.bats index 8f2f4a7b4e8..bbd22753d14 100644 --- a/tests/integration/kill.bats +++ b/tests/integration/kill.bats @@ -10,6 +10,7 @@ function teardown() { teardown_bundle } +# shellcheck disable=SC2030 @test "kill detached busybox" { # run busybox detached runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox @@ -35,15 +36,12 @@ function teardown() { [ "$status" -eq 0 ] } -# This is roughly the same as TestPIDHostInitProcessWait in libcontainer/integration. -@test "kill KILL [host pidns]" { - # kill -a currently requires cgroup freezer. +test_host_pidns_kill() { requires cgroups_freezer update_config ' .linux.namespaces -= [{"type": "pid"}]' set_cgroups_path if [ $EUID -ne 0 ]; then - # kill --all requires working cgroups. requires rootless_cgroup update_config ' .mounts |= map((select(.type == "proc") | .type = "none" @@ -53,14 +51,23 @@ function teardown() { fi runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + # shellcheck disable=SC2031 [ "$status" -eq 0 ] # Start a few more processes. for _ in 1 2 3 4 5; do __runc exec -d test_busybox sleep 1h - [ "$status" -eq 0 ] done + if [ -v KILL_INIT ]; then + # Now kill the container's init process (sh). Since the container do + # not have own PID ns, its init is no special and the container will + # still be up and running (except for rootless container AND + # systemd cgroup driver AND systemd > v245, when systemd kills + # the container; see "kill KILL [host pidns + init gone]" below). + __runc exec test_busybox killall -9 sh + fi + # Get the list of all container processes. path=$(get_cgroup_path "pids") pids=$(cat "$path"/cgroup.procs) @@ -71,6 +78,7 @@ function teardown() { done runc kill test_busybox KILL + # shellcheck disable=SC2031 [ "$status" -eq 0 ] wait_for_container 10 1 test_busybox stopped @@ -79,6 +87,37 @@ function teardown() { echo "pids: $pids" for p in $pids; do run ! kill -0 "$p" + # shellcheck disable=SC2031 [[ "$output" = *"No such process" ]] done } + +# This is roughly the same as TestPIDHostInitProcessWait in libcontainer/integration. +# The differences are: +# +# 1. Here we use separate processes to create and to kill a container, so the +# processes inside a container are not children of "runc kill". +# +# 2. We hit different codepaths (nonChildProcess.signal rather than initProcess.signal). +@test "kill KILL [host pidns]" { + unset KILL_INIT + test_host_pidns_kill +} + +# Same as above plus: +# 3. Test runc kill on a container whose init process is gone (issue 4047). +@test "kill KILL [host pidns + init gone]" { + # Apparently, for rootless test, when using systemd cgroup manager, + # newer versions of systemd clean up the container as soon as its init + # process is gone. This is all fine and dandy, except it prevents us to + # test this case, thus we skip the test. + # + # It is not entirely clear which systemd version got this feature: + # v245 works fine, and v249 does not. + if [ $EUID -ne 0 ] && [ -v RUNC_USE_SYSTEMD ] && [ "$(systemd_version)" -gt 245 ]; then + skip "rootless+systemd conflicts with systemd > 245" + fi + KILL_INIT=1 + test_host_pidns_kill + unset KILL_INIT +}