Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix runc kill and runc delete for containers with no init and no private PID namespace #4102

Merged
merged 8 commits into from
Nov 28, 2023
4 changes: 3 additions & 1 deletion checkpoint.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,9 @@ checkpointed.`,
err = container.Checkpoint(options)
if err == nil && !(options.LeaveRunning || options.PreDump) {
// Destroy the container unless we tell CRIU to keep it.
destroy(container)
if err := container.Destroy(); err != nil {
logrus.Warn(err)
}
}
return err
},
Expand Down
18 changes: 10 additions & 8 deletions delete.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@ func killContainer(container *libcontainer.Container) error {
for i := 0; i < 100; i++ {
time.Sleep(100 * time.Millisecond)
if err := container.Signal(unix.Signal(0)); err != nil {
destroy(container)
return nil
return container.Destroy()
}
}
return errors.New("container init still running")
Expand Down Expand Up @@ -66,22 +65,25 @@ status of "ubuntu01" as "stopped" the following will delete resources held for
}
return err
}
// When --force is given, we kill all container processes and
// then destroy the container. This is done even for a stopped
// container, because (in case it does not have its own PID
// namespace) there may be some leftover processes in the
// container's cgroup.
if force {
lifubang marked this conversation as resolved.
Show resolved Hide resolved
return killContainer(container)
}
s, err := container.Status()
if err != nil {
return err
}
switch s {
case libcontainer.Stopped:
destroy(container)
return container.Destroy()
lifubang marked this conversation as resolved.
Show resolved Hide resolved
case libcontainer.Created:
return killContainer(container)
default:
if force {
return killContainer(container)
}
return fmt.Errorf("cannot delete container %s that is not stopped: %s", id, s)
}

return nil
},
}
74 changes: 32 additions & 42 deletions libcontainer/cgroups/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -217,79 +217,69 @@ func PathExists(path string) bool {
return true
}

func rmdir(path string) error {
// rmdir tries to remove a directory, optionally retrying on EBUSY.
func rmdir(path string, retry bool) error {
delay := time.Millisecond
tries := 10

again:
err := unix.Rmdir(path)
if err == nil || err == unix.ENOENT {
switch err { // nolint:errorlint // unix errors are bare
case nil, unix.ENOENT:
return nil
case unix.EINTR:
goto again
case unix.EBUSY:
if retry && tries > 0 {
time.Sleep(delay)
delay *= 2
tries--
goto again

}
}
return &os.PathError{Op: "rmdir", Path: path, Err: err}
}

// RemovePath aims to remove cgroup path. It does so recursively,
// by removing any subdirectories (sub-cgroups) first.
func RemovePath(path string) error {
// try the fast path first
if err := rmdir(path); err == nil {
// Try the fast path first.
if err := rmdir(path, false); err == nil {
return nil
}

infos, err := os.ReadDir(path)
if err != nil {
if os.IsNotExist(err) {
err = nil
}
if err != nil && !os.IsNotExist(err) {
return err
}
for _, info := range infos {
if info.IsDir() {
// We should remove subcgroups dir first
// We should remove subcgroup first.
if err = RemovePath(filepath.Join(path, info.Name())); err != nil {
break
}
}
}
if err == nil {
err = rmdir(path)
err = rmdir(path, true)
}
return err
}

// RemovePaths iterates over the provided paths removing them.
// We trying to remove all paths five times with increasing delay between tries.
// If after all there are not removed cgroups - appropriate error will be
// returned.
func RemovePaths(paths map[string]string) (err error) {
const retries = 5
delay := 10 * time.Millisecond
for i := 0; i < retries; i++ {
if i != 0 {
time.Sleep(delay)
delay *= 2
}
for s, p := range paths {
if err := RemovePath(p); err != nil {
// do not log intermediate iterations
switch i {
case 0:
logrus.WithError(err).Warnf("Failed to remove cgroup (will retry)")
case retries - 1:
logrus.WithError(err).Error("Failed to remove cgroup")
}
}
_, err := os.Stat(p)
// We need this strange way of checking cgroups existence because
// RemoveAll almost always returns error, even on already removed
// cgroups
if os.IsNotExist(err) {
delete(paths, s)
}
}
if len(paths) == 0 {
//nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506
paths = make(map[string]string)
return nil
for s, p := range paths {
if err := RemovePath(p); err == nil {
delete(paths, s)
}
}
if len(paths) == 0 {
//nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506
// TODO: switch to clear once Go < 1.21 is not supported.
paths = make(map[string]string)
return nil
}
return fmt.Errorf("Failed to remove paths: %v", paths)
}

Expand Down
68 changes: 33 additions & 35 deletions libcontainer/container_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ func (c *Container) ignoreCgroupError(err error) error {
if err == nil {
return nil
}
if errors.Is(err, os.ErrNotExist) && c.runType() == Stopped && !c.cgroupManager.Exists() {
if errors.Is(err, os.ErrNotExist) && !c.hasInit() && !c.cgroupManager.Exists() {
return nil
}
return err
Expand Down Expand Up @@ -364,37 +364,35 @@ func (c *Container) start(process *Process) (retErr error) {
func (c *Container) Signal(s os.Signal) error {
c.m.Lock()
defer c.m.Unlock()
status, err := c.currentStatus()
if err != nil {
return err
}
// To avoid a PID reuse attack, don't kill non-running container.
switch status {
case Running, Created, Paused:
default:
return ErrNotRunning
}

// When a container has its own PID namespace, inside it the init PID
// is 1, and thus it is handled specially by the kernel. In particular,
// killing init with SIGKILL from an ancestor namespace will also kill
// all other processes in that PID namespace (see pid_namespaces(7)).
//
// OTOH, if PID namespace is shared, we should kill all pids to avoid
// leftover processes.
// leftover processes. Handle this special case here.
if s == unix.SIGKILL && !c.config.Namespaces.IsPrivate(configs.NEWPID) {
err = signalAllProcesses(c.cgroupManager, unix.SIGKILL)
} else {
err = c.initProcess.signal(s)
if err := signalAllProcesses(c.cgroupManager, unix.SIGKILL); err != nil {
return fmt.Errorf("unable to kill all processes: %w", err)
}
return nil
}
if err != nil {

// To avoid a PID reuse attack, don't kill non-running container.
if !c.hasInit() {
return ErrNotRunning
}
if err := c.initProcess.signal(s); err != nil {
return fmt.Errorf("unable to signal init: %w", err)
}
if status == Paused && s == unix.SIGKILL {
if s == unix.SIGKILL {
// For cgroup v1, killing a process in a frozen cgroup
// does nothing until it's thawed. Only thaw the cgroup
// for SIGKILL.
_ = c.cgroupManager.Freeze(configs.Thawed)
if paused, _ := c.isPaused(); paused {
_ = c.cgroupManager.Freeze(configs.Thawed)
}
}
return nil
}
Expand Down Expand Up @@ -876,7 +874,10 @@ func (c *Container) newInitConfig(process *Process) *initConfig {
func (c *Container) Destroy() error {
c.m.Lock()
defer c.m.Unlock()
return c.state.destroy()
if err := c.state.destroy(); err != nil {
return fmt.Errorf("unable to destroy container: %w", err)
}
return nil
}

// Pause pauses the container, if its state is RUNNING or CREATED, changing
Expand Down Expand Up @@ -1006,34 +1007,31 @@ func (c *Container) refreshState() error {
if paused {
return c.state.transition(&pausedState{c: c})
}
t := c.runType()
switch t {
case Created:
if !c.hasInit() {
return c.state.transition(&stoppedState{c: c})
}
// The presence of exec fifo helps to distinguish between
// the created and the running states.
if _, err := os.Stat(filepath.Join(c.stateDir, execFifoFilename)); err == nil {
return c.state.transition(&createdState{c: c})
case Running:
return c.state.transition(&runningState{c: c})
}
return c.state.transition(&stoppedState{c: c})
return c.state.transition(&runningState{c: c})
}

func (c *Container) runType() Status {
// hasInit tells whether the container init process exists.
func (c *Container) hasInit() bool {
if c.initProcess == nil {
return Stopped
return false
}
pid := c.initProcess.pid()
stat, err := system.Stat(pid)
if err != nil {
return Stopped
return false
}
if stat.StartTime != c.initProcessStartTime || stat.State == system.Zombie || stat.State == system.Dead {
return Stopped
}
// We'll create exec fifo and blocking on it after container is created,
// and delete it after start container.
if _, err := os.Stat(filepath.Join(c.stateDir, execFifoFilename)); err == nil {
return Created
return false
}
return Running
return true
}

func (c *Container) isPaused() (bool, error) {
Expand Down
3 changes: 3 additions & 0 deletions libcontainer/init_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -673,6 +673,9 @@ func setupPersonality(config *configs.Config) error {
// signalAllProcesses freezes then iterates over all the processes inside the
// manager's cgroups sending the signal s to them.
func signalAllProcesses(m cgroups.Manager, s unix.Signal) error {
if !m.Exists() {
return ErrNotRunning
cyphar marked this conversation as resolved.
Show resolved Hide resolved
}
// Use cgroup.kill, if available.
if s == unix.SIGKILL {
if p := m.Path(""); p != "" { // Either cgroup v2 or hybrid.
Expand Down
44 changes: 27 additions & 17 deletions libcontainer/state_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,19 +35,30 @@ type containerState interface {
}

func destroy(c *Container) error {
err := c.cgroupManager.Destroy()
// Usually, when a container init is gone, all other processes in its
// cgroup are killed by the kernel. This is not the case for a shared
// PID namespace container, which may have some processes left after
// its init is killed or exited.
//
// As the container without init process running is considered stopped,
// and destroy is supposed to remove all the container resources, we need
// to kill those processes here.
if !c.config.Namespaces.IsPrivate(configs.NEWPID) {
_ = signalAllProcesses(c.cgroupManager, unix.SIGKILL)
}
if err := c.cgroupManager.Destroy(); err != nil {
return fmt.Errorf("unable to remove container's cgroup: %w", err)
}
if c.intelRdtManager != nil {
if ierr := c.intelRdtManager.Destroy(); err == nil {
err = ierr
if err := c.intelRdtManager.Destroy(); err != nil {
return fmt.Errorf("unable to remove container's IntelRDT group: %w", err)
}
}
if rerr := os.RemoveAll(c.stateDir); err == nil {
err = rerr
if err := os.RemoveAll(c.stateDir); err != nil {
return fmt.Errorf("unable to remove container state dir: %w", err)
}
c.initProcess = nil
if herr := runPoststopHooks(c); err == nil {
err = herr
}
err := runPoststopHooks(c)
c.state = &stoppedState{c: c}
return err
}
Expand Down Expand Up @@ -103,7 +114,7 @@ func (r *runningState) status() Status {
func (r *runningState) transition(s containerState) error {
switch s.(type) {
case *stoppedState:
if r.c.runType() == Running {
if r.c.hasInit() {
return ErrRunning
}
r.c.state = s
Expand All @@ -118,7 +129,7 @@ func (r *runningState) transition(s containerState) error {
}

func (r *runningState) destroy() error {
if r.c.runType() == Running {
if r.c.hasInit() {
return ErrRunning
}
return destroy(r.c)
Expand Down Expand Up @@ -170,14 +181,13 @@ func (p *pausedState) transition(s containerState) error {
}

func (p *pausedState) destroy() error {
t := p.c.runType()
if t != Running && t != Created {
if err := p.c.cgroupManager.Freeze(configs.Thawed); err != nil {
return err
}
return destroy(p.c)
if p.c.hasInit() {
return ErrPaused
Copy link

@ssst0n3 ssst0n3 Sep 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

when t == Created, hasInit() returns false:

  • Before this commit: It returns ErrPaused.
  • After this commit: p.c.cgroupManager.Freeze(configs.Thawed) is executed.

}
if err := p.c.cgroupManager.Freeze(configs.Thawed); err != nil {
return err
}
return ErrPaused
return destroy(p.c)
}

// restoredState is the same as the running state but also has associated checkpoint
Expand Down
Loading
Loading