From 27f71755f34cfc789f54f45638ee467fd16a7eb6 Mon Sep 17 00:00:00 2001 From: Nimrod Shneor Date: Sat, 29 May 2021 13:31:42 +0300 Subject: [PATCH] Add support for cgroups managed by systemd --- Cargo.lock | 98 ++++++++++++- Cargo.toml | 1 + src/cgroups/common.rs | 18 ++- src/cgroups/v2/mod.rs | 2 + src/cgroups/v2/systemd_manager.rs | 224 ++++++++++++++++++++++++++++++ src/create.rs | 11 +- src/main.rs | 13 +- 7 files changed, 354 insertions(+), 13 deletions(-) create mode 100644 src/cgroups/v2/systemd_manager.rs diff --git a/Cargo.lock b/Cargo.lock index 9eb862e43d..00ff1bda81 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -46,6 +46,12 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" +[[package]] +name = "build-env" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1522ac6ee801a11bf9ef3f80403f4ede6eb41291fac3dde3de09989679305f25" + [[package]] name = "byteorder" version = "1.4.3" @@ -69,6 +75,12 @@ version = "1.0.66" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c0496836a84f8d0495758516b8621a622beb77c0fed418570e50764093ced48" +[[package]] +name = "cfg-if" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" + [[package]] name = "cfg-if" version = "1.0.0" @@ -126,7 +138,17 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "81156fece84ab6a9f2afdb109ce3ae577e42b1228441eded99bd77f627953b1a" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", +] + +[[package]] +name = "cstr-argument" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20bd4e8067c20c7c3a4dea759ef91d4b18418ddb5bd8837ef6e2f2f93ca7ccbb" +dependencies = [ + "cfg-if 0.1.10", + "memchr", ] [[package]] @@ -156,12 +178,39 @@ version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cd3aec53de10fe96d7d8c565eb17f2c687bb5518a2ec453b5b1252964526abe0" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "crc32fast", "libc", "miniz_oxide", ] +[[package]] +name = "foreign-types" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d737d9aa519fb7b749cbc3b962edcf310a8dd1f4b67c91c4f83975dbdd17d965" +dependencies = [ + "foreign-types-macros", + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-macros" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63f713f8b2aa9e24fec85b0e290c56caee12e3b6ae0aeeda238a75b28251afd6" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "foreign-types-shared" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7684cf33bb7f28497939e8c7cf17e3e4e3b8d9a0080ffa4f8ae2f515442ee855" + [[package]] name = "futures" version = "0.3.13" @@ -319,13 +368,24 @@ version = "0.2.84" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1cca32fa0182e8c0989459524dc356b8f2b5c10f1b9eb521b7d182c03cf8c5ff" +[[package]] +name = "libsystemd-sys" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e03fd580bcecda68dcdcd5297085ade6a3dc552cd8b030d2b94a9b089ef7ab8" +dependencies = [ + "build-env", + "libc", + "pkg-config", +] + [[package]] name = "log" version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", ] [[package]] @@ -375,7 +435,7 @@ checksum = "b2ccba0cfe4fdf15982d1674c69b1fd80bad427d293849982668dfe454bd61f2" dependencies = [ "bitflags", "cc", - "cfg-if", + "cfg-if 1.0.0", "libc", ] @@ -452,6 +512,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkg-config" +version = "0.3.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3831453b3449ceb48b6d9c7ad7c96d5ea673e9b470a1dc578c2ce6521230884c" + [[package]] name = "prctl" version = "1.0.0" @@ -597,7 +663,7 @@ version = "0.3.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "122e570113d28d773067fab24266b66753f6ea915758651696b6e35e49f88d6e" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "libc", "winapi", ] @@ -619,6 +685,21 @@ dependencies = [ "unicode-xid", ] +[[package]] +name = "systemd" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f722cabda922e471742300045f56dbaa53fafbb4520fca304e51258019bfe91d" +dependencies = [ + "cstr-argument", + "foreign-types", + "libc", + "libsystemd-sys", + "log", + "memchr", + "utf8-cstr", +] + [[package]] name = "termcolor" version = "1.1.2" @@ -686,6 +767,12 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564" +[[package]] +name = "utf8-cstr" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55bcbb425141152b10d5693095950b51c3745d019363fc2929ffd8f61449b628" + [[package]] name = "vec_map" version = "0.8.2" @@ -755,4 +842,5 @@ dependencies = [ "regex", "serde", "serde_json", + "systemd", ] diff --git a/Cargo.toml b/Cargo.toml index 4cbe34dfeb..cd93430ab8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,3 +22,4 @@ once_cell = "1.6.0" futures = { version = "0.3", features = ["thread-pool"] } regex = "1.5" oci_spec = { version = "0.1.0", path = "./oci_spec" } +systemd = "0.8.2" \ No newline at end of file diff --git a/src/cgroups/common.rs b/src/cgroups/common.rs index 77ae5abe0a..bbae6e11a0 100644 --- a/src/cgroups/common.rs +++ b/src/cgroups/common.rs @@ -10,6 +10,7 @@ use anyhow::{bail, Result}; use nix::unistd::Pid; use oci_spec::LinuxResources; use procfs::process::Process; +use systemd::daemon::booted; use crate::cgroups::v1; use crate::cgroups::v2; @@ -50,7 +51,10 @@ pub fn write_cgroup_file>(path: P, data: &str) -> Result<()> { Ok(()) } -pub fn create_cgroup_manager>(cgroup_path: P) -> Result> { +pub fn create_cgroup_manager>( + cgroup_path: P, + systemd_cgroup: bool, +) -> Result> { let cgroup_mount = Process::myself()? .mountinfo()? .into_iter() @@ -78,13 +82,23 @@ pub fn create_cgroup_manager>(cgroup_path: P) -> Result { log::info!("cgroup manager V2 will be used"); + if systemd_cgroup { + if !booted()? { + bail!("systemd cgroup flag passed, but systemd support for managing cgroups is not available"); + } + log::info!("systemd cgroup manager will be used"); + return Ok(Box::new(v2::SystemDCGroupManager::new( + cgroup2.mount_point, + cgroup_path.into(), + )?)); + } Ok(Box::new(v2::manager::Manager::new( cgroup2.mount_point, cgroup_path.into(), )?)) } _ => Ok(Box::new(v1::manager::Manager::new(cgroup_path.into())?)), - } + } } _ => bail!("could not find cgroup filesystem"), } diff --git a/src/cgroups/v2/mod.rs b/src/cgroups/v2/mod.rs index df18307cd2..fe96f86daa 100644 --- a/src/cgroups/v2/mod.rs +++ b/src/cgroups/v2/mod.rs @@ -7,3 +7,5 @@ mod io; pub mod manager; mod memory; mod pids; +pub mod systemd_manager; +pub use systemd_manager::SystemDCGroupManager; diff --git a/src/cgroups/v2/systemd_manager.rs b/src/cgroups/v2/systemd_manager.rs new file mode 100644 index 0000000000..538a57bc6a --- /dev/null +++ b/src/cgroups/v2/systemd_manager.rs @@ -0,0 +1,224 @@ +use std::{ + fs::{self}, + os::unix::fs::PermissionsExt, +}; + +use anyhow::{anyhow, Result}; +use nix::unistd::Pid; +use oci_spec::LinuxResources; +use std::path::{Path, PathBuf}; + +use super::{cpu::Cpu, cpuset::CpuSet, hugetlb::HugeTlb, io::Io, memory::Memory, pids::Pids}; +use crate::cgroups::common; +use crate::cgroups::common::{write_cgroup_file, CgroupManager}; +use crate::cgroups::v2::controller::Controller; +use crate::cgroups::v2::controller_type::ControllerType; +use crate::utils::PathBufExt; + +const CGROUP_PROCS: &str = "cgroup.procs"; +const CGROUP_CONTROLLERS: &str = "cgroup.controllers"; +const CGROUP_SUBTREE_CONTROL: &str = "cgroup.subtree_control"; + +// v2 systemd only supports cpu, io, memory and pids. +const CONTROLLER_TYPES: &[ControllerType] = &[ + ControllerType::Cpu, + ControllerType::Io, + ControllerType::Memory, + ControllerType::Pids, +]; + +/// SystemDCGroupManager is a driver for managing cgroups via systemd. +pub struct SystemDCGroupManager { + root_path: PathBuf, + conf: Config, +} + +/// Represents the systemd cgroup path: +/// It should be of the form [slice]:[prefix]:[name]. +struct Config { + parent: String, + scope: String, + name: String, +} + +impl SystemDCGroupManager { + pub fn new(root_path: PathBuf, cgroups_path: PathBuf) -> Result { + // cgroups path may never be empty as it is defaulted to `/youki` + // see 'get_cgroup_path' under utils.rs. + // if cgroups_path was provided it should be of the form [slice]:[scope_prefix]:[name], + // for example: "system.slice:docker:1234". + let (mut parent, mut scope, mut name) = ("", "", ""); + if cgroups_path.starts_with("/youki") { + scope = "youki"; + name = cgroups_path + .strip_prefix("/youki/")? + .to_str() + .ok_or(anyhow!("Failed to parse cgroupsPath field."))?; + } else { + let parts = cgroups_path + .to_str() + .ok_or(anyhow!("Failed to parse cgroupsPath field."))? + .split(":") + .collect::>(); + parent = parts[0]; + scope = parts[1]; + name = parts[2]; + } + + let conf = Config { + parent: parent.to_string(), + scope: scope.to_string(), + name: name.to_string(), + }; + + Ok(SystemDCGroupManager { root_path, conf }) + } + + // returns the unit (scope) name from the path provided by the user + // for example: foo:docker:bar returns in '/docker-bar.scope' + fn get_unit_name(&self) -> String { + // By default we create a scope unless specified explicitly. + if !self.conf.name.ends_with(".slice") { + return format!("/{}-{}.scope", self.conf.scope, self.conf.name); + } + return self.conf.name.clone(); + } + + // generates a cgroups path from the one provided by the user via cgroupsPath. + // an example of the final path in rootless: + // "/sys/fs/cgroup/user.slice/user-1001.slice/user@1001.service/user.slice/libpod-132ff0d72245e6f13a3bbc6cdc5376886897b60ac59eaa8dea1df7ab959cbf1c.scope" + fn get_cgroups_path(&self) -> Result { + // the root slice is under 'system.slice'. + let mut slice = Path::new("/system.slice").to_path_buf(); + // if the user provided a '.slice' (as in a branch of a tree) + // we need to "unpack it". + if self.conf.parent != "" { + slice = self.expand_slice(self.conf.parent.clone())?; + } + let unit_name = self.get_unit_name(); + let cgroups_path = slice.join(unit_name); + + // an example of the final path: + // "/sys/fs/cgroup/system.slice/user-1001.slice/user@1001.service/user.slice/libpod-132ff0d72245e6f13a3bbc6cdc5376886897b60ac59eaa8dea1df7ab959cbf1c.scope" + let full_path = self.root_path.join_absolute_path(&cgroups_path)?; + Ok(full_path) + } + + // systemd represents slice hierarchy using `-`, so we need to follow suit when + // generating the path of slice. Essentially, test-a-b.slice becomes + // /test.slice/test-a.slice/test-a-b.slice. + fn expand_slice(&self, slice: String) -> Result { + let suffix = ".slice"; + if slice.len() <= suffix.len() || !slice.ends_with(suffix) { + anyhow!("invalid slice name: {}", slice); + } + if slice.contains("/") { + anyhow!("invalid slice name: {}", slice); + } + let mut path = "".to_owned(); + let mut prefix = "".to_owned(); + let slice_name = slice.trim_end_matches(suffix); + // if input was -.slice, we should just return root now + if slice_name == "-" { + return Ok(Path::new("/").to_path_buf()); + } + for component in slice_name.split("-") { + if component == "" { + anyhow!("Invalid slice name: {}", slice); + } + // Append the component to the path and to the prefix. + path = format!("{}/{}{}{}", path, prefix, component, suffix); + prefix = format!("{}{}-", prefix, component); + } + Ok(Path::new(&path).to_path_buf()) + } + + /// create_unified_cgroup create unified cgroup makes sure that *each level* in the downward path from the root cgroup + /// down to the cgroup_path provided by the user is a valid cgroup hierarchy. + /// containing the attached controllers and that it contains the container pid. + fn create_unified_cgroup(&self, pid: Pid) -> Result { + let cgroups_path = self.get_cgroups_path()?; + let controllers: Vec = self + .get_available_controllers(common::DEFAULT_CGROUP_ROOT)? + .into_iter() + .map(|c| format!("{}{}", "+", c.to_string())) + .collect(); + + let mut current_path = self.root_path.clone(); + let mut components = cgroups_path.components().skip(1).peekable(); + // Verify that *each level* in the downward path from the root cgroup + // down to the cgroup_path provided by the user is a valid cgroup hierarchy. + // containing the attached controllers. + while let Some(component) = components.next() { + current_path = current_path.join(component); + if !current_path.exists() { + fs::create_dir(¤t_path)?; + fs::metadata(¤t_path)?.permissions().set_mode(0o755); + } + + // last component cannot have subtree_control enabled due to internal process constraint + // if this were set, writing to the cgroups.procs file will fail with Erno 16 (device or resource busy) + if components.peek().is_some() { + for controller in &controllers { + write_cgroup_file(¤t_path.join(CGROUP_SUBTREE_CONTROL), controller)?; + } + } + } + + write_cgroup_file(cgroups_path.join(CGROUP_PROCS), &pid.to_string())?; + Ok(cgroups_path) + } + + fn get_available_controllers>( + &self, + cgroups_path: P, + ) -> Result> { + let controllers_path = self.root_path.join(cgroups_path).join(CGROUP_CONTROLLERS); + if !controllers_path.exists() { + return Err(anyhow!( + "cannot get available controllers. {:?} does not exist", + controllers_path + )); + } + + let mut controllers = Vec::new(); + for controller in fs::read_to_string(&controllers_path)?.split_whitespace() { + match controller { + "cpu" => controllers.push(ControllerType::Cpu), + "io" => controllers.push(ControllerType::Io), + "memory" => controllers.push(ControllerType::Memory), + "pids" => controllers.push(ControllerType::Pids), + _ => continue, + } + } + + Ok(controllers) + } +} + +impl CgroupManager for SystemDCGroupManager { + fn apply(&self, linux_resources: &LinuxResources, pid: Pid) -> Result<()> { + // Dont attach any pid to the cgroup if -1 is specified as a pid + if pid.as_raw() == -1 { + return Ok(()); + } + let full_cgroup_path = self.create_unified_cgroup(pid)?; + + for controller in CONTROLLER_TYPES { + match controller { + ControllerType::Cpu => Cpu::apply(linux_resources, &full_cgroup_path)?, + ControllerType::CpuSet => CpuSet::apply(linux_resources, &full_cgroup_path)?, + ControllerType::HugeTlb => HugeTlb::apply(linux_resources, &&full_cgroup_path)?, + ControllerType::Io => Io::apply(linux_resources, &&full_cgroup_path)?, + ControllerType::Memory => Memory::apply(linux_resources, &full_cgroup_path)?, + ControllerType::Pids => Pids::apply(linux_resources, &&full_cgroup_path)?, + } + } + + Ok(()) + } + + fn remove(&self) -> Result<()> { + Ok(()) + } +} diff --git a/src/create.rs b/src/create.rs index 4719e303f1..a62d227c61 100644 --- a/src/create.rs +++ b/src/create.rs @@ -45,7 +45,12 @@ pub struct Create { // associated with it like any other process. impl Create { /// Starts a new container process - pub fn exec(&self, root_path: PathBuf, command: impl Command) -> Result<()> { + pub fn exec( + &self, + root_path: PathBuf, + systemd_cgroup: bool, + command: impl Command, + ) -> Result<()> { // create a directory for the container to store state etc. // if already present, return error let bundle_canonicalized = fs::canonicalize(&self.bundle) @@ -102,6 +107,7 @@ impl Create { rootfs, spec, csocketfd, + systemd_cgroup, container, command, )?; @@ -121,6 +127,7 @@ fn run_container>( rootfs: PathBuf, spec: oci_spec::Spec, csocketfd: Option, + systemd_cgroup: bool, container: Container, command: impl Command, ) -> Result { @@ -133,7 +140,7 @@ fn run_container>( let namespaces: Namespaces = linux.namespaces.clone().into(); let cgroups_path = utils::get_cgroup_path(&linux.cgroups_path, container.id()); - let cmanager = cgroups::common::create_cgroup_manager(&cgroups_path)?; + let cmanager = cgroups::common::create_cgroup_manager(&cgroups_path, systemd_cgroup)?; // first fork, which creates process, which will later create actual container process match fork::fork_first( diff --git a/src/main.rs b/src/main.rs index dc058d261d..fad24e388e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -15,9 +15,8 @@ use youki::create; use youki::signal; use youki::start; -use youki::utils; use youki::cgroups; - +use youki::utils; /// High-level commandline option definition /// This takes global options as well as individual commands as specified in [OCI runtime-spec](https://github.com/opencontainers/runtime-spec/blob/master/runtime.md) @@ -32,6 +31,9 @@ struct Opts { log: Option, #[clap(long)] log_format: Option, + /// Enable systemd cgroup manager, rather then use the cgroupfs directly. + #[clap(short, long)] + systemd_cgroup: bool, /// command to actually manage container #[clap(subcommand)] subcmd: SubCommand, @@ -81,8 +83,10 @@ fn main() -> Result<()> { let root_path = PathBuf::from(&opts.root); fs::create_dir_all(&root_path)?; + let systemd_cgroup = opts.systemd_cgroup; + match opts.subcmd { - SubCommand::Create(create) => create.exec(root_path, LinuxCommand), + SubCommand::Create(create) => create.exec(root_path, systemd_cgroup, LinuxCommand), SubCommand::Start(start) => start.exec(root_path), SubCommand::Kill(kill) => { // resolves relative paths, symbolic links etc. and get complete path @@ -144,7 +148,8 @@ fn main() -> Result<()> { // remove the cgroup created for the container // check https://man7.org/linux/man-pages/man7/cgroups.7.html // creating and removing cgroups section for more information on cgroups - let cmanager = cgroups::common::create_cgroup_manager(cgroups_path)?; + let cmanager = + cgroups::common::create_cgroup_manager(cgroups_path, systemd_cgroup)?; cmanager.remove()?; } std::process::exit(0)