diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 1910472419..a4c40fe7fa 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -7,31 +7,70 @@ on: - main jobs: + rustfmt: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions/cache@v2 + with: + path: | + ~/.cargo/bin/ + ~/.cargo/registry/index/ + ~/.cargo/registry/cache/ + ~/.cargo/git/db/ + target/ + key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} + - run: rustup component add rustfmt + - name: Check formatting + run: cargo fmt --all -- --check tests: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 + - uses: actions/cache@v2 + with: + path: | + ~/.cargo/bin/ + ~/.cargo/registry/index/ + ~/.cargo/registry/cache/ + ~/.cargo/git/db/ + target/ + key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} - run: rustup component add clippy + - run: sudo apt-get -y update + - run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev - uses: actions-rs/clippy-check@v1 with: token: ${{ secrets.GITHUB_TOKEN }} args: --all-features - name: Build - run: cargo build + run: ./build.sh - name: Run tests run: cargo test + - name: Run doc tests + run: cargo test --doc integration_tests: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 with: submodules: recursive + - uses: actions/cache@v2 + with: + path: | + ~/.cargo/bin/ + ~/.cargo/registry/index/ + ~/.cargo/registry/cache/ + ~/.cargo/git/db/ + target/ + key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} - uses: actions-rs/toolchain@v1 with: toolchain: stable - - uses: actions-rs/cargo@v1 - with: - command: build + - run: sudo apt-get -y update + - run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev + - name: Build + run: ./build.sh - uses: actions/setup-go@v2 with: go-version: "1.11.0" diff --git a/.gitignore b/.gitignore index 9039ea9456..5bc7a06de7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,10 @@ +youki +/tutorial + /target .vagrant/ tags tags.lock tags.temp + diff --git a/.gitmodules b/.gitmodules index ccbf3c8538..73b6451a1f 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,5 @@ [submodule "integration_test/src/github.com/opencontainers/runtime-tools"] path = integration_test/src/github.com/opencontainers/runtime-tools url = https://github.com/opencontainers/runtime-tools.git + ignore = dirty + diff --git a/CODE-OF-CONDUCT.md b/CODE-OF-CONDUCT.md new file mode 100644 index 0000000000..f0a6716641 --- /dev/null +++ b/CODE-OF-CONDUCT.md @@ -0,0 +1,3 @@ +## The Youki Project Community Code of Conduct + +The Youki Project follows the [Containers Community Code of Conduct](https://github.com/containers/common/blob/main/CODE-OF-CONDUCT.md). 
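The workflow above now installs `pkg-config`, `libsystemd-dev` and `libdbus-glib-1-dev` before building, because the crate links against the native systemd and D-Bus client libraries (see the `libsystemd-sys` and `libdbus-sys` entries added to Cargo.lock below); without the dev packages, their `pkg-config` probes fail at build time. As a rough, hedged sketch of the kind of call that needs those native libraries — assuming the `dbus` crate ~0.9 that this change adds and a host running systemd, not code from this PR — the following opens the system bus and reads a systemd property:

```rust
// Illustrative sketch only (assumes the `dbus` crate 0.9 and a systemd host);
// opening the system bus links against the native libdbus, which is why the
// CI installs the dev packages before ./build.sh.
use dbus::blocking::stdintf::org_freedesktop_dbus::Properties;
use dbus::blocking::Connection;
use std::time::Duration;

fn main() -> Result<(), dbus::Error> {
    // Connect to the system bus (the bus systemd listens on).
    let conn = Connection::new_system()?;
    // Talk to the systemd manager object.
    let proxy = conn.with_proxy(
        "org.freedesktop.systemd1",
        "/org/freedesktop/systemd1",
        Duration::from_millis(5000),
    );
    // Read a simple property as a smoke test of the D-Bus linkage.
    let version: String = proxy.get("org.freedesktop.systemd1.Manager", "Version")?;
    println!("systemd version: {}", version);
    Ok(())
}
```

The same native linkage is why the README changes below add matching `apt-get` and `dnf` dependency sections.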
diff --git a/Cargo.lock b/Cargo.lock index 9eb862e43d..ce95aadce8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -19,20 +19,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.38" +version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afddf7f520a80dbf76e6f50a35bca42a2331ef227a28b3b6dc5c2e2338d114b1" - -[[package]] -name = "atty" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" -dependencies = [ - "hermit-abi", - "libc", - "winapi", -] +checksum = "28b2cd92db5cbd74e8e5028f7e27dd7aa3090e89e4f2a197cc7c8dfb69c7063b" [[package]] name = "autocfg" @@ -46,6 +35,12 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" +[[package]] +name = "build-env" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1522ac6ee801a11bf9ef3f80403f4ede6eb41291fac3dde3de09989679305f25" + [[package]] name = "byteorder" version = "1.4.3" @@ -54,9 +49,9 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "caps" -version = "0.5.1" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d092fbb6657fb1f98a7da70c14335ac97e5a9477e1a8156d4bbf19a3a7aece51" +checksum = "c088f2dddef283f86b023ab1ebe2301c653326834996458b2f48d29b804e9540" dependencies = [ "errno", "libc", @@ -65,9 +60,15 @@ dependencies = [ [[package]] name = "cc" -version = "1.0.66" +version = "1.0.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c0496836a84f8d0495758516b8621a622beb77c0fed418570e50764093ced48" +checksum = "4a72c244c1ff497a746a7e1fb3d14bd08420ecda70c8f25c7112f2781652d787" + +[[package]] +name = "cfg-if" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" [[package]] name = "cfg-if" @@ -84,6 +85,7 @@ dependencies = [ "libc", "num-integer", "num-traits", + "serde", "time", "winapi", ] @@ -94,14 +96,12 @@ version = "3.0.0-beta.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4bd1061998a501ee7d4b6d449020df3266ca3124b941ec56cf2005c3779ca142" dependencies = [ - "atty", "bitflags", "clap_derive", "indexmap", "lazy_static", "os_str_bytes", "strsim", - "termcolor", "textwrap", "unicode-width", "vec_map", @@ -126,7 +126,37 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "81156fece84ab6a9f2afdb109ce3ae577e42b1228441eded99bd77f627953b1a" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", +] + +[[package]] +name = "cstr-argument" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20bd4e8067c20c7c3a4dea759ef91d4b18418ddb5bd8837ef6e2f2f93ca7ccbb" +dependencies = [ + "cfg-if 0.1.10", + "memchr", +] + +[[package]] +name = "dbus" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f597e08dfa79b593f23bbfc7840b23b2c5aa2e3a98d8e68b67b5b9ff800dc0db" +dependencies = [ + "libc", + "libdbus-sys", +] + +[[package]] +name = "env_logger" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17392a012ea30ef05a610aa97dfb49496e71c9f676b27879922ea5bdf60d9d3f" +dependencies = [ + "log", + "regex", ] [[package]] @@ -150,23 
+180,59 @@ dependencies = [ "libc", ] +[[package]] +name = "fastrand" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77b705829d1e87f762c2df6da140b26af5839e1033aa84aa5f56bb688e4e1bdb" +dependencies = [ + "instant", +] + [[package]] name = "flate2" version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cd3aec53de10fe96d7d8c565eb17f2c687bb5518a2ec453b5b1252964526abe0" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "crc32fast", "libc", "miniz_oxide", ] +[[package]] +name = "foreign-types" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d737d9aa519fb7b749cbc3b962edcf310a8dd1f4b67c91c4f83975dbdd17d965" +dependencies = [ + "foreign-types-macros", + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-macros" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63f713f8b2aa9e24fec85b0e290c56caee12e3b6ae0aeeda238a75b28251afd6" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "foreign-types-shared" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7684cf33bb7f28497939e8c7cf17e3e4e3b8d9a0080ffa4f8ae2f515442ee855" + [[package]] name = "futures" -version = "0.3.13" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f55667319111d593ba876406af7c409c0ebb44dc4be6132a783ccf163ea14c1" +checksum = "0e7e43a803dae2fa37c1f6a8fe121e1f7bf9548b4dfc0522a42f34145dadfc27" dependencies = [ "futures-channel", "futures-core", @@ -179,9 +245,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.13" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c2dd2df839b57db9ab69c2c9d8f3e8c81984781937fe2807dc6dcf3b2ad2939" +checksum = "e682a68b29a882df0545c143dc3646daefe80ba479bcdede94d5a703de2871e2" dependencies = [ "futures-core", "futures-sink", @@ -189,15 +255,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.13" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15496a72fabf0e62bdc3df11a59a3787429221dd0710ba8ef163d6f7a9112c94" +checksum = "0402f765d8a89a26043b889b26ce3c4679d268fa6bb22cd7c6aad98340e179d1" [[package]] name = "futures-executor" -version = "0.3.13" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "891a4b7b96d84d5940084b2a37632dd65deeae662c114ceaa2c879629c9c0ad1" +checksum = "badaa6a909fac9e7236d0620a2f57f7664640c56575b71a7552fbd68deafab79" dependencies = [ "futures-core", "futures-task", @@ -207,16 +273,17 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.13" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d71c2c65c57704c32f5241c1223167c2c3294fd34ac020c807ddbe6db287ba59" +checksum = "acc499defb3b348f8d8f3f66415835a9131856ff7714bf10dadfc4ec4bdb29a1" [[package]] name = "futures-macro" -version = "0.3.13" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea405816a5139fb39af82c2beb921d52143f556038378d6db21183a5c37fbfb7" +checksum = "a4c40298486cdf52cc00cd6d6987892ba502c7656a16a4192a9992b1ccedd121" dependencies = [ + "autocfg", "proc-macro-hack", "proc-macro2", "quote", @@ -225,22 +292,23 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.13" +version = "0.3.15" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "85754d98985841b7d4f5e8e6fbfa4a4ac847916893ec511a2917ccd8525b8bb3" +checksum = "a57bead0ceff0d6dde8f465ecd96c9338121bb7717d3e7b108059531870c4282" [[package]] name = "futures-task" -version = "0.3.13" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa189ef211c15ee602667a6fcfe1c1fd9e07d42250d2156382820fba33c9df80" +checksum = "8a16bef9fc1a4dddb5bee51c989e3fbba26569cbb0e31f5b303c184e3dd33dae" [[package]] name = "futures-util" -version = "0.3.13" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1812c7ab8aedf8d6f2701a43e1243acdbcc2b36ab26e2ad421eb99ac963d96d1" +checksum = "feb5c238d27e2bf94ffdfd27b2c29e3df4a68c4193bb6427384259e2bf191967" dependencies = [ + "autocfg", "futures-channel", "futures-core", "futures-io", @@ -261,6 +329,17 @@ version = "0.3.55" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f5f3913fa0bfe7ee1fd8248b6b9f42a5af4b9d65ec2dd2c3c26132b950ecfc2" +[[package]] +name = "getrandom" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fcd999463524c52659517fe2cea98493cfe485d10565e7b0fb07dbba7ad2753" +dependencies = [ + "cfg-if 1.0.0", + "libc", + "wasi", +] + [[package]] name = "hashbrown" version = "0.9.1" @@ -269,9 +348,9 @@ checksum = "d7afe4a420e3fe79967a00898cc1f4db7c8a49a9333a29f8a4bd76a253d5cd04" [[package]] name = "heck" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cbf45460356b7deeb5e3415b5563308c0a9b057c85e12b06ad551f98d0a6ac" +checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" dependencies = [ "unicode-segmentation", ] @@ -293,14 +372,23 @@ checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" [[package]] name = "indexmap" -version = "1.6.1" +version = "1.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fb1fa934250de4de8aef298d81c729a7d33d8c239daa3a7575e6b92bfc7313b" +checksum = "824845a0bf897a9042383849b02c1bc219c2383772efcd5c6f9766fa4b81aef3" dependencies = [ "autocfg", "hashbrown", ] +[[package]] +name = "instant" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61124eeebbd69b8190558df225adf7e4caafce0d743919e5d6b19652314ec5ec" +dependencies = [ + "cfg-if 1.0.0", +] + [[package]] name = "itoa" version = "0.4.7" @@ -315,9 +403,38 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.84" +version = "0.2.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "789da6d93f1b866ffe175afc5322a4d76c038605a1c3319bb57b06967ca98a36" + +[[package]] +name = "libdbus-sys" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc12a3bc971424edbbf7edaf6e5740483444db63aa8e23d3751ff12a30f306f0" +dependencies = [ + "pkg-config", +] + +[[package]] +name = "libsystemd-sys" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e03fd580bcecda68dcdcd5297085ade6a3dc552cd8b030d2b94a9b089ef7ab8" +dependencies = [ + "build-env", + "libc", + "pkg-config", +] + +[[package]] +name = "lock_api" +version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cca32fa0182e8c0989459524dc356b8f2b5c10f1b9eb521b7d182c03cf8c5ff" +checksum = 
"0382880606dff6d15c9476c416d18690b72742aa7b605bb6dd6ec9030fbf07eb" +dependencies = [ + "scopeguard", +] [[package]] name = "log" @@ -325,7 +442,7 @@ version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", ] [[package]] @@ -334,6 +451,15 @@ version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc" +[[package]] +name = "memoffset" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59accc507f1338036a0477ef61afdae33cde60840f4dfe481319ce3ad116ddf9" +dependencies = [ + "autocfg", +] + [[package]] name = "miniz_oxide" version = "0.4.4" @@ -346,9 +472,9 @@ dependencies = [ [[package]] name = "mio" -version = "0.7.7" +version = "0.7.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e50ae3f04d169fcc9bde0b547d1c205219b7157e07ded9c5aff03e0637cb3ed7" +checksum = "cf80d3e903b34e0bd7282b218398aec54e082c840d9baf8339e0080a0c542956" dependencies = [ "libc", "log", @@ -359,11 +485,10 @@ dependencies = [ [[package]] name = "miow" -version = "0.3.6" +version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a33c1b55807fbed163481b5ba66db4b2fa6cde694a5027be10fb724206c5897" +checksum = "b9f1c5b025cda876f66ef43a113f91ebc9f4ccef34843000e0adf6ebbab84e21" dependencies = [ - "socket2", "winapi", ] @@ -375,10 +500,23 @@ checksum = "b2ccba0cfe4fdf15982d1674c69b1fd80bad427d293849982668dfe454bd61f2" dependencies = [ "bitflags", "cc", - "cfg-if", + "cfg-if 1.0.0", "libc", ] +[[package]] +name = "nix" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c3728fec49d363a50a8828a190b379a446cc5cf085c06259bbbeb34447e4ec7" +dependencies = [ + "bitflags", + "cc", + "cfg-if 1.0.0", + "libc", + "memoffset", +] + [[package]] name = "ntapi" version = "0.3.6" @@ -423,16 +561,17 @@ version = "0.1.0" dependencies = [ "anyhow", "caps", - "nix", + "nix 0.19.1", + "quickcheck", "serde", "serde_json", ] [[package]] name = "once_cell" -version = "1.6.0" +version = "1.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ad167a2f54e832b82dbe003a046280dceffe5227b5f79e08e363a29638cfddd" +checksum = "af8b08b04175473088b46763e51ee54da5f9a164bc162f615b91bc179dbf15a3" [[package]] name = "os_str_bytes" @@ -440,6 +579,31 @@ version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "afb2e1c3ee07430c2cf76151675e583e0f19985fa6efae47d6848a3e2c824f85" +[[package]] +name = "parking_lot" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d7744ac029df22dca6284efe4e898991d28e3085c706c972bcd7da4a27a15eb" +dependencies = [ + "instant", + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa7a782938e745763fe6907fc6ba86946d72f49fe7e21de074e08128a99fb018" +dependencies = [ + "cfg-if 1.0.0", + "instant", + "libc", + "redox_syscall", + "smallvec", + "winapi", +] + [[package]] name = "pin-project-lite" version = "0.2.6" @@ -452,6 +616,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = 
"pkg-config" +version = "0.3.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3831453b3449ceb48b6d9c7ad7c96d5ea673e9b470a1dc578c2ce6521230884c" + [[package]] name = "prctl" version = "1.0.0" @@ -459,7 +629,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "059a34f111a9dee2ce1ac2826a68b24601c4298cfeb1a587c3cb493d5ab46f52" dependencies = [ "libc", - "nix", + "nix 0.21.0", ] [[package]] @@ -500,9 +670,9 @@ checksum = "bc881b2c22681370c6a780e47af9840ef841837bc98118431d4e1868bd0c1086" [[package]] name = "proc-macro2" -version = "1.0.24" +version = "1.0.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e0704ee1a7e00d7bb417d0770ea303c1bccbabf0ef1667dae92b5967f5f8a71" +checksum = "f0d8caf72986c1a598726adc988bb5984792ef84f5ee5aa50209145ee8077038" dependencies = [ "unicode-xid", ] @@ -522,15 +692,53 @@ dependencies = [ "libc", ] +[[package]] +name = "quickcheck" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "588f6378e4dd99458b60ec275b4477add41ce4fa9f64dcba6f15adccb19b50d6" +dependencies = [ + "env_logger", + "log", + "rand", +] + [[package]] name = "quote" -version = "1.0.8" +version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "991431c3519a3f36861882da93630ce66b52918dcf1b8e2fd66b397fc96f28df" +checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7" dependencies = [ "proc-macro2", ] +[[package]] +name = "rand" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ef9e7e66b4468674bfcb0c81af8b7fa0bb154fa9f28eb840da5c447baeb8d7e" +dependencies = [ + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34cf66eb183df1c5876e2dcf6b13d57340741e8dc255b48e40a26de954d06ae7" +dependencies = [ + "getrandom", +] + +[[package]] +name = "redox_syscall" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ab49abadf3f9e1c4bc499e8845e152ad87d2ad2d30371841171169e9d75feee" +dependencies = [ + "bitflags", +] + [[package]] name = "regex" version = "1.5.4" @@ -554,20 +762,26 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" +[[package]] +name = "scopeguard" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" + [[package]] name = "serde" -version = "1.0.123" +version = "1.0.126" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92d5161132722baa40d802cc70b15262b98258453e85e5d1d365c757c73869ae" +checksum = "ec7505abeacaec74ae4778d9d9328fe5a5d04253220a85c4ee022239fc996d03" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.123" +version = "1.0.126" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9391c295d64fc0abb2c556bad848f33cb8296276b1ad2677d1ae1ace4f258f31" +checksum = "963a7dbc9895aeac7ac90e74f34a5d5261828f79df35cbed41e10189d3804d43" dependencies = [ "proc-macro2", "quote", @@ -576,9 +790,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.61" +version = "1.0.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"4fceb2595057b6891a4ee808f70054bd2d12f0e97f1cbb78689b59f676df325a" +checksum = "799e97dc9fdae36a5c8b8f2cae9ce2ee9fdce2058c57a93e6099d919fd982f79" dependencies = [ "itoa", "ryu", @@ -586,22 +800,39 @@ dependencies = [ ] [[package]] -name = "slab" -version = "0.4.2" +name = "serial_test" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c111b5bd5695e56cffe5129854aa230b39c93a305372fdbb2668ca2394eea9f8" +checksum = "e0bccbcf40c8938196944a3da0e133e031a33f4d6b72db3bda3cc556e361905d" +dependencies = [ + "lazy_static", + "parking_lot", + "serial_test_derive", +] [[package]] -name = "socket2" -version = "0.3.19" +name = "serial_test_derive" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "122e570113d28d773067fab24266b66753f6ea915758651696b6e35e49f88d6e" +checksum = "b2acd6defeddb41eb60bb468f8825d0cfd0c2a76bc03bfd235b6a1dc4f6a1ad5" dependencies = [ - "cfg-if", - "libc", - "winapi", + "proc-macro2", + "quote", + "syn", ] +[[package]] +name = "slab" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f173ac3d1a7e3b28003f40de0b5ce7fe2710f9b9dc3fc38664cebee46b3b6527" + +[[package]] +name = "smallvec" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e" + [[package]] name = "strsim" version = "0.10.0" @@ -610,9 +841,9 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" [[package]] name = "syn" -version = "1.0.60" +version = "1.0.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c700597eca8a5a762beb35753ef6b94df201c81cca676604f547495a0d7f0081" +checksum = "a1e8cdbefb79a9a5a65e0db8b47b723ee907b7c7f8496c76a1770b5c310bab82" dependencies = [ "proc-macro2", "quote", @@ -620,12 +851,27 @@ dependencies = [ ] [[package]] -name = "termcolor" -version = "1.1.2" +name = "systemd" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dfed899f0eb03f32ee8c6a0aabdb8a7949659e3466561fc0adf54e26d88c5f4" +checksum = "f722cabda922e471742300045f56dbaa53fafbb4520fca304e51258019bfe91d" dependencies = [ - "winapi-util", + "cstr-argument", + "foreign-types", + "libc", + "libsystemd-sys", + "log", + "memchr", + "utf8-cstr", +] + +[[package]] +name = "tabwriter" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36205cfc997faadcc4b0b87aaef3fbedafe20d38d4959a7ca6ff803564051111" +dependencies = [ + "unicode-width", ] [[package]] @@ -639,18 +885,18 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.24" +version = "1.0.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0f4a65597094d4483ddaed134f409b2cb7c1beccf25201a9f73c719254fa98e" +checksum = "fa6f76457f59514c7eeb4e59d891395fab0b2fd1d40723ae737d64153392e9c6" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.24" +version = "1.0.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7765189610d8241a44529806d6fd1f2e0a08734313a35d5b3a556f92b381f3c0" +checksum = "8a36768c0fbf1bb15eca10defa29526bda730a2376c2ab4393ccfa16fb1a318d" dependencies = [ "proc-macro2", "quote", @@ -659,12 +905,11 @@ dependencies = [ [[package]] name = "time" -version = "0.1.44" +version = "0.1.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255" +checksum = "ca8a50ef2360fbd1eeb0ecd46795a87a19024eb4b53c5dc916ca1fd95fe62438" dependencies = [ "libc", - "wasi", "winapi", ] @@ -682,9 +927,15 @@ checksum = "9337591893a19b88d8d87f2cec1e73fad5cdfd10e5a6f349f498ad6ea2ffb1e3" [[package]] name = "unicode-xid" -version = "0.2.1" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" + +[[package]] +name = "utf8-cstr" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564" +checksum = "55bcbb425141152b10d5693095950b51c3745d019363fc2929ffd8f61449b628" [[package]] name = "vec_map" @@ -694,15 +945,15 @@ checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" [[package]] name = "version_check" -version = "0.9.2" +version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5a972e5669d67ba988ce3dc826706fb0a8b01471c088cb0b6110b805cc36aed" +checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe" [[package]] name = "wasi" -version = "0.10.0+wasi-snapshot-preview1" +version = "0.10.2+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" +checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" [[package]] name = "winapi" @@ -720,15 +971,6 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" -[[package]] -name = "winapi-util" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" -dependencies = [ - "winapi", -] - [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" @@ -743,16 +985,21 @@ dependencies = [ "caps", "chrono", "clap", + "dbus", + "fastrand", "futures", "libc", "log", "mio", - "nix", + "nix 0.19.1", "oci_spec", "once_cell", "prctl", "procfs", - "regex", + "quickcheck", "serde", "serde_json", + "serial_test", + "systemd", + "tabwriter", ] diff --git a/Cargo.toml b/Cargo.toml index 4cbe34dfeb..3f70b42c71 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,8 +5,12 @@ authors = ["utam0k "] edition = "2018" description = "A container runtime written in Rust" +[dependencies.clap] +version = "3.0.0-beta.2" +default-features = false +features = ["std", "suggestions", "derive"] + [dependencies] -clap = "3.0.0-beta.2" nix = "0.19.1" procfs = "0.9.1" caps = "0.5.1" @@ -17,8 +21,19 @@ libc = "0.2.84" log = "0.4" anyhow = "1.0" mio = { version = "0.7", features = ["os-ext", "os-poll"] } -chrono = "0.4" +chrono = { version="0.4", features = ["serde"] } once_cell = "1.6.0" futures = { version = "0.3", features = ["thread-pool"] } -regex = "1.5" oci_spec = { version = "0.1.0", path = "./oci_spec" } +systemd = { version = "0.8", default-features = false } +dbus = "0.9.2" +tabwriter = "1" +fastrand = "1.4.1" + +[dev-dependencies] +oci_spec = { version = "0.1.0", path = "./oci_spec", features = ["proptests"] } +quickcheck = "1" +serial_test = "0.5.1" + +[profile.release] +lto = true \ No newline at end of file diff --git a/README.md b/README.md index 1738a1e179..89dc3ba8a9 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,13 @@ # youki: A 
container runtime in Rust - +[![Discord](https://img.shields.io/discord/849943000770412575.svg?logo=discord)](https://discord.gg/zHnyXKSQFD) +[![GitHub commit activity](https://img.shields.io/github/commit-activity/m/containers/youki)](https://github.com/containers/youki/graphs/commit-activity) +[![GitHub contributors](https://img.shields.io/github/contributors/containers/youki)](https://github.com/containers/youki/graphs/contributors) +[![Github CI](https://github.com/containers/youki/actions/workflows/main.yml/badge.svg?branch=main)](https://github.com/containers/youki/actions) + +

 youki is an implementation of [runtime-spec](https://github.com/opencontainers/runtime-spec) in Rust, referring to [runc](https://github.com/opencontainers/runc).
@@ -23,6 +30,21 @@ Here is why I am rewriting a new container runtime in Rust.
 youki is not at the practical stage yet. However, it is getting closer to practical use, running with docker and passing all the default tests provided by [opencontainers/runtime-tools](https://github.com/opencontainers/runtime-tools).
 ![youki demo](docs/demo.gif)
+## Features
+
+- [x] run with docker
+- [ ] run with podman(WIP on [#24](https://github.com/containers/youki/issues/24))
+- [x] pivot root
+- [x] mount devices
+- [x] namespaces
+- [x] capabilities
+- [x] rlimits
+- [ ] cgroups v1(WIP on [#9](https://github.com/containers/youki/issues/9))
+- [ ] cgroups v2(WIP on [#78](https://github.com/containers/youki/issues/78))
+- [ ] seccomp(WIP on [#25](https://github.com/containers/youki/issues/25))
+- [ ] hooks(WIP on [#13](https://github.com/containers/youki/issues/13))
+- [ ] rootless(WIP on [#77](https://github.com/containers/youki/issues/77))
+
 # Getting Started
 Local build is only supported on linux.
@@ -31,16 +53,66 @@ For other platforms, please use the devcontainer that we prepared.
 ## Requires
 - Rust(See [here](https://www.rust-lang.org/tools/install))
-- Docker
+- Docker(See [here](https://docs.docker.com/engine/install))
+
+## Dependencies
-## Building
+### Debian, Ubuntu and related distributions
 ```sh
-$ git clone git@github.com:utam0k/youki.git
+$ sudo apt-get install \
+    pkg-config \
+    libsystemd-dev \
+    libdbus-glib-1-dev
+```
+
+### Fedora, Centos, RHEL and related distributions
+
+```sh
+$ sudo dnf install \
+    pkg-config \
+    systemd-devel \
+    dbus-devel
+```
+
+## Build
+
+```sh
+$ git clone git@github.com:containers/youki.git
 $ cd youki
-$ cargo build
+$ ./build.sh
+$ ./youki -h # you can get information about the youki command
 ```
+## Tutorial
+
+Let's try to run a container that executes `sleep 5` using youki.
+This tutorial may need to be run with root permission.
+
+```sh
+$ git clone git@github.com:containers/youki.git
+$ cd youki
+$ ./build.sh
+$ mkdir tutorial
+$ cd tutorial
+$ mkdir rootfs
+$ docker export $(docker create busybox) | tar -C rootfs -xvf -
+```
+
+Prepare a configuration file for the container that will run `sleep 5`.
+
+```sh
+$ curl https://gist.githubusercontent.com/utam0k/8ab419996633066eaf53ac9c66d962e7/raw/e81548f591f26ec03d85ce38b0443144573b4cf6/config.json -o config.json
+$ cd ../
+$ ./youki create -b tutorial tutorial_container
+$ ./youki state tutorial_container # You can see the state the container is in as it is being generated.
+$ ./youki start tutorial_container
+$ ./youki state tutorial_container # Run it within 5 seconds to see the running container.
+$ ./youki delete tutorial_container # Run it after the container is finished running.
+```
+
+Change the command to be executed in config.json and try something other than `sleep 5`.
+
 ## Usage
 Starting the docker daemon.
@@ -57,34 +129,29 @@ $ docker run -it --rm --runtime youki busybox
 ### Integration test
-Go and node-tap are required to run integration test. See the [opencontainers/runtime-tools]((https://github.com/opencontainers/runtime-tools) README for details.
+Go and node-tap are required to run the integration tests. See the [opencontainers/runtime-tools](https://github.com/opencontainers/runtime-tools) README for details.
``` $ git submodule update --init --recursive $ ./integration_test.sh ``` -# Design and implementation of youki +# Community -TBD(WIP on [#14](https://github.com/utam0k/youki/issues/14)) +We also have an active [Discord](https://discord.gg/h7R3HgWUct) if you'd like to come and chat with us. -# Features +# Design and implementation of youki -- [x] run with docker -- [ ] run with podman -- [x] pivot root -- [x] mount devices -- [x] namespaces -- [x] capabilities -- [x] rlimits -- [ ] cgroups v1(WIP on [#9](https://github.com/utam0k/youki/issues/9)) -- [ ] cgroups v2 -- [ ] seccomp -- [ ] hooks(WIP on [#13](https://github.com/utam0k/youki/issues/13)) -- [ ] rootless +TBD(WIP on [#14](https://github.com/containers/youki/issues/14)) # Contribution This project welcomes your PR and issues. For example, refactoring, adding features, correcting English, etc. If you need any help, you can contact me on [Twitter](https://twitter.com/utam0k). + +Thanks to all the people who already contributed! + + + + diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000000..3b75d20c12 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,3 @@ +## Security and Disclosure Information Policy for the Youki Project + +The Youki Project follows the [Security and Disclosure Information Policy](https://github.com/containers/common/blob/main/SECURITY.md) for the Containers Projects. diff --git a/build.sh b/build.sh new file mode 100755 index 0000000000..1beaa739a2 --- /dev/null +++ b/build.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +TARGET=${TARGET-x86_64-unknown-linux-gnu} +if [ "$TARGET" != "" ]; then + TGT="--target $TARGET" +fi +VERSION=debug +if [[ "$1" == "--release" ]]; then + VERSION=release +fi + +cargo build --verbose $TGT $1 +rm -f youki +cp target/$TARGET/$VERSION/youki . diff --git a/docs/doc-draft.md b/docs/doc-draft.md index 72eb37f2a4..c8dfd7d40b 100644 --- a/docs/doc-draft.md +++ b/docs/doc-draft.md @@ -7,6 +7,11 @@ These are references to various documentations and specifications, which can be - [OCI runtime specification] : The specification for a container runtime. Any OCI complaisant runtime must follow this. - [runc man pages] : has information on various commandline options supported by runc, can be used to understand commands and their options. - [cgroups man page](https://man7.org/linux/man-pages/man7/cgroups.7.html) : contains information about cgroups, their creation, deletion etc. +- [pseudoterminal man page](https://man7.org/linux/man-pages/man7/pty.7.html) : Information about the pseudoterminal system, useful to understand console_socket parameter in create subcommand +- [Unix Sockets man page](https://man7.org/linux/man-pages/man7/unix.7.html) : Useful to understand sockets +- [prctl man page](https://man7.org/linux/man-pages/man2/prctl.2.html) : Process control man pages +- [OCI Linux spec](https://github.com/opencontainers/runtime-spec/blob/master/config-linux.md) : Linux specific section of OCI Spec +- [pipe2 man page](https://man7.org/linux/man-pages/man2/pipe.2.html) : definition and usage of pipe2 --- @@ -49,5 +54,54 @@ On invoking Youki, main function parses args passed to it, which contains direct From there it matches subcommand arg with possible subcommand and takes appropriate actions, such as creating a new container, deleting a container erc. +### create container + +One thing to note is that in the end, container is just another process in Linux. 
It has its own control groups and namespaces, which give the program executing in it the impression that it is running on a complete system; but to the host it runs on, it is just another process, with attributes such as a pid, file descriptors, etc. associated with it like any other process.
+
+When given the create command, Youki will load the specification, configuration, sockets etc.,
+fork the process into a parent and a child (C1), fork the child process again (C2), apply the limits, namespaces etc. to the child of the child (C2), and run the command/program in C2. After the command/program is finished, C2 returns. C1 waits for C2 to exit, after which it also exits.
+
+### Process
+
+This handles the creation of processes, and thus of the container process. The hierarchy is:
+main youki process -> intermediate child process (C1) -> init process (C2)
+
+where -> indicates a fork.
+
+The main youki process sets up a pipe, forks the child process and waits for it to send back a message with the pid of the init process over that pipe. The child process sets up another pipe for the init process and forks the init process. The init process then notifies the child process that it is ready, which in turn notifies the main youki process that the init process has been forked, along with its pid.
+
+- [mio Token definition](https://docs.rs/mio/0.7.11/mio/struct.Token.html)
+- [oom-score-adj](https://dev.to/rrampage/surviving-the-linux-oom-killer-2ki9)
+- [unshare man page](https://man7.org/linux/man-pages/man1/unshare.1.html)
+- [user-namespace man page](https://man7.org/linux/man-pages/man7/user_namespaces.7.html)
+- [wait man page](https://man7.org/linux/man-pages/man3/wait.3p.html)
+
+### Container
+
+This contains the structures and functions that represent the container process, its state and its status.
+
+### Command
+
+This contains a trait that wraps commonly required syscalls, so that the rest of Youki is abstracted from their implementation details.
+This also provides the Linux implementation of the trait.
+
+- [pivot_root man page](https://man7.org/linux/man-pages/man2/pivot_root.2.html)
+- [umount2 man page](https://man7.org/linux/man-pages/man2/umount2.2.html)
+- [capabilities man page](https://man7.org/linux/man-pages/man7/capabilities.7.html)
+- [unshare man page](https://man7.org/linux/man-pages/man2/unshare.2.html)
+
 [oci runtime specification]: https://github.com/opencontainers/runtime-spec/blob/master/runtime.md
 [runc man pages]: (https://github.com/opencontainers/runc/blob/master/man/runc.8.md)
+
+## Capabilities
+
+This has functions to set and reset specific capabilities, as well as to drop extra privileges.
+
+- [Simple explanation of capabilities](https://blog.container-solutions.com/linux-capabilities-in-practice)
+- [man page for capabilities](https://man7.org/linux/man-pages/man7/capabilities.7.html)
+
+## Info
+
+This is primarily for printing info about the system running youki, such as the OS release, architecture, cpu info, cgroups info etc., as this info can be helpful when reporting issues.
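As a concrete illustration of the Info section, here is a minimal sketch (a hypothetical standalone snippet, not youki's actual implementation) that reads `/etc/os-release`, the file documented in the link below, and prints the OS name the way an info-style subcommand might report it:

```rust
// Hypothetical example, not youki's real `info` code: read /etc/os-release
// and print the PRETTY_NAME field, the kind of host detail that is useful
// when reporting issues.
use std::fs;
use std::io;

fn main() -> io::Result<()> {
    let content = fs::read_to_string("/etc/os-release")?;
    for line in content.lines() {
        // os-release is a simple KEY=VALUE file; values may be quoted.
        if let Some(value) = line.strip_prefix("PRETTY_NAME=") {
            println!("Operating System: {}", value.trim_matches('"'));
        }
    }
    Ok(())
}
```

The actual subcommand covers more than this (architecture, cpu and cgroups details as listed above); the snippet only shows the general read-and-print idea.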
+ +- [about /etc/os-release](https://www.freedesktop.org/software/systemd/man/os-release.html) diff --git a/integration_test.sh b/integration_test.sh index 9e63958680..dac099cdfa 100755 --- a/integration_test.sh +++ b/integration_test.sh @@ -3,10 +3,18 @@ root=$(pwd) cd integration_test/src/github.com/opencontainers/runtime-tools GOPATH=$root/integration_test make runtimetest validation-executables -test_cases=("default/default.t" "linux_cgroups_devices/linux_cgroups_devices.t" "linux_cgroups_hugetlb/linux_cgroups_hugetlb.t" "linux_cgroups_pids/linux_cgroups_pids.t" "linux_cgroups_memory/linux_cgroups_memory.t" "linux_cgroups_network/linux_cgroups_network.t") +test_cases=("default/default.t" "linux_cgroups_devices/linux_cgroups_devices.t" "linux_cgroups_hugetlb/linux_cgroups_hugetlb.t" +"linux_cgroups_pids/linux_cgroups_pids.t" "linux_cgroups_memory/linux_cgroups_memory.t" "linux_cgroups_network/linux_cgroups_network.t" +"linux_cgroups_cpus/linux_cgroups_cpus.t" "linux_cgroups_relative_cpus/linux_cgroups_relative_cpus.t" +"linux_cgroups_relative_devices/linux_cgroups_relative_devices.t" "linux_cgroups_relative_hugetlb/linux_cgroups_relative_hugetlb.t" +"linux_cgroups_relative_memory/linux_cgroups_relative_memory.t" "linux_cgroups_relative_network/linux_cgroups_relative_network.t" +"linux_cgroups_relative_pids/linux_cgroups_relative_pids.t" "create/create.t" "kill/kill.t" "delete/delete.t" "state/state.t") +# Record the tests that runc also fails to pass below, maybe we will fix this by origin integration test, issue: https://github.com/containers/youki/issues/56 +# no_paas_test_case=("start/start.t") for case in "${test_cases[@]}"; do echo "Running $case" - if [ 0 -ne $(sudo RUST_BACKTRACE=1 RUNTIME=$root/target/x86_64-unknown-linux-gnu/debug/youki $root/integration_test/src/github.com/opencontainers/runtime-tools/validation/$case | grep "not ok" | wc -l) ]; then + if [ 0 -ne $(sudo RUST_BACKTRACE=1 YOUKI_LOG_LEVEL=debug RUNTIME=$root/youki $root/integration_test/src/github.com/opencontainers/runtime-tools/validation/$case | grep "not ok" | wc -l) ]; then exit 1 fi + sleep 1 done diff --git a/oci_spec/.gitignore b/oci_spec/.gitignore new file mode 100644 index 0000000000..ea8c4bf7f3 --- /dev/null +++ b/oci_spec/.gitignore @@ -0,0 +1 @@ +/target diff --git a/oci_spec/Cargo.lock b/oci_spec/Cargo.lock index ae59a6f4f4..f001f72c59 100644 --- a/oci_spec/Cargo.lock +++ b/oci_spec/Cargo.lock @@ -2,6 +2,15 @@ # It is not intended for manual editing. 
version = 3 +[[package]] +name = "aho-corasick" +version = "0.7.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" +dependencies = [ + "memchr", +] + [[package]] name = "anyhow" version = "1.0.40" @@ -37,6 +46,16 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "env_logger" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17392a012ea30ef05a610aa97dfb49496e71c9f676b27879922ea5bdf60d9d3f" +dependencies = [ + "log", + "regex", +] + [[package]] name = "errno" version = "0.2.7" @@ -64,6 +83,17 @@ version = "0.3.55" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f5f3913fa0bfe7ee1fd8248b6b9f42a5af4b9d65ec2dd2c3c26132b950ecfc2" +[[package]] +name = "getrandom" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fcd999463524c52659517fe2cea98493cfe485d10565e7b0fb07dbba7ad2753" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + [[package]] name = "itoa" version = "0.4.7" @@ -76,6 +106,21 @@ version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "789da6d93f1b866ffe175afc5322a4d76c038605a1c3319bb57b06967ca98a36" +[[package]] +name = "log" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "memchr" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc" + [[package]] name = "nix" version = "0.19.1" @@ -95,6 +140,7 @@ dependencies = [ "anyhow", "caps", "nix", + "quickcheck", "serde", "serde_json", ] @@ -108,6 +154,17 @@ dependencies = [ "unicode-xid", ] +[[package]] +name = "quickcheck" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "588f6378e4dd99458b60ec275b4477add41ce4fa9f64dcba6f15adccb19b50d6" +dependencies = [ + "env_logger", + "log", + "rand", +] + [[package]] name = "quote" version = "1.0.9" @@ -117,6 +174,41 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "rand" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ef9e7e66b4468674bfcb0c81af8b7fa0bb154fa9f28eb840da5c447baeb8d7e" +dependencies = [ + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34cf66eb183df1c5876e2dcf6b13d57340741e8dc255b48e40a26de954d06ae7" +dependencies = [ + "getrandom", +] + +[[package]] +name = "regex" +version = "1.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.6.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" + [[package]] name = "ryu" version = "1.0.5" @@ -191,6 +283,12 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" +[[package]] +name = "wasi" +version = "0.10.2+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" + [[package]] name = "winapi" version = "0.3.9" diff --git a/oci_spec/Cargo.toml b/oci_spec/Cargo.toml index 5c9438a88b..7bdbb9be6f 100644 --- a/oci_spec/Cargo.toml +++ b/oci_spec/Cargo.toml @@ -3,10 +3,14 @@ name = "oci_spec" version = "0.1.0" edition = "2018" +[features] +default = [] +proptests = ["quickcheck"] + [dependencies] serde = { version = "1.0", features = ["derive"] } nix = "0.19.1" anyhow = "1.0" serde_json = "1.0" caps = "0.5.1" - +quickcheck = { version = "1", optional = true } diff --git a/oci_spec/src/lib.rs b/oci_spec/src/lib.rs index 9e228abbda..3dad07fd5f 100644 --- a/oci_spec/src/lib.rs +++ b/oci_spec/src/lib.rs @@ -1,609 +1,79 @@ -use nix::sys::stat::SFlag; +use anyhow::{bail, Context, Result}; +use caps::Capability; +use serde::{Deserialize, Serialize}; use std::collections::HashMap; +use std::convert::TryFrom; use std::fs::File; -use std::path::PathBuf; - -use anyhow::{bail, Result}; -use serde::{Deserialize, Serialize}; - -#[derive(Serialize, Deserialize, Debug, Clone)] -pub struct Platform { - #[serde(default)] - pub os: String, - #[serde(default)] - pub arch: String, -} - -#[derive(Default, PartialEq, Serialize, Deserialize, Debug, Clone)] -pub struct Box { - #[serde(default)] - pub height: u64, - #[serde(default)] - pub width: u64, -} - -#[derive(Serialize, Deserialize, Debug, Clone)] -#[serde(rename_all = "camelCase")] -pub struct User { - #[serde(default)] - pub uid: u32, - #[serde(default)] - pub gid: u32, - #[serde(default)] - pub additional_gids: Vec, - #[serde(default)] - pub username: String, -} - -#[derive(Deserialize, Debug, Clone)] -#[serde(rename_all = "camelCase")] -pub struct Process { - #[serde(default)] - pub terminal: bool, - #[serde(default)] - pub console_size: Box, - pub user: User, - pub args: Vec, - #[serde(default)] - pub env: Vec, - #[serde(default)] - pub cwd: String, - #[serde(default)] - pub no_new_privileges: bool, - #[serde(default)] - pub apparmor_profile: String, - #[serde(default)] - pub selinux_label: String, - #[serde(default, deserialize_with = "deserialize_caps")] - pub capabilities: Option, - #[serde(default)] - pub rlimits: Vec, -} - -#[derive(Serialize, Deserialize, Debug, Clone)] -pub struct LinuxRlimit { - #[serde(rename = "type")] - pub typ: LinuxRlimitType, - #[serde(default)] - pub hard: u64, - #[serde(default)] - pub soft: u64, -} -#[derive(Serialize, Deserialize, Debug, Clone, Copy)] -#[serde(rename_all = "SCREAMING_SNAKE_CASE")] -pub enum LinuxRlimitType { - RlimitCpu, - RlimitFsize, - RlimitData, - RlimitStack, - RlimitCore, - RlimitRss, - RlimitNproc, - RlimitNofile, - RlimitMemlock, - RlimitAs, - RlimitLocks, - RlimitSigpending, - RlimitMsgqueue, - RlimitNice, - RlimitRtprio, - RlimitRttime, -} - -use caps::Capability; -#[derive(Debug, Clone)] -pub struct LinuxCapabilityType { - pub cap: Capability, -} - -impl<'de> Deserialize<'de> for LinuxCapabilityType { - fn deserialize(desirializer: D) -> Result - where - D: serde::de::Deserializer<'de>, - { - let r: serde_json::Value = serde::Deserialize::deserialize(desirializer)?; - match r { - serde_json::Value::String(type_string) => { - let cap = match type_string.as_str() { - "CAP_CHOWN" => Capability::CAP_CHOWN, - "CAP_DAC_OVERRIDE" => Capability::CAP_DAC_OVERRIDE, - "CAP_DAC_READ_SEARCH" 
=> Capability::CAP_DAC_READ_SEARCH, - "CAP_FOWNER" => Capability::CAP_FOWNER, - "CAP_FSETID" => Capability::CAP_FSETID, - "CAP_KILL" => Capability::CAP_KILL, - "CAP_SETGID" => Capability::CAP_SETGID, - "CAP_SETUID" => Capability::CAP_SETUID, - "CAP_SETPCAP" => Capability::CAP_SETPCAP, - "CAP_LINUX_IMMUTABLE" => Capability::CAP_LINUX_IMMUTABLE, - "CAP_NET_BIND_SERVICE" => Capability::CAP_NET_BIND_SERVICE, - "CAP_NET_BROADCAST" => Capability::CAP_NET_BROADCAST, - "CAP_NET_ADMIN" => Capability::CAP_NET_ADMIN, - "CAP_NET_RAW" => Capability::CAP_NET_RAW, - "CAP_IPC_LOCK" => Capability::CAP_IPC_LOCK, - "CAP_IPC_OWNER" => Capability::CAP_IPC_OWNER, - "CAP_SYS_MODULE" => Capability::CAP_SYS_MODULE, - "CAP_SYS_RAWIO" => Capability::CAP_SYS_RAWIO, - "CAP_SYS_CHROOT" => Capability::CAP_SYS_CHROOT, - "CAP_SYS_PTRACE" => Capability::CAP_SYS_PTRACE, - "CAP_SYS_PACCT" => Capability::CAP_SYS_PACCT, - "CAP_SYS_ADMIN" => Capability::CAP_SYS_ADMIN, - "CAP_SYS_BOOT" => Capability::CAP_SYS_BOOT, - "CAP_SYS_NICE" => Capability::CAP_SYS_NICE, - "CAP_SYS_RESOURCE" => Capability::CAP_SYS_RESOURCE, - "CAP_SYS_TIME" => Capability::CAP_SYS_TIME, - "CAP_SYS_TTYCONFIG" => Capability::CAP_SYS_TTY_CONFIG, - "CAP_SYSLOG" => Capability::CAP_SYSLOG, - "CAP_MKNOD" => Capability::CAP_MKNOD, - "CAP_LEASE" => Capability::CAP_LEASE, - "CAP_AUDIT_WRITE" => Capability::CAP_AUDIT_WRITE, - "CAP_AUDIT_CONTROL" => Capability::CAP_AUDIT_CONTROL, - "CAP_AUDIT_READ" => Capability::CAP_AUDIT_READ, - "CAP_SETFCAP" => Capability::CAP_SETFCAP, - "CAP_MAC_OVERRIDE" => Capability::CAP_MAC_OVERRIDE, - "CAP_MAC_ADMIN" => Capability::CAP_MAC_ADMIN, - "CAP_WAKE_ALARM" => Capability::CAP_WAKE_ALARM, - "CAP_BLOCK_SUSPEND" => Capability::CAP_BLOCK_SUSPEND, - unknown_cap => { - return Err(serde::de::Error::custom(format!( - "{:?} is unexpected type in capabilites", - unknown_cap - ))) - } - }; - Ok(LinuxCapabilityType { cap }) - } - _ => Err(serde::de::Error::custom("Unexpected type in capabilites")), - } - } -} - -#[derive(Deserialize, Debug, Clone)] -pub struct LinuxCapabilities { - #[serde(skip_serializing_if = "Vec::is_empty")] - pub bounding: Vec, - #[serde(skip_serializing_if = "Vec::is_empty")] - pub effective: Vec, - #[serde(skip_serializing_if = "Vec::is_empty")] - pub inheritable: Vec, - #[serde(skip_serializing_if = "Vec::is_empty")] - pub permitted: Vec, - #[serde(skip_serializing_if = "Vec::is_empty")] - pub ambient: Vec, -} - -fn deserialize_caps<'de, D>(desirializer: D) -> Result, D::Error> -where - D: serde::de::Deserializer<'de>, -{ - let r: serde_json::Value = serde::Deserialize::deserialize(desirializer)?; - match r { - serde_json::Value::Null => Ok(None), - serde_json::Value::Array(a) => { - let caps = cap_from_array::(&a)?; - let capabilities = LinuxCapabilities { - bounding: caps.clone(), - effective: caps.clone(), - inheritable: caps.clone(), - permitted: caps.clone(), - ambient: caps, - }; - - Ok(Some(capabilities)) - } - serde_json::Value::Object(o) => { - let capabilities = LinuxCapabilities { - bounding: cap_from_object::(&o, "bounding")?, - effective: cap_from_object::(&o, "effective")?, - inheritable: cap_from_object::(&o, "inheritable")?, - permitted: cap_from_object::(&o, "permitted")?, - ambient: cap_from_object::(&o, "ambient")?, - }; - - Ok(Some(capabilities)) - } - _ => Err(serde::de::Error::custom("Unexpected value in capabilites")), - } -} - -fn cap_from_object<'de, D>( - o: &serde_json::Map, - key: &str, -) -> Result, D::Error> -where - D: serde::de::Deserializer<'de>, -{ - if let Some(v) = o.get(key) { - match *v 
{ - serde_json::Value::Null => Ok(Vec::new()), - serde_json::Value::Array(ref a) => cap_from_array::(a), - _ => Err(serde::de::Error::custom( - "Unexpected value in capability set", - )), - } - } else { - Ok(Vec::new()) - } -} - -fn cap_from_array<'de, D>(a: &[serde_json::Value]) -> Result, D::Error> -where - D: serde::de::Deserializer<'de>, -{ - let mut caps = Vec::new(); - for c in a { - match LinuxCapabilityType::deserialize(c) { - Ok(val) => caps.push(val), - Err(_) => { - let msg = format!("Capability '{}' is not valid", c); - return Err(serde::de::Error::custom(msg)); - } - } - } - Ok(caps) -} - -#[derive(Serialize, Deserialize, Debug, Clone)] -pub struct Root { - #[serde(default)] - pub path: PathBuf, - #[serde(default)] - pub readonly: bool, -} - -#[derive(Serialize, Deserialize, Debug, Clone)] -pub struct Mount { - #[serde(default)] - pub destination: PathBuf, - #[serde(default, rename = "type")] - pub typ: String, - #[serde(default)] - pub source: PathBuf, - #[serde(default)] - pub options: Vec, -} - -#[derive(Serialize, Deserialize, Debug, Clone)] -#[serde(rename_all = "camelCase")] -pub struct LinuxIdMapping { - #[serde(default, rename = "hostID")] - pub host_id: u32, - #[serde(default, rename = "containerID")] - pub container_id: u32, - #[serde(default)] - pub size: u32, -} - -// a is for LinuxDeviceCgroup -#[derive(Serialize, Deserialize, Debug, Clone, Copy)] -#[serde(rename_all = "lowercase")] -pub enum LinuxDeviceType { - B, - C, - U, - P, - A, -} - -impl Default for LinuxDeviceType { - fn default() -> LinuxDeviceType { - LinuxDeviceType::A - } -} - -impl LinuxDeviceType { - pub fn to_sflag(&self) -> Result { - Ok(match self { - Self::B => SFlag::S_IFBLK, - Self::C | LinuxDeviceType::U => SFlag::S_IFCHR, - Self::P => SFlag::S_IFIFO, - Self::A => bail!("type a is not allowed for linux device"), - }) - } - - pub fn as_str(&self) -> &str { - match self { - Self::B => "b", - Self::C => "c", - Self::U => "u", - Self::P => "p", - Self::A => "a", - } - } -} +use std::path::{Path, PathBuf}; -#[derive(Serialize, Deserialize, Debug, Clone)] -pub struct LinuxDeviceCgroup { - #[serde(default)] - pub allow: bool, - #[serde(default, rename = "type")] - pub typ: LinuxDeviceType, - pub major: Option, - pub minor: Option, - #[serde(default)] - pub access: String, -} - -impl ToString for LinuxDeviceCgroup { - fn to_string(&self) -> String { - let major = self - .major - .map(|mj| mj.to_string()) - .unwrap_or_else(|| "*".to_string()); - let minor = self - .minor - .map(|mi| mi.to_string()) - .unwrap_or_else(|| "*".to_string()); - format!( - "{} {}:{} {}", - self.typ.as_str(), - &major, - &minor, - &self.access - ) - } -} - -#[derive(Serialize, Deserialize, Debug, Clone)] -pub struct LinuxMemory { - pub limit: Option, - pub reservation: Option, - pub swap: Option, - pub kernel: Option, - #[serde(rename = "kernelTCP")] - pub kernel_tcp: Option, - pub swappiness: Option, -} - -#[derive(Serialize, Deserialize, Debug, Clone)] -#[serde(rename_all = "camelCase")] -pub struct LinuxCpu { - pub shares: Option, - pub quota: Option, - pub period: Option, - pub realtime_runtime: Option, - pub realtime_period: Option, - #[serde(default)] - pub cpus: String, - #[serde(default)] - pub mems: String, -} - -#[derive(Serialize, Deserialize, Debug, Clone)] -pub struct LinuxPids { - #[serde(default)] - pub limit: i64, -} - -#[derive(Serialize, Deserialize, Debug, Clone)] -#[serde(rename_all = "camelCase")] -pub struct LinuxWeightDevice { - #[serde(default)] - pub major: i64, - #[serde(default)] - pub minor: i64, - 
pub weight: Option, - pub leaf_weight: Option, -} - -#[derive(Serialize, Deserialize, Debug, Clone)] -pub struct LinuxThrottleDevice { - #[serde(default)] - pub major: i64, - #[serde(default)] - pub minor: i64, - #[serde(default)] - pub rate: u64, -} - -#[derive(Serialize, Deserialize, Debug, Clone)] -#[serde(rename_all = "camelCase")] -pub struct LinuxBlockIo { - pub blkio_weight: Option, - pub blkio_leaf_weight: Option, - #[serde(default)] - pub blkio_weight_device: Vec, - #[serde(default, rename = "throttleReadBpsDevice")] - pub blkio_throttle_read_bps_device: Vec, - #[serde(default, rename = "throttleWriteBpsDevice")] - pub blkio_throttle_write_bps_device: Vec, - #[serde(default, rename = "throttleReadIOPSDevice")] - pub blkio_throttle_read_iops_device: Vec, - #[serde(default, rename = "throttleWriteIOPSDevice")] - pub blkio_throttle_write_iops_device: Vec, -} - -#[derive(Serialize, Deserialize, Debug, Clone)] -#[serde(rename_all = "camelCase")] -pub struct LinuxHugepageLimit { - #[serde(default)] - pub page_size: String, - #[serde(default)] - pub limit: i64, -} - -#[derive(Serialize, Deserialize, Debug, Clone)] -pub struct LinuxInterfacePriority { - #[serde(default)] - pub name: String, - #[serde(default)] - pub priority: u32, -} - -impl ToString for LinuxInterfacePriority { - fn to_string(&self) -> String { - format!("{} {}\n", self.name, self.priority) - } -} - -#[derive(Serialize, Deserialize, Debug, Clone)] -#[serde(rename_all = "camelCase")] -pub struct LinuxNetwork { - #[serde(rename = "classID")] - pub class_id: Option, - #[serde(default)] - pub priorities: Vec, -} - -#[derive(Default, Serialize, Deserialize, Debug, Clone)] -#[serde(rename_all = "camelCase")] -pub struct LinuxResources { - #[serde(default)] - pub devices: Vec, - #[serde(default)] - pub disable_oom_killer: bool, - pub oom_score_adj: Option, - pub memory: Option, - #[serde(rename = "LinuxCPU")] - pub cpu: Option, - pub pids: Option, - #[serde(rename = "blockIO")] - pub block_io: Option, - #[serde(default)] - pub hugepage_limits: Vec, - pub network: Option, -} - -#[derive(Serialize, Deserialize, Debug, Clone, Copy)] -#[serde(rename_all = "snake_case")] -pub enum LinuxNamespaceType { - Mount = 0x00020000, - Cgroup = 0x02000000, - Uts = 0x04000000, - Ipc = 0x08000000, - User = 0x10000000, - Pid = 0x20000000, - Network = 0x40000000, -} - -#[derive(Serialize, Deserialize, Debug, Clone)] -pub struct LinuxNamespace { - #[serde(rename = "type")] - pub typ: LinuxNamespaceType, - pub path: Option, -} - -#[derive(Serialize, Deserialize, Debug, Clone)] -#[serde(rename_all = "camelCase")] -pub struct LinuxDevice { - #[serde(default)] - pub path: PathBuf, - #[serde(rename = "type")] - pub typ: LinuxDeviceType, - #[serde(default)] - pub major: u64, - #[serde(default)] - pub minor: u64, - pub file_mode: Option, - pub uid: Option, - pub gid: Option, -} - -impl From<&LinuxDevice> for LinuxDeviceCgroup { - fn from(linux_device: &LinuxDevice) -> LinuxDeviceCgroup { - LinuxDeviceCgroup { - allow: true, - typ: linux_device.typ, - major: Some(linux_device.major as i64), - minor: Some(linux_device.minor as i64), - access: "rwm".to_string(), - } - } -} - -#[derive(Serialize, Deserialize, Debug, Clone, Copy)] -#[serde(rename_all = "SCREAMING_SNAKE_CASE")] -#[repr(u32)] -pub enum LinuxSeccompAction { - ScmpActKill = 0x00000000, - ScmpActTrap = 0x00030000, - ScmpActErrno = 0x00050001, - ScmpActTrace = 0x7ff00001, - ScmpActAllow = 0x7fff0000, -} - -#[allow(clippy::enum_clike_unportable_variant)] -#[derive(Serialize, Deserialize, Debug, 
Clone, Copy)] -#[serde(rename_all = "SCREAMING_SNAKE_CASE")] -pub enum Arch { - ScmpArchNative = 0x00000000, - ScmpArchX86 = 0x40000003, - ScmpArchX86_64 = 0xc000003e, - ScmpArchX32 = 0x4000003e, - ScmpArchArm = 0x40000028, - ScmpArchAarch64 = 0xc00000b7, - ScmpArchMips = 0x00000008, - ScmpArchMips64 = 0x80000008, - ScmpArchMips64n32 = 0xa0000008, - ScmpArchMipsel = 0x40000008, - ScmpArchMipsel64 = 0xc0000008, - ScmpArchMipsel64n32 = 0xe0000008, - ScmpArchPpc = 0x00000014, - ScmpArchPpc64 = 0x80000015, - ScmpArchPpc64le = 0xc0000015, - ScmpArchS390 = 0x00000016, - ScmpArchS390x = 0x80000016, -} - -#[derive(Serialize, Deserialize, Debug, Clone, Copy)] -#[serde(rename_all = "SCREAMING_SNAKE_CASE")] -#[repr(u32)] -pub enum LinuxSeccompOperator { - ScmpCmpNe = 1, - ScmpCmpLt = 2, - ScmpCmpLe = 3, - ScmpCmpEq = 4, - ScmpCmpGe = 5, - ScmpCmpGt = 6, - ScmpCmpMaskedEq = 7, -} +mod linux; +mod miscellaneous; +mod process; +mod test; -#[derive(Serialize, Deserialize, Debug, Clone)] -#[serde(rename_all = "camelCase")] -pub struct Linux { - #[serde(default, rename = "LinuxIDMapping")] - pub uid_mappings: Vec, - #[serde(default, rename = "LinuxIDMapping")] - pub gid_mappings: Vec, - #[serde(default)] - pub sysctl: HashMap, - pub resources: Option, - #[serde(default)] - pub cgroups_path: PathBuf, - #[serde(default)] - pub namespaces: Vec, - #[serde(default)] - pub devices: Vec, - #[serde(default)] - pub rootfs_propagation: String, - #[serde(default)] - pub masked_paths: Vec, - #[serde(default)] - pub readonly_paths: Vec, - #[serde(default)] - pub mount_label: String, -} +// re-export for ease of use +pub use linux::*; +pub use miscellaneous::*; +pub use process::*; -#[derive(Deserialize, Debug, Clone)] +// Base configuration for the container +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] pub struct Spec { + // Version of the Open Container Initiative Runtime Specification with which the bundle complies #[serde(default, rename = "ociVersion")] pub version: String, + // Computer os and arch pub platform: Option, + // Configures container process pub process: Process, + // Configures container's root filesystem pub root: Root, + // Configures container's hostname #[serde(default)] pub hostname: String, + // Configures additional mounts (on top of Root) #[serde(default)] pub mounts: Vec, + // Arbitrary metadata for container #[serde(default)] pub annotations: HashMap, + // Platform specific config for Linux based containers pub linux: Option, } +// This gives a basic boilerplate for Spec that can be used calling Default::default(). +// The values given are similar to the defaults seen in docker and runc, it creates a containerized shell! 
+// (see respective types default impl for more info)
+impl Default for Spec {
+    fn default() -> Self {
+        Spec {
+            // Defaults to most current oci version
+            version: String::from("1.0.2-dev"),
+            platform: Some(Default::default()),
+            process: Default::default(),
+            root: Default::default(),
+            // Defaults hostname to youki
+            hostname: String::from("youki"),
+            mounts: get_default_mounts(),
+            // Defaults to empty metadata
+            annotations: Default::default(),
+            linux: Some(Default::default()),
+        }
+    }
+}
+
 impl Spec {
-    pub fn load(path: &str) -> Result<Spec> {
-        let file = File::open(path)?;
-        let mut spec: Spec = serde_json::from_reader(&file)?;
-        spec.root.path = std::fs::canonicalize(spec.root.path)?;
+    pub fn load<P: AsRef<Path>>(path: P) -> Result<Spec> {
+        let path = path.as_ref();
+        let file =
+            File::open(path).with_context(|| format!("load spec: failed to open {:?}", path))?;
+        let spec: Spec = serde_json::from_reader(&file)?;
         Ok(spec)
     }
+
+    pub fn canonicalize_rootfs(&mut self) -> Result<()> {
+        self.root.path = std::fs::canonicalize(&self.root.path)
+            .with_context(|| format!("failed to canonicalize {:?}", self.root.path))?;
+        Ok(())
+    }
 }
diff --git a/oci_spec/src/linux.rs b/oci_spec/src/linux.rs
new file mode 100644
index 0000000000..47b9929e9d
--- /dev/null
+++ b/oci_spec/src/linux.rs
@@ -0,0 +1,559 @@
+use super::*;
+use nix::sys::stat::SFlag;
+
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
+#[serde(rename_all = "camelCase")]
+pub struct Linux {
+    // UIDMapping for supporting user namespaces
+    #[serde(default, rename = "uidMappings")]
+    pub uid_mappings: Vec<LinuxIdMapping>,
+    // GIDMapping for supporting group namespaces
+    #[serde(default, rename = "gidMappings")]
+    pub gid_mappings: Vec<LinuxIdMapping>,
+    // Sysctl that are set for the container on start
+    #[serde(default)]
+    pub sysctl: HashMap<String, String>,
+    // Resources contain cgroup info for handling resource constraints
+    #[serde(default)]
+    pub resources: Option<LinuxResources>,
+    // CgroupsPath specifies the path to cgroups that are created and/or joined by the container.
+    // The path is expected to be relative to the cgroups mountpoint.
+    // If resources are specified, the cgroups at CgroupsPath will be updated based on resources.
+    #[serde(default)]
+    pub cgroups_path: Option<PathBuf>,
+    // Namespaces contains the namespaces that are created and/or joined by the container
+    #[serde(default)]
+    pub namespaces: Vec<LinuxNamespace>,
+    // Devices are a list of device nodes that are created for the container
+    #[serde(default)]
+    pub devices: Vec<LinuxDevice>,
+    // The rootfs mount propagation mode for the container
+    #[serde(default)]
+    pub rootfs_propagation: String,
+    // Masks over the provided paths inside the container so they cannot be read
+    #[serde(default)]
+    pub masked_paths: Vec<String>,
+    // Sets the provided paths as RO inside the container
+    #[serde(default)]
+    pub readonly_paths: Vec<String>,
+    // Specifies the selinux context for the mounts in the container
+    #[serde(default)]
+    pub mount_label: String,
+}
+
+// Default impl for Linux (see functions for more info)
+impl Default for Linux {
+    fn default() -> Self {
+        Linux {
+            // Creates empty Vec
+            uid_mappings: Default::default(),
+            // Creates empty Vec
+            gid_mappings: Default::default(),
+            // Empty sysctl HashMap
+            sysctl: Default::default(),
+            resources: Some(LinuxResources {
+                devices: vec![LinuxDeviceCgroup {
+                    access: "rwm".to_string(),
+                    allow: false,
+                    typ: Default::default(),
+                    major: Default::default(),
+                    minor: Default::default(),
+                }],
+                disable_oom_killer: Default::default(),
+                oom_score_adj: Default::default(),
+                memory: Default::default(),
+                cpu: Default::default(),
+                pids: Default::default(),
+                block_io: Default::default(),
+                hugepage_limits: Default::default(),
+                network: Default::default(),
+                freezer: Default::default(),
+            }),
+            // Defaults to None
+            cgroups_path: Default::default(),
+            namespaces: get_default_namespaces(),
+            // Empty Vec
+            devices: Default::default(),
+            // Empty String
+            rootfs_propagation: Default::default(),
+            masked_paths: get_default_maskedpaths(),
+            readonly_paths: get_default_readonly_paths(),
+            // Empty String
+            mount_label: Default::default(),
+        }
+    }
+}
+
+// Specifies UID/GID mappings
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
+#[serde(rename_all = "camelCase")]
+pub struct LinuxIdMapping {
+    // Starting uid/gid on the host to be mapped to container_id
+    #[serde(default, rename = "hostID")]
+    pub host_id: u32,
+    // Starting uid/gid in the container
+    #[serde(default, rename = "containerID")]
+    pub container_id: u32,
+    // Number of IDs to be mapped
+    #[serde(default)]
+    pub size: u32,
+}
+
+// Device types
+#[derive(Serialize, Deserialize, Debug, Clone, Copy, PartialEq)]
+#[serde(rename_all = "lowercase")]
+pub enum LinuxDeviceType {
+    // block (buffered)
+    B,
+    // character (unbuffered)
+    C,
+    // character (unbuffered)
+    U,
+    // FIFO
+    P,
+    // all device types
+    A,
+}
+
+impl Default for LinuxDeviceType {
+    fn default() -> LinuxDeviceType {
+        LinuxDeviceType::A
+    }
+}
+
+impl LinuxDeviceType {
+    pub fn to_sflag(&self) -> Result<SFlag> {
+        Ok(match self {
+            Self::B => SFlag::S_IFBLK,
+            Self::C | LinuxDeviceType::U => SFlag::S_IFCHR,
+            Self::P => SFlag::S_IFIFO,
+            Self::A => bail!("type a is not allowed for linux device"),
+        })
+    }
+
+    pub fn as_str(&self) -> &str {
+        match self {
+            Self::B => "b",
+            Self::C => "c",
+            Self::U => "u",
+            Self::P => "p",
+            Self::A => "a",
+        }
+    }
+}
+
+// Represents a device rule for the devices specified to the device controller
+#[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq)]
+pub struct LinuxDeviceCgroup {
+    // allow or deny
+    #[serde(default)]
+    pub allow: bool,
+    // Device type, block, char, etc.
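+    // (rendered as the leading character of the cgroup v1 device rule produced by
+    //  the ToString impl below, e.g. the "a" in "a *:* rwm" or the "c" in "c 5:1 rwm")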
+ #[serde(default, rename = "type")] + pub typ: LinuxDeviceType, + // Device's major number + pub major: Option, + // Device's minor number + pub minor: Option, + // Cgroup access premissions format, rwm. + #[serde(default)] + pub access: String, +} + +impl ToString for LinuxDeviceCgroup { + fn to_string(&self) -> String { + let major = self + .major + .map(|mj| mj.to_string()) + .unwrap_or_else(|| "*".to_string()); + let minor = self + .minor + .map(|mi| mi.to_string()) + .unwrap_or_else(|| "*".to_string()); + format!( + "{} {}:{} {}", + self.typ.as_str(), + &major, + &minor, + &self.access + ) + } +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] +pub struct LinuxMemory { + pub limit: Option, + pub reservation: Option, + pub swap: Option, + pub kernel: Option, + #[serde(rename = "kernelTCP")] + pub kernel_tcp: Option, + pub swappiness: Option, +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct LinuxCpu { + pub shares: Option, + pub quota: Option, + pub period: Option, + pub realtime_runtime: Option, + pub realtime_period: Option, + #[serde(default)] + pub cpus: Option, + #[serde(default)] + pub mems: Option, +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] +pub struct LinuxPids { + #[serde(default)] + pub limit: i64, +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct LinuxWeightDevice { + #[serde(default)] + pub major: i64, + #[serde(default)] + pub minor: i64, + pub weight: Option, + pub leaf_weight: Option, +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] +pub struct LinuxThrottleDevice { + #[serde(default)] + pub major: i64, + #[serde(default)] + pub minor: i64, + #[serde(default)] + pub rate: u64, +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct LinuxBlockIo { + pub blkio_weight: Option, + pub blkio_leaf_weight: Option, + #[serde(default)] + pub blkio_weight_device: Vec, + #[serde(default, rename = "throttleReadBpsDevice")] + pub blkio_throttle_read_bps_device: Vec, + #[serde(default, rename = "throttleWriteBpsDevice")] + pub blkio_throttle_write_bps_device: Vec, + #[serde(default, rename = "throttleReadIOPSDevice")] + pub blkio_throttle_read_iops_device: Vec, + #[serde(default, rename = "throttleWriteIOPSDevice")] + pub blkio_throttle_write_iops_device: Vec, +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct LinuxHugepageLimit { + #[serde(default)] + pub page_size: String, + #[serde(default)] + pub limit: i64, +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] +pub struct LinuxInterfacePriority { + #[serde(default)] + pub name: String, + #[serde(default)] + pub priority: u32, +} + +impl ToString for LinuxInterfacePriority { + fn to_string(&self) -> String { + format!("{} {}\n", self.name, self.priority) + } +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct LinuxNetwork { + #[serde(rename = "classID")] + pub class_id: Option, + #[serde(default)] + pub priorities: Vec, +} + +// Resource constraints for container +#[derive(Default, Serialize, Deserialize, Debug, Clone, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct LinuxResources { + // Devices configures the device allow list + #[serde(default)] + pub devices: Vec, + // Disables the OOM killer for out of memory conditions + #[serde(default)] + pub 
disable_oom_killer: bool, + // Specify an oom_score_adj for container + pub oom_score_adj: Option, + // Memory usage restrictions + pub memory: Option, + // CPU resource restrictions + pub cpu: Option, + // Task resource restrictions + pub pids: Option, + // BlockIO restrictions + #[serde(rename = "blockIO")] + pub block_io: Option, + // Hugelb restrictions + #[serde(default)] + pub hugepage_limits: Vec, + // Network usage restrictions + pub network: Option, + pub freezer: Option, +} + +#[derive(Serialize, Deserialize, Debug, Clone, Copy, PartialEq)] +#[serde(rename_all = "snake_case")] +pub enum LinuxNamespaceType { + // Mount Namespace for isolating mount points + Mount = 0x00020000, + // Cgroup Namespace for isolating cgroup hierarchies + Cgroup = 0x02000000, + // Uts Namespace for isolating hostname and NIS domain name + Uts = 0x04000000, + // Ipc Namespace for isolating System V, IPC, POSIX message queues + Ipc = 0x08000000, + // User Namespace for isolating user and group ids + User = 0x10000000, + // PID Namespace for isolating process ids + Pid = 0x20000000, + // Network Namespace for isolating network devices, ports, stacks etc. + Network = 0x40000000, +} + +impl TryFrom<&str> for LinuxNamespaceType { + type Error = anyhow::Error; + + fn try_from(namespace: &str) -> Result { + match namespace { + "mnt" => Ok(LinuxNamespaceType::Mount), + "cgroup" => Ok(LinuxNamespaceType::Cgroup), + "uts" => Ok(LinuxNamespaceType::Uts), + "ipc" => Ok(LinuxNamespaceType::Ipc), + "user" => Ok(LinuxNamespaceType::User), + "pid" => Ok(LinuxNamespaceType::Pid), + "net" => Ok(LinuxNamespaceType::Network), + _ => bail!("unknown namespace {}, could not convert", namespace), + } + } +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] +pub struct LinuxNamespace { + #[serde(rename = "type")] + pub typ: LinuxNamespaceType, + pub path: Option, +} + +// Utility function to get default namespaces +pub fn get_default_namespaces() -> Vec { + let mut default_namespace = Vec::new(); + default_namespace.push(LinuxNamespace { + typ: LinuxNamespaceType::Pid, + path: Default::default(), + }); + default_namespace.push(LinuxNamespace { + typ: LinuxNamespaceType::Network, + path: Default::default(), + }); + default_namespace.push(LinuxNamespace { + typ: LinuxNamespaceType::Ipc, + path: Default::default(), + }); + default_namespace.push(LinuxNamespace { + typ: LinuxNamespaceType::Uts, + path: Default::default(), + }); + default_namespace.push(LinuxNamespace { + typ: LinuxNamespaceType::Mount, + path: Default::default(), + }); + default_namespace +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct LinuxDevice { + #[serde(default)] + pub path: PathBuf, + #[serde(rename = "type")] + pub typ: LinuxDeviceType, + #[serde(default)] + pub major: u64, + #[serde(default)] + pub minor: u64, + pub file_mode: Option, + pub uid: Option, + pub gid: Option, +} + +impl From<&LinuxDevice> for LinuxDeviceCgroup { + fn from(linux_device: &LinuxDevice) -> LinuxDeviceCgroup { + LinuxDeviceCgroup { + allow: true, + typ: linux_device.typ, + major: Some(linux_device.major as i64), + minor: Some(linux_device.minor as i64), + access: "rwm".to_string(), + } + } +} + +#[derive(Serialize, Deserialize, Debug, Clone, Copy, PartialEq)] +#[serde(rename_all = "SCREAMING_SNAKE_CASE")] +#[repr(u32)] +pub enum LinuxSeccompAction { + ScmpActKill = 0x00000000, + ScmpActTrap = 0x00030000, + ScmpActErrno = 0x00050001, + ScmpActTrace = 0x7ff00001, + ScmpActAllow = 0x7fff0000, +} + 
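// A minimal usage sketch for the namespace helpers above (illustrative only; the
// function name `extra_namespaces` is hypothetical, and `TryFrom` plus the anyhow
// macros are assumed to be in scope through the crate-level `use super::*;`).
fn extra_namespaces() -> anyhow::Result<Vec<LinuxNamespace>> {
    // Start from the five namespaces get_default_namespaces() creates (pid, net, ipc, uts, mnt)
    let mut namespaces = get_default_namespaces();
    // "user" maps to LinuxNamespaceType::User; unknown strings bail with an error
    namespaces.push(LinuxNamespace {
        typ: LinuxNamespaceType::try_from("user")?,
        path: None,
    });
    Ok(namespaces)
}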
+#[allow(clippy::enum_clike_unportable_variant)] +#[derive(Serialize, Deserialize, Debug, Clone, Copy, PartialEq)] +#[serde(rename_all = "SCREAMING_SNAKE_CASE")] +pub enum Arch { + ScmpArchNative = 0x00000000, + ScmpArchX86 = 0x40000003, + ScmpArchX86_64 = 0xc000003e, + ScmpArchX32 = 0x4000003e, + ScmpArchArm = 0x40000028, + ScmpArchAarch64 = 0xc00000b7, + ScmpArchMips = 0x00000008, + ScmpArchMips64 = 0x80000008, + ScmpArchMips64n32 = 0xa0000008, + ScmpArchMipsel = 0x40000008, + ScmpArchMipsel64 = 0xc0000008, + ScmpArchMipsel64n32 = 0xe0000008, + ScmpArchPpc = 0x00000014, + ScmpArchPpc64 = 0x80000015, + ScmpArchPpc64le = 0xc0000015, + ScmpArchS390 = 0x00000016, + ScmpArchS390x = 0x80000016, +} + +#[derive(Serialize, Deserialize, Debug, Clone, Copy, PartialEq)] +#[serde(rename_all = "SCREAMING_SNAKE_CASE")] +#[repr(u32)] +pub enum LinuxSeccompOperator { + ScmpCmpNe = 1, + ScmpCmpLt = 2, + ScmpCmpLe = 3, + ScmpCmpEq = 4, + ScmpCmpGe = 5, + ScmpCmpGt = 6, + ScmpCmpMaskedEq = 7, +} + +// Default masks paths, cannot read these host files +pub fn get_default_maskedpaths() -> Vec { + vec![ + // For example now host interfaces such as + // bluetooth cannot be accessed due to /proc/acpi + "/proc/acpi".to_string(), + "/proc/asound".to_string(), + "/proc/kcore".to_string(), + "/proc/keys".to_string(), + "/proc/latency_stats".to_string(), + "/proc/timer_list".to_string(), + "/proc/timer_stats".to_string(), + "/proc/sched_debug".to_string(), + "/sys/firmware".to_string(), + "/proc/scsi".to_string(), + ] +} + +// Default readonly paths, +// For example most containers shouldn't have permission to write to /proc/sys +pub fn get_default_readonly_paths() -> Vec { + vec![ + "/proc/bus".to_string(), + "/proc/fs".to_string(), + "/proc/irq".to_string(), + "/proc/sys".to_string(), + "/proc/sysrq-trigger".to_string(), + ] +} + +#[derive(Serialize, Deserialize, Debug, Clone, Copy, PartialEq, Eq)] +pub enum FreezerState { + Undefined, + Frozen, + Thawed, +} + +#[cfg(feature = "proptests")] +use quickcheck::{Arbitrary, Gen}; + +#[cfg(feature = "proptests")] +fn some_none_generator_util(g: &mut Gen) -> Option { + let choice = g.choose(&[true, false]).unwrap(); + match choice { + false => None, + true => Some(T::arbitrary(g)), + } +} + +#[cfg(feature = "proptests")] +impl Arbitrary for LinuxDeviceCgroup { + fn arbitrary(g: &mut Gen) -> LinuxDeviceCgroup { + let typ_choices = ["b", "c", "u", "p", "a"]; + + let typ_chosen = g.choose(&typ_choices).unwrap(); + + let typ = match typ_chosen.to_string().as_str() { + "b" => LinuxDeviceType::B, + "c" => LinuxDeviceType::C, + "u" => LinuxDeviceType::U, + "p" => LinuxDeviceType::P, + "a" => LinuxDeviceType::A, + _ => LinuxDeviceType::A, + }; + + let access_choices = ["rwm", "m"]; + LinuxDeviceCgroup { + allow: bool::arbitrary(g), + typ, + major: some_none_generator_util::(g), + minor: some_none_generator_util::(g), + access: g.choose(&access_choices).unwrap().to_string(), + } + } +} + +#[cfg(feature = "proptests")] +impl Arbitrary for LinuxMemory { + fn arbitrary(g: &mut Gen) -> LinuxMemory { + LinuxMemory { + kernel: some_none_generator_util::(g), + kernel_tcp: some_none_generator_util::(g), + limit: some_none_generator_util::(g), + reservation: some_none_generator_util::(g), + swap: some_none_generator_util::(g), + swappiness: some_none_generator_util::(g), + } + } +} + +#[cfg(feature = "proptests")] +impl Arbitrary for LinuxHugepageLimit { + fn arbitrary(g: &mut Gen) -> LinuxHugepageLimit { + let unit_choice = ["KB", "MB", "GB"]; + let unit = g.choose(&unit_choice).unwrap(); 
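+        // e.g. "2048KB": the generated page_size matches the [0-9]+[KMG]B format
+        // that the hugetlb controllers parse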
+ let page_size = u64::arbitrary(g).to_string() + unit; + + LinuxHugepageLimit { + page_size, + limit: i64::arbitrary(g), + } + } +} diff --git a/oci_spec/src/miscellaneous.rs b/oci_spec/src/miscellaneous.rs new file mode 100644 index 0000000000..8522acac9b --- /dev/null +++ b/oci_spec/src/miscellaneous.rs @@ -0,0 +1,145 @@ +use super::*; +use std::env; + +// os and architecture of computer +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] +pub struct Platform { + #[serde(default)] + pub os: String, + #[serde(default)] + pub arch: String, +} + +/// Gets os and arch of system by default +impl Default for Platform { + fn default() -> Self { + Platform { + os: env::consts::OS.to_string(), + arch: env::consts::ARCH.to_string(), + } + } +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] +pub struct Root { + // Path to the container's root filesystem + #[serde(default)] + pub path: PathBuf, + // Makes container root file system readonly before process is executed + #[serde(default)] + pub readonly: bool, +} + +// Default path for container root is "./rootfs" from config.json, with readonly true +impl Default for Root { + fn default() -> Self { + Root { + path: PathBuf::from("rootfs"), + readonly: true, + } + } +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] +pub struct Mount { + // Path where mount will be placed in container + #[serde(default)] + pub destination: PathBuf, + // Specifies mount type + #[serde(default, rename = "type")] + pub typ: String, + // source path of mount + #[serde(default)] + pub source: PathBuf, + // mount options (https://man7.org/linux/man-pages/man8/mount.8.html) + #[serde(default)] + pub options: Vec, +} + +// utility function to generate default config for mounts +pub fn get_default_mounts() -> Vec { + let mut default_mounts = Vec::new(); + default_mounts.push(Mount { + destination: PathBuf::from("/proc"), + typ: String::from("proc"), + source: PathBuf::from("proc"), + options: Vec::new(), + }); + + default_mounts.push(Mount { + destination: PathBuf::from("/dev"), + typ: String::from("tmpfs"), + source: PathBuf::from("tmpfs"), + options: vec![ + "nosuid".to_string(), + "strictatime".to_string(), + "mode=755".to_string(), + "size=65536k".to_string(), + ], + }); + + default_mounts.push(Mount { + destination: PathBuf::from("/dev/pts"), + typ: String::from("devpts"), + source: PathBuf::from("devpts"), + options: vec![ + "nosuid".to_string(), + "noexec".to_string(), + "newinstance".to_string(), + "ptmxmode=0666".to_string(), + "mode=0620".to_string(), + "gid=5".to_string(), + ], + }); + + default_mounts.push(Mount { + destination: PathBuf::from("/dev/shm"), + typ: String::from("tmpfs"), + source: PathBuf::from("shm"), + options: vec![ + "nosuid".to_string(), + "noexec".to_string(), + "nodev".to_string(), + "mode=1777".to_string(), + "size=65536k".to_string(), + ], + }); + + default_mounts.push(Mount { + destination: PathBuf::from("/dev/mqueue"), + typ: String::from("mqueue"), + source: PathBuf::from("mqueue"), + options: vec![ + "nosuid".to_string(), + "noexec".to_string(), + "nodev".to_string(), + ], + }); + + default_mounts.push(Mount { + destination: PathBuf::from("/sys"), + typ: String::from("sysfs"), + source: PathBuf::from("sysfs"), + options: vec![ + "nosuid".to_string(), + "noexec".to_string(), + "nodev".to_string(), + "ro".to_string(), + ], + }); + + default_mounts.push(Mount { + destination: PathBuf::from("/sys/fs/cgroup"), + typ: String::from("cgroup"), + source: PathBuf::from("cgroup"), + options: vec![ + 
"nosuid".to_string(), + "noexec".to_string(), + "nodev".to_string(), + "relatime".to_string(), + "ro".to_string(), + ], + }); + + default_mounts +} diff --git a/oci_spec/src/process.rs b/oci_spec/src/process.rs new file mode 100644 index 0000000000..9c37003b27 --- /dev/null +++ b/oci_spec/src/process.rs @@ -0,0 +1,342 @@ +use super::*; + +// Specifies the container process. This property is used when youki start is called. +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct Process { + // Creates interactive terminal for container + #[serde(default)] + pub terminal: bool, + // Specifies size of console + #[serde(default)] + pub console_size: Box, + // User info for process + pub user: User, + // Specifies the binary and arguments for the application to execute + pub args: Vec, + // Populates the process enviroment + #[serde(default)] + pub env: Vec, + // current working directory relative to container root + #[serde(default)] + pub cwd: String, + // Determines whether additional privileges can be gained by process + #[serde(default)] + pub no_new_privileges: bool, + // Apparmor profile for the container + #[serde(default)] + pub apparmor_profile: String, + // Selinux context that the container is run as + #[serde(default)] + pub selinux_label: String, + // Linux capabilities that are kept for the process + #[serde(default)] + pub capabilities: Option, + // RLIMIT options to apply to the process + #[serde(default)] + pub rlimits: Vec, +} + +// Default impl for processes in the container +impl Default for Process { + fn default() -> Self { + Process { + // Creates an interactive terminal for container by default + terminal: true, + // Gives default console size of 0, 0 + console_size: Default::default(), + // Gives process a uid and gid of 0 (root) + user: Default::default(), + // By default executes sh command, giving user shell + args: vec![String::from("sh")], + // Sets linux default enviroment for binaries and default xterm emulator + env: vec![ + String::from("PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"), + String::from("TERM=xterm"), + ], + // Sets cwd of process to the container root by default + cwd: String::from("/"), + // By default does not allow process to gain additional privileges + no_new_privileges: true, + // Empty String, no default apparmor + apparmor_profile: Default::default(), + // Empty String, no default selinux + selinux_label: Default::default(), + // See impl Default for LinuxCapabilities + capabilities: Some(Default::default()), + // Sets the default maximum of 1024 files the process can open + // This is the same as the linux kernel default + rlimits: vec![LinuxRlimit { + typ: LinuxRlimitType::RlimitNofile, + hard: 1024, + soft: 1024, + }], + } + } +} + +// Specifies the size of console +#[derive(Default, PartialEq, Serialize, Deserialize, Debug, Clone)] +pub struct Box { + #[serde(default)] + pub height: u64, + #[serde(default)] + pub width: u64, +} +// RLimit types available in youki (see https://man7.org/linux/man-pages/man2/getrlimit.2.html) +#[derive(Serialize, Deserialize, Debug, Clone, Copy, PartialEq)] +#[serde(rename_all = "SCREAMING_SNAKE_CASE")] +pub enum LinuxRlimitType { + // Limit in seconds of the amount of CPU time that the process can consume + RlimitCpu, + // Maximum size in bytes of the files that the process creates + RlimitFsize, + // Maximum size of the process's data segment (init data, uninit data and heap) in bytes + RlimitData, + // Maximum size of the proces 
stack in bytes + RlimitStack, + // Maximum size of a core dump file in bytes + RlimitCore, + // Limit on the process's resident set (the number of virtual pages resident in RAM) + RlimitRss, + // Limit on number of threads for the real uid calling processes + RlimitNproc, + // One greator than the maximum number of file descritors that one process may open + RlimitNofile, + // Maximum number of bytes of memory that may be locked into RAM + RlimitMemlock, + // Maximum size of the process's virtual memory(address space) in bytes + RlimitAs, + // Limit on the number of locks and leases for the process + RlimitLocks, + // Limit on number of signals that may be queued for the process + RlimitSigpending, + // Limit on the number of bytes that can be allocated for POSIX message queue + RlimitMsgqueue, + // Specifies a ceiling to which the process's nice value can be raised + RlimitNice, + // Specifies a ceiling on the real-time priority + RlimitRtprio, + // This is a limit (in microseconds) on the amount of CPU time + // that a process scheduled under a real-time scheduling + // policy may consume without making a blocking system call + RlimitRttime, +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] +pub struct LinuxRlimit { + // Type of Rlimit to set + #[serde(rename = "type")] + pub typ: LinuxRlimitType, + // Hard limit for specified type + #[serde(default)] + pub hard: u64, + // Soft limit for specified type + #[serde(default)] + pub soft: u64, +} + +// user id (uid) and group id (gid) tracks file permssions +#[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct User { + // user id + #[serde(default)] + pub uid: u32, + // group id + #[serde(default)] + pub gid: u32, + // additional group ids set for the container's process + #[serde(default)] + pub additional_gids: Vec, + //user name + #[serde(default)] + pub username: String, +} + +// Linux capabilities (see https://man7.org/linux/man-pages/man7/capabilities.7.html) +#[derive(Serialize, Deserialize, PartialEq, Debug, Clone, Copy)] +#[allow(non_camel_case_types)] +pub enum LinuxCapabilityType { + CAP_CHOWN, + CAP_DAC_OVERRIDE, + CAP_DAC_READ_SEARCH, + CAP_FOWNER, + CAP_FSETID, + CAP_KILL, + CAP_SETGID, + CAP_SETUID, + CAP_SETPCAP, + CAP_LINUX_IMMUTABLE, + CAP_NET_BIND_SERVICE, + CAP_NET_BROADCAST, + CAP_NET_ADMIN, + CAP_NET_RAW, + CAP_IPC_LOCK, + CAP_IPC_OWNER, + CAP_SYS_MODULE, + CAP_SYS_RAWIO, + CAP_SYS_CHROOT, + CAP_SYS_PTRACE, + CAP_SYS_PACCT, + CAP_SYS_ADMIN, + CAP_SYS_BOOT, + CAP_SYS_NICE, + CAP_SYS_RESOURCE, + CAP_SYS_TIME, + CAP_SYS_TTY_CONFIG, + CAP_MKNOD, + CAP_LEASE, + CAP_AUDIT_WRITE, + CAP_AUDIT_CONTROL, + CAP_SETFCAP, + CAP_MAC_OVERRIDE, + CAP_MAC_ADMIN, + CAP_SYSLOG, + CAP_WAKE_ALARM, + CAP_BLOCK_SUSPEND, + CAP_AUDIT_READ, + CAP_PERFMON, + CAP_BPF, + CAP_CHECKPOINT_RESTORE, +} + +// impl Into and From for LinuxCapabilityType +impl From for LinuxCapabilityType { + fn from(cap: Capability) -> Self { + match cap { + Capability::CAP_CHOWN => LinuxCapabilityType::CAP_CHOWN, + Capability::CAP_DAC_OVERRIDE => LinuxCapabilityType::CAP_DAC_OVERRIDE, + Capability::CAP_DAC_READ_SEARCH => LinuxCapabilityType::CAP_DAC_READ_SEARCH, + Capability::CAP_FOWNER => LinuxCapabilityType::CAP_FOWNER, + Capability::CAP_FSETID => LinuxCapabilityType::CAP_FSETID, + Capability::CAP_KILL => LinuxCapabilityType::CAP_KILL, + Capability::CAP_SETGID => LinuxCapabilityType::CAP_SETGID, + Capability::CAP_SETUID => LinuxCapabilityType::CAP_SETUID, + Capability::CAP_SETPCAP => 
LinuxCapabilityType::CAP_SETPCAP, + Capability::CAP_LINUX_IMMUTABLE => LinuxCapabilityType::CAP_LINUX_IMMUTABLE, + Capability::CAP_NET_BIND_SERVICE => LinuxCapabilityType::CAP_NET_BIND_SERVICE, + Capability::CAP_NET_BROADCAST => LinuxCapabilityType::CAP_NET_BROADCAST, + Capability::CAP_NET_ADMIN => LinuxCapabilityType::CAP_NET_ADMIN, + Capability::CAP_NET_RAW => LinuxCapabilityType::CAP_NET_RAW, + Capability::CAP_IPC_LOCK => LinuxCapabilityType::CAP_IPC_LOCK, + Capability::CAP_IPC_OWNER => LinuxCapabilityType::CAP_IPC_OWNER, + Capability::CAP_SYS_MODULE => LinuxCapabilityType::CAP_SYS_MODULE, + Capability::CAP_SYS_RAWIO => LinuxCapabilityType::CAP_SYS_RAWIO, + Capability::CAP_SYS_CHROOT => LinuxCapabilityType::CAP_SYS_CHROOT, + Capability::CAP_SYS_PTRACE => LinuxCapabilityType::CAP_SYS_PTRACE, + Capability::CAP_SYS_PACCT => LinuxCapabilityType::CAP_SYS_PACCT, + Capability::CAP_SYS_ADMIN => LinuxCapabilityType::CAP_SYS_ADMIN, + Capability::CAP_SYS_BOOT => LinuxCapabilityType::CAP_SYS_BOOT, + Capability::CAP_SYS_NICE => LinuxCapabilityType::CAP_SYS_NICE, + Capability::CAP_SYS_RESOURCE => LinuxCapabilityType::CAP_SYS_RESOURCE, + Capability::CAP_SYS_TIME => LinuxCapabilityType::CAP_SYS_TIME, + Capability::CAP_SYS_TTY_CONFIG => LinuxCapabilityType::CAP_SYS_TTY_CONFIG, + Capability::CAP_SYSLOG => LinuxCapabilityType::CAP_SYSLOG, + Capability::CAP_MKNOD => LinuxCapabilityType::CAP_MKNOD, + Capability::CAP_LEASE => LinuxCapabilityType::CAP_LEASE, + Capability::CAP_AUDIT_WRITE => LinuxCapabilityType::CAP_AUDIT_WRITE, + Capability::CAP_AUDIT_CONTROL => LinuxCapabilityType::CAP_AUDIT_CONTROL, + Capability::CAP_AUDIT_READ => LinuxCapabilityType::CAP_AUDIT_READ, + Capability::CAP_SETFCAP => LinuxCapabilityType::CAP_SETFCAP, + Capability::CAP_MAC_OVERRIDE => LinuxCapabilityType::CAP_MAC_OVERRIDE, + Capability::CAP_MAC_ADMIN => LinuxCapabilityType::CAP_MAC_ADMIN, + Capability::CAP_WAKE_ALARM => LinuxCapabilityType::CAP_WAKE_ALARM, + Capability::CAP_BLOCK_SUSPEND => LinuxCapabilityType::CAP_BLOCK_SUSPEND, + Capability::CAP_PERFMON => LinuxCapabilityType::CAP_PERFMON, + Capability::CAP_BPF => LinuxCapabilityType::CAP_BPF, + Capability::CAP_CHECKPOINT_RESTORE => LinuxCapabilityType::CAP_CHECKPOINT_RESTORE, + Capability::__Nonexhaustive => unreachable!("unexpected Linux Capability Type"), + } + } +} + +// impl Into and From for caps::Capability +impl From for Capability { + fn from(linux_cap: LinuxCapabilityType) -> Self { + match linux_cap { + LinuxCapabilityType::CAP_CHOWN => Capability::CAP_CHOWN, + LinuxCapabilityType::CAP_DAC_OVERRIDE => Capability::CAP_DAC_OVERRIDE, + LinuxCapabilityType::CAP_DAC_READ_SEARCH => Capability::CAP_DAC_READ_SEARCH, + LinuxCapabilityType::CAP_FOWNER => Capability::CAP_FOWNER, + LinuxCapabilityType::CAP_FSETID => Capability::CAP_FSETID, + LinuxCapabilityType::CAP_KILL => Capability::CAP_KILL, + LinuxCapabilityType::CAP_SETGID => Capability::CAP_SETGID, + LinuxCapabilityType::CAP_SETUID => Capability::CAP_SETUID, + LinuxCapabilityType::CAP_SETPCAP => Capability::CAP_SETPCAP, + LinuxCapabilityType::CAP_LINUX_IMMUTABLE => Capability::CAP_LINUX_IMMUTABLE, + LinuxCapabilityType::CAP_NET_BIND_SERVICE => Capability::CAP_NET_BIND_SERVICE, + LinuxCapabilityType::CAP_NET_BROADCAST => Capability::CAP_NET_BROADCAST, + LinuxCapabilityType::CAP_NET_ADMIN => Capability::CAP_NET_ADMIN, + LinuxCapabilityType::CAP_NET_RAW => Capability::CAP_NET_RAW, + LinuxCapabilityType::CAP_IPC_LOCK => Capability::CAP_IPC_LOCK, + LinuxCapabilityType::CAP_IPC_OWNER => Capability::CAP_IPC_OWNER, + 
LinuxCapabilityType::CAP_SYS_MODULE => Capability::CAP_SYS_MODULE, + LinuxCapabilityType::CAP_SYS_RAWIO => Capability::CAP_SYS_RAWIO, + LinuxCapabilityType::CAP_SYS_CHROOT => Capability::CAP_SYS_CHROOT, + LinuxCapabilityType::CAP_SYS_PTRACE => Capability::CAP_SYS_PTRACE, + LinuxCapabilityType::CAP_SYS_PACCT => Capability::CAP_SYS_PACCT, + LinuxCapabilityType::CAP_SYS_ADMIN => Capability::CAP_SYS_ADMIN, + LinuxCapabilityType::CAP_SYS_BOOT => Capability::CAP_SYS_BOOT, + LinuxCapabilityType::CAP_SYS_NICE => Capability::CAP_SYS_NICE, + LinuxCapabilityType::CAP_SYS_RESOURCE => Capability::CAP_SYS_RESOURCE, + LinuxCapabilityType::CAP_SYS_TIME => Capability::CAP_SYS_TIME, + LinuxCapabilityType::CAP_SYS_TTY_CONFIG => Capability::CAP_SYS_TTY_CONFIG, + LinuxCapabilityType::CAP_SYSLOG => Capability::CAP_SYSLOG, + LinuxCapabilityType::CAP_MKNOD => Capability::CAP_MKNOD, + LinuxCapabilityType::CAP_LEASE => Capability::CAP_LEASE, + LinuxCapabilityType::CAP_AUDIT_WRITE => Capability::CAP_AUDIT_WRITE, + LinuxCapabilityType::CAP_AUDIT_CONTROL => Capability::CAP_AUDIT_CONTROL, + LinuxCapabilityType::CAP_AUDIT_READ => Capability::CAP_AUDIT_READ, + LinuxCapabilityType::CAP_SETFCAP => Capability::CAP_SETFCAP, + LinuxCapabilityType::CAP_MAC_OVERRIDE => Capability::CAP_MAC_OVERRIDE, + LinuxCapabilityType::CAP_MAC_ADMIN => Capability::CAP_MAC_ADMIN, + LinuxCapabilityType::CAP_WAKE_ALARM => Capability::CAP_WAKE_ALARM, + LinuxCapabilityType::CAP_BLOCK_SUSPEND => Capability::CAP_BLOCK_SUSPEND, + LinuxCapabilityType::CAP_PERFMON => Capability::CAP_PERFMON, + LinuxCapabilityType::CAP_BPF => Capability::CAP_BPF, + LinuxCapabilityType::CAP_CHECKPOINT_RESTORE => Capability::CAP_CHECKPOINT_RESTORE, + } + } +} + +// see https://man7.org/linux/man-pages/man7/capabilities.7.html +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] +pub struct LinuxCapabilities { + // Limiting superset for capabilities that can be added to the inheritable set (for security) + #[serde(skip_serializing_if = "Vec::is_empty", default)] + pub bounding: Vec, + // Capability set used by kernel to perform permission checks for container process + #[serde(skip_serializing_if = "Vec::is_empty", default)] + pub effective: Vec, + // set of capabilities preserved across an execve(2) + #[serde(skip_serializing_if = "Vec::is_empty", default)] + pub inheritable: Vec, + // Limiting superset for the effective capabilities that the container may assume + #[serde(skip_serializing_if = "Vec::is_empty", default)] + pub permitted: Vec, + // set of capabilities preserved across non root execve(2), + // capabilities must be both permitted and inheritable to be ambient + #[serde(skip_serializing_if = "Vec::is_empty", default)] + pub ambient: Vec, +} + +// Default container's linux capabilities: +// CAP_AUDIT_WRITE gives container ability to write to linux audit logs, +// CAP_KILL gives container ability to kill non root processes +// CAP_NET_BIND_SERVICE allows container to bind to ports below 1024 +impl Default for LinuxCapabilities { + fn default() -> Self { + let audit_write = LinuxCapabilityType::CAP_AUDIT_WRITE; + let cap_kill = LinuxCapabilityType::CAP_KILL; + let net_bind = LinuxCapabilityType::CAP_NET_BIND_SERVICE; + let default_vec = vec![audit_write, cap_kill, net_bind]; + LinuxCapabilities { + bounding: default_vec.clone(), + effective: default_vec.clone(), + inheritable: default_vec.clone(), + permitted: default_vec.clone(), + ambient: default_vec.clone(), + } + } +} diff --git a/oci_spec/src/test.rs b/oci_spec/src/test.rs new file mode 100644 index 
0000000000..6ffa58b066 --- /dev/null +++ b/oci_spec/src/test.rs @@ -0,0 +1,44 @@ +#[cfg(test)] +use super::*; + +#[test] +fn test_caps_to_linux_caps() { + let spec: Spec = Default::default(); + if let Some(linux) = spec.process.capabilities { + let linux_caps = linux.bounding[0]; + let convert_caps: Capability = linux_caps.into(); + assert_eq!(convert_caps, Capability::CAP_AUDIT_WRITE); + assert_eq!( + linux_caps, + LinuxCapabilityType::from(Capability::CAP_AUDIT_WRITE) + ); + } +} + +#[test] +fn serialize_and_deserialize_spec() { + let spec: Spec = Default::default(); + let json_string = serde_json::to_string(&spec).unwrap(); + let new_spec = serde_json::from_str(&json_string).unwrap(); + assert_eq!(spec, new_spec); +} + +#[test] +fn test_linux_device_cgroup_to_string() { + let ldc = LinuxDeviceCgroup { + allow: true, + typ: LinuxDeviceType::A, + major: None, + minor: None, + access: "rwm".into(), + }; + assert_eq!(ldc.to_string(), "a *:* rwm"); + let ldc = LinuxDeviceCgroup { + allow: true, + typ: LinuxDeviceType::A, + major: Some(1), + minor: Some(9), + access: "rwm".into(), + }; + assert_eq!(ldc.to_string(), "a 1:9 rwm"); +} diff --git a/src/capabilities.rs b/src/capabilities.rs index 9e95c49f39..f6c1f2917d 100644 --- a/src/capabilities.rs +++ b/src/capabilities.rs @@ -1,43 +1,40 @@ -use crate::{ - command::Command, -}; +//! Handles Management of Capabilities +use crate::command::Syscall; use caps::*; use anyhow::Result; use oci_spec::{LinuxCapabilities, LinuxCapabilityType}; +/// Converts a list of capability types to capabilities has set fn to_set(caps: &[LinuxCapabilityType]) -> CapsHashSet { let mut capabilities = CapsHashSet::new(); for c in caps { - capabilities.insert(c.cap); + let caps = *c; + capabilities.insert(caps.into()); } capabilities } -pub fn reset_effective(command: &impl Command) -> Result<()> { +/// reset capabilities of process calling this to effective capabilities +/// effective capability set is set of capabilities used by kernel to perform checks +/// see https://man7.org/linux/man-pages/man7/capabilities.7.html for more information +pub fn reset_effective(syscall: &impl Syscall) -> Result<()> { log::debug!("reset all caps"); - command.set_capability(CapSet::Effective, &caps::all())?; + syscall.set_capability(CapSet::Effective, &caps::all())?; Ok(()) } -pub fn drop_privileges(cs: &LinuxCapabilities, command: &impl Command) -> Result<()> { - let all = caps::all(); +/// Drop any extra granted capabilities, and reset to defaults which are in oci specification +pub fn drop_privileges(cs: &LinuxCapabilities, syscall: &impl Syscall) -> Result<()> { log::debug!("dropping bounding capabilities to {:?}", cs.bounding); - for c in all.difference(&to_set(&cs.bounding)) { - match c { - Capability::CAP_PERFMON | Capability::CAP_CHECKPOINT_RESTORE | Capability::CAP_BPF => { - log::warn!("{:?} doesn't support.", c); - continue; - } - _ => caps::drop(None, CapSet::Bounding, *c)?, - } - } + syscall.set_capability(CapSet::Bounding, &to_set(&cs.bounding))?; - command.set_capability(CapSet::Effective, &to_set(&cs.effective))?; - command.set_capability(CapSet::Permitted, &to_set(&cs.permitted))?; - command.set_capability(CapSet::Inheritable, &to_set(&cs.inheritable))?; + syscall.set_capability(CapSet::Effective, &to_set(&cs.effective))?; + syscall.set_capability(CapSet::Permitted, &to_set(&cs.permitted))?; + syscall.set_capability(CapSet::Inheritable, &to_set(&cs.inheritable))?; - if let Err(e) = command.set_capability(CapSet::Ambient, &to_set(&cs.ambient)) { + // check specifically 
for ambient, as those might not always be available + if let Err(e) = syscall.set_capability(CapSet::Ambient, &to_set(&cs.ambient)) { log::error!("failed to set ambient capabilities: {}", e); } Ok(()) @@ -46,11 +43,11 @@ pub fn drop_privileges(cs: &LinuxCapabilities, command: &impl Command) -> Result #[cfg(test)] mod tests { use super::*; - use crate::command::test::TestHelperCommand; + use crate::command::test::TestHelperSyscall; #[test] fn test_reset_effective() { - let test_command = TestHelperCommand::default(); + let test_command = TestHelperSyscall::default(); assert!(reset_effective(&test_command).is_ok()); let set_capability_args: Vec<_> = test_command .get_set_capability_args() diff --git a/src/cgroups/common.rs b/src/cgroups/common.rs new file mode 100644 index 0000000000..02c373dfbd --- /dev/null +++ b/src/cgroups/common.rs @@ -0,0 +1,164 @@ +use std::{ + env, + fmt::{Debug, Display}, + fs, + io::Write, + path::{Path, PathBuf}, +}; + +use anyhow::{bail, Context, Result}; +use nix::unistd::Pid; +use oci_spec::{FreezerState, LinuxResources}; +use procfs::process::Process; +use systemd::daemon::booted; + +use crate::cgroups::v1; +use crate::cgroups::v2; + +pub const CGROUP_PROCS: &str = "cgroup.procs"; +pub const DEFAULT_CGROUP_ROOT: &str = "/sys/fs/cgroup"; + +pub trait CgroupManager { + /// Adds a task specified by its pid to the cgroup + fn add_task(&self, pid: Pid) -> Result<()>; + /// Applies resource restrictions to the cgroup + fn apply(&self, linux_resources: &LinuxResources) -> Result<()>; + /// Removes the cgroup + fn remove(&self) -> Result<()>; + // Sets the freezer cgroup to the specified state + fn freeze(&self, state: FreezerState) -> Result<()>; +} + +#[derive(Debug)] +pub enum Cgroup { + V1, + V2, +} + +impl Display for Cgroup { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let print = match *self { + Cgroup::V1 => "v1", + Cgroup::V2 => "v2", + }; + + write!(f, "{}", print) + } +} + +#[inline] +pub fn write_cgroup_file_str>(path: P, data: &str) -> Result<()> { + fs::OpenOptions::new() + .create(false) + .write(true) + .truncate(false) + .open(path.as_ref()) + .with_context(|| format!("failed to open {:?}", path.as_ref()))? + .write_all(data.as_bytes()) + .with_context(|| format!("failed to write to {:?}", path.as_ref()))?; + + Ok(()) +} + +#[inline] +pub fn write_cgroup_file, T: ToString>(path: P, data: T) -> Result<()> { + fs::OpenOptions::new() + .create(false) + .write(true) + .truncate(false) + .open(path.as_ref()) + .with_context(|| format!("failed to open {:?}", path.as_ref()))? + .write_all(data.to_string().as_bytes()) + .with_context(|| format!("failed to write to {:?}", path.as_ref()))?; + + Ok(()) +} + +pub fn get_supported_cgroup_fs() -> Result> { + let cgroup_mount = Process::myself()? + .mountinfo()? + .into_iter() + .find(|m| m.fs_type == "cgroup"); + + let cgroup2_mount = Process::myself()? + .mountinfo()? + .into_iter() + .find(|m| m.fs_type == "cgroup2"); + + let mut cgroups = vec![]; + if cgroup_mount.is_some() { + cgroups.push(Cgroup::V1); + } + + if cgroup2_mount.is_some() { + cgroups.push(Cgroup::V2); + } + + Ok(cgroups) +} + +pub fn create_cgroup_manager>( + cgroup_path: P, + systemd_cgroup: bool, +) -> Result> { + let cgroup_mount = Process::myself()? + .mountinfo()? + .into_iter() + .find(|m| m.fs_type == "cgroup"); + + let cgroup2_mount = Process::myself()? + .mountinfo()? 
+ .into_iter() + .find(|m| m.fs_type == "cgroup2"); + + match (cgroup_mount, cgroup2_mount) { + (Some(_), None) => { + log::info!("cgroup manager V1 will be used"); + Ok(Box::new(v1::manager::Manager::new(cgroup_path.into())?)) + } + (None, Some(cgroup2)) => { + log::info!("cgroup manager V2 will be used"); + if systemd_cgroup { + if !booted()? { + bail!("systemd cgroup flag passed, but systemd support for managing cgroups is not available"); + } + log::info!("systemd cgroup manager will be used"); + return Ok(Box::new(v2::SystemDCGroupManager::new( + cgroup2.mount_point, + cgroup_path.into(), + )?)); + } + Ok(Box::new(v2::manager::Manager::new( + cgroup2.mount_point, + cgroup_path.into(), + )?)) + } + (Some(_), Some(cgroup2)) => { + let cgroup_override = env::var("YOUKI_PREFER_CGROUPV2"); + match cgroup_override { + Ok(v) if v == "true" => { + log::info!("cgroup manager V2 will be used"); + if systemd_cgroup { + if !booted()? { + bail!("systemd cgroup flag passed, but systemd support for managing cgroups is not available"); + } + log::info!("systemd cgroup manager will be used"); + return Ok(Box::new(v2::SystemDCGroupManager::new( + cgroup2.mount_point, + cgroup_path.into(), + )?)); + } + Ok(Box::new(v2::manager::Manager::new( + cgroup2.mount_point, + cgroup_path.into(), + )?)) + } + _ => { + log::info!("cgroup manager V1 will be used"); + Ok(Box::new(v1::manager::Manager::new(cgroup_path.into())?)) + } + } + } + _ => bail!("could not find cgroup filesystem"), + } +} diff --git a/src/cgroups/controller.rs b/src/cgroups/controller.rs deleted file mode 100644 index 84e0b3cc2b..0000000000 --- a/src/cgroups/controller.rs +++ /dev/null @@ -1,10 +0,0 @@ -use std::path::Path; - -use anyhow::Result; -use nix::unistd::Pid; - -use oci_spec::LinuxResources; - -pub trait Controller { - fn apply(linux_resources: &LinuxResources, cgroup_root: &Path, pid: Pid) -> Result<()>; -} diff --git a/src/cgroups/devices.rs b/src/cgroups/devices.rs deleted file mode 100644 index ff64afeda1..0000000000 --- a/src/cgroups/devices.rs +++ /dev/null @@ -1,137 +0,0 @@ -use std::io::Write; -use std::{ - fs::{create_dir_all, OpenOptions}, - path::Path, -}; - -use anyhow::Result; -use nix::unistd::Pid; - -use crate::{ - cgroups::Controller, - rootfs::default_devices, -}; -use oci_spec::{LinuxDeviceCgroup, LinuxDeviceType, LinuxResources}; - -pub struct Devices {} - -impl Controller for Devices { - fn apply(linux_resources: &LinuxResources, cgroup_root: &Path, pid: Pid) -> Result<()> { - log::debug!("Apply Devices cgroup config"); - create_dir_all(&cgroup_root)?; - - for d in &linux_resources.devices { - Self::apply_device(d, cgroup_root)?; - } - - for d in [ - default_devices().iter().map(|d| d.into()).collect(), - Self::default_allow_devices(), - ] - .concat() - { - Self::apply_device(&d, &cgroup_root)?; - } - - OpenOptions::new() - .create(false) - .write(true) - .truncate(false) - .open(cgroup_root.join("cgroup.procs"))? - .write_all(pid.to_string().as_bytes())?; - Ok(()) - } -} - -impl Devices { - fn apply_device(device: &LinuxDeviceCgroup, cgroup_root: &Path) -> Result<()> { - let path = if device.allow { - cgroup_root.join("devices.allow") - } else { - cgroup_root.join("devices.deny") - }; - - OpenOptions::new() - .create(false) - .write(true) - .truncate(false) - .open(path)? 
- .write_all(device.to_string().as_bytes())?; - Ok(()) - } - - fn default_allow_devices() -> Vec { - vec![ - LinuxDeviceCgroup { - allow: true, - typ: LinuxDeviceType::C, - major: None, - minor: None, - access: "m".to_string(), - }, - LinuxDeviceCgroup { - allow: true, - typ: LinuxDeviceType::B, - major: None, - minor: None, - access: "m".to_string(), - }, - // /dev/console - LinuxDeviceCgroup { - allow: true, - typ: LinuxDeviceType::C, - major: Some(5), - minor: Some(1), - access: "rwm".to_string(), - }, - // /dev/pts - LinuxDeviceCgroup { - allow: true, - typ: LinuxDeviceType::C, - major: Some(136), - minor: None, - access: "rwm".to_string(), - }, - LinuxDeviceCgroup { - allow: true, - typ: LinuxDeviceType::C, - major: Some(5), - minor: Some(2), - access: "rwm".to_string(), - }, - // tun/tap - LinuxDeviceCgroup { - allow: true, - typ: LinuxDeviceType::C, - major: Some(10), - minor: Some(200), - access: "rwm".to_string(), - }, - ] - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_linux_device_cgroup_to_string() { - let ldc = LinuxDeviceCgroup { - allow: true, - typ: LinuxDeviceType::A, - major: None, - minor: None, - access: "rwm".into(), - }; - assert_eq!(ldc.to_string(), "a *:* rwm"); - let ldc = LinuxDeviceCgroup { - allow: true, - typ: LinuxDeviceType::A, - major: Some(1), - minor: Some(9), - access: "rwm".into(), - }; - assert_eq!(ldc.to_string(), "a 1:9 rwm"); - } -} diff --git a/src/cgroups/hugetlb.rs b/src/cgroups/hugetlb.rs deleted file mode 100644 index 3e74cd216e..0000000000 --- a/src/cgroups/hugetlb.rs +++ /dev/null @@ -1,132 +0,0 @@ -use std::{ - fs::{self, OpenOptions}, - io::Write, - path::Path, -}; - -use anyhow::anyhow; -use regex::Regex; - -use crate::{ - cgroups::Controller, -}; -use oci_spec::{LinuxHugepageLimit, LinuxResources}; - -pub struct Hugetlb {} - -impl Controller for Hugetlb { - fn apply( - linux_resources: &LinuxResources, - cgroup_root: &std::path::Path, - pid: nix::unistd::Pid, - ) -> anyhow::Result<()> { - log::debug!("Apply Hugetlb cgroup config"); - fs::create_dir_all(cgroup_root)?; - - for hugetlb in &linux_resources.hugepage_limits { - Self::apply(cgroup_root, hugetlb)? - } - - OpenOptions::new() - .create(false) - .write(true) - .truncate(false) - .open(cgroup_root.join("cgroup.procs"))? - .write_all(pid.to_string().as_bytes())?; - Ok(()) - } -} - -impl Hugetlb { - fn apply(root_path: &Path, hugetlb: &LinuxHugepageLimit) -> anyhow::Result<()> { - let re = Regex::new(r"(?P[0-9]+)[KMG]B")?; - let caps = re.captures(&hugetlb.page_size); - match caps { - None => return Err(anyhow!("page size must be in the format [0-9]+[KMG]B")), - Some(caps) => { - let page_size: u64 = caps["pagesize"].parse()?; - if !Self::is_power_of_two(page_size) { - return Err(anyhow!("page size must be in the format of 2^(integer)")); - } - } - } - - Self::write_file( - &root_path.join(format!("hugetlb.{}.limit_in_bytes", hugetlb.page_size)), - &hugetlb.limit.to_string(), - )?; - Ok(()) - } - - fn write_file(file_path: &Path, data: &str) -> anyhow::Result<()> { - fs::OpenOptions::new() - .create(false) - .write(true) - .truncate(true) - .open(file_path)? 
- .write_all(data.as_bytes())?; - - Ok(()) - } - - fn is_power_of_two(number: u64) -> bool { - (number != 0) && (number & (number - 1)) == 0 - } -} - -#[cfg(test)] -mod tests { - use std::path::PathBuf; - - use super::*; - use oci_spec::LinuxHugepageLimit; - - fn set_fixture(temp_dir: &std::path::Path, filename: &str, val: &str) -> anyhow::Result<()> { - std::fs::OpenOptions::new() - .create(true) - .write(true) - .truncate(true) - .open(temp_dir.join(filename))? - .write_all(val.as_bytes())?; - - Ok(()) - } - - fn create_temp_dir(test_name: &str) -> anyhow::Result { - std::fs::create_dir_all(std::env::temp_dir().join(test_name))?; - Ok(std::env::temp_dir().join(test_name)) - } - - #[test] - fn test_set_hugetlb() { - let page_file_name = "hugetlb.2MB.limit_in_bytes"; - let tmp = create_temp_dir("test_set_hugetlb").expect("create temp directory for test"); - set_fixture(&tmp, page_file_name, "0").expect("Set fixture for 2 MB page size"); - - let hugetlb = LinuxHugepageLimit { - page_size: "2MB".to_owned(), - limit: 16384, - }; - Hugetlb::apply(&tmp, &hugetlb).expect("apply hugetlb"); - let content = - std::fs::read_to_string(tmp.join(page_file_name)).expect("Read hugetlb file content"); - assert_eq!(hugetlb.limit.to_string(), content); - } - - #[test] - fn test_set_hugetlb_with_invalid_page_size() { - let tmp = create_temp_dir("test_set_hugetlb_with_invalid_page_size") - .expect("create temp directory for test"); - - let hugetlb = LinuxHugepageLimit { - page_size: "3MB".to_owned(), - limit: 16384, - }; - - let result = Hugetlb::apply(&tmp, &hugetlb); - assert!( - result.is_err(), - "page size that is not a power of two should be an error" - ); - } -} diff --git a/src/cgroups/manager.rs b/src/cgroups/manager.rs deleted file mode 100644 index 8a74ef117e..0000000000 --- a/src/cgroups/manager.rs +++ /dev/null @@ -1,106 +0,0 @@ -use std::{collections::HashMap, path::PathBuf}; -use std::{fs::remove_dir, path::Path}; - -use anyhow::Result; -use nix::unistd::Pid; -use procfs::process::Process; - -use crate::{cgroups::ControllerType, utils::PathBufExt}; -use oci_spec::LinuxResources; -use super::{ - blkio::Blkio, devices::Devices, hugetlb::Hugetlb, memory::Memory, - network_classifier::NetworkClassifier, network_priority::NetworkPriority, pids::Pids, - Controller, -}; - -const CONTROLLERS: &[ControllerType] = &[ - ControllerType::Devices, - ControllerType::HugeTlb, - ControllerType::Memory, - ControllerType::Pids, - ControllerType::Blkio, - ControllerType::NetworkPriority, - ControllerType::NetworkClassifier, -]; - -pub struct Manager { - subsystems: HashMap, -} - -impl Manager { - pub fn new(cgroup_path: PathBuf) -> Result { - let mut subsystems = HashMap::::new(); - for subsystem in CONTROLLERS.iter().map(|c| c.to_string()) { - subsystems.insert( - subsystem.to_owned(), - Self::get_subsystem_path(&cgroup_path, &subsystem)?, - ); - } - - Ok(Manager { subsystems }) - } - - pub fn apply(&self, linux_resources: &LinuxResources, pid: Pid) -> Result<()> { - for subsys in &self.subsystems { - match subsys.0.as_str() { - "devices" => Devices::apply(linux_resources, &subsys.1, pid)?, - "hugetlb" => Hugetlb::apply(linux_resources, &subsys.1, pid)?, - "memory" => Memory::apply(linux_resources, &subsys.1, pid)?, - "pids" => Pids::apply(linux_resources, &subsys.1, pid)?, - "blkio" => Blkio::apply(linux_resources, &subsys.1, pid)?, - "net_prio" => NetworkPriority::apply(linux_resources, &subsys.1, pid)?, - "net_cls" => NetworkClassifier::apply(linux_resources, &subsys.1, pid)?, - _ => continue, - } - } - - 
Ok(()) - } - - pub fn remove(&self) -> Result<()> { - for cgroup_path in &self.subsystems { - if cgroup_path.1.exists() { - log::debug!("remove cgroup {:?}", cgroup_path.1); - remove_dir(&cgroup_path.1)?; - } - } - - Ok(()) - } - - fn get_subsystem_path(cgroup_path: &Path, subsystem: &str) -> anyhow::Result { - log::debug!("Get path for subsystem: {}", subsystem); - let mount = Process::myself()? - .mountinfo()? - .into_iter() - .find(|m| { - if m.fs_type == "cgroup" { - // Some systems mount net_prio and net_cls in the same directory - // other systems mount them in their own diretories. This - // should handle both cases. - if subsystem == "net_cls" || subsystem == "net_prio" { - return m.mount_point.ends_with("net_cls,net_prio") - || m.mount_point.ends_with("net_prio,net_cls"); - } - } - m.mount_point.ends_with(subsystem) - }) - .unwrap(); - - let cgroup = Process::myself()? - .cgroups()? - .into_iter() - .find(|c| c.controllers.contains(&subsystem.to_owned())) - .unwrap(); - - let p = if cgroup_path.to_string_lossy().into_owned().is_empty() { - mount - .mount_point - .join_absolute_path(Path::new(&cgroup.pathname))? - } else { - mount.mount_point.join_absolute_path(&cgroup_path)? - }; - - Ok(p) - } -} diff --git a/src/cgroups/mod.rs b/src/cgroups/mod.rs index 9eb044b8a4..f81d697af5 100644 --- a/src/cgroups/mod.rs +++ b/src/cgroups/mod.rs @@ -1,13 +1,8 @@ -mod controller; -mod controller_type; -mod devices; -mod hugetlb; -mod blkio; -mod manager; -mod memory; -mod network_classifier; -mod network_priority; -mod pids; -pub use controller::Controller; -pub use controller_type::ControllerType; -pub use manager::Manager; +//! Control groups provide a way of controlling groups of processes. +//! Examples: controlling resource limits, execution priority, measuring resource usage, +//! freezing, checkpointing and restarting groups of processes. + +pub mod common; +mod test; +pub mod v1; +pub mod v2; diff --git a/src/cgroups/network_classifier.rs b/src/cgroups/network_classifier.rs deleted file mode 100644 index c6df7448d6..0000000000 --- a/src/cgroups/network_classifier.rs +++ /dev/null @@ -1,98 +0,0 @@ -use std::io::Write; -use std::{ - fs::{create_dir_all, OpenOptions}, - path::Path, -}; - -use anyhow::Result; -use nix::unistd::Pid; - -use crate::{ - cgroups::Controller, -}; -use oci_spec::{LinuxNetwork, LinuxResources}; - -pub struct NetworkClassifier {} - -impl Controller for NetworkClassifier { - fn apply(linux_resources: &LinuxResources, cgroup_root: &Path, pid: Pid) -> Result<()> { - log::debug!("Apply NetworkClassifier cgroup config"); - create_dir_all(&cgroup_root)?; - - if let Some(network) = linux_resources.network.as_ref() { - Self::apply(cgroup_root, network)?; - - OpenOptions::new() - .create(false) - .write(true) - .truncate(true) - .open(cgroup_root.join("cgroup.procs"))? - .write_all(pid.to_string().as_bytes())?; - } - - Ok(()) - } -} - -impl NetworkClassifier { - fn apply(root_path: &Path, network: &LinuxNetwork) -> Result<()> { - if let Some(class_id) = network.class_id { - Self::write_file(&root_path.join("net_cls.classid"), &class_id.to_string())?; - } - - Ok(()) - } - - fn write_file(file_path: &Path, data: &str) -> Result<()> { - OpenOptions::new() - .create(false) - .write(true) - .truncate(true) - .open(file_path)? 
- .write_all(data.as_bytes())?; - - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use std::path::PathBuf; - - use super::*; - - fn set_fixture(temp_dir: &std::path::Path, filename: &str, val: &str) -> Result<()> { - std::fs::OpenOptions::new() - .create(true) - .write(true) - .truncate(true) - .open(temp_dir.join(filename))? - .write_all(val.as_bytes())?; - - Ok(()) - } - - fn create_temp_dir(test_name: &str) -> Result { - std::fs::create_dir_all(std::env::temp_dir().join(test_name))?; - Ok(std::env::temp_dir().join(test_name)) - } - - #[test] - fn test_apply_network_classifier() { - let tmp = create_temp_dir("test_apply_network_classifier") - .expect("create temp directory for test"); - set_fixture(&tmp, "net_cls.classid", "0").expect("set fixture for classID"); - - let id = 0x100001; - let network = LinuxNetwork { - class_id: Some(id), - priorities: vec![], - }; - - NetworkClassifier::apply(&tmp, &network).expect("apply network classID"); - - let content = - std::fs::read_to_string(tmp.join("net_cls.classid")).expect("Read classID contents"); - assert_eq!(id.to_string(), content); - } -} diff --git a/src/cgroups/test.rs b/src/cgroups/test.rs new file mode 100644 index 0000000000..57ad71ef83 --- /dev/null +++ b/src/cgroups/test.rs @@ -0,0 +1,91 @@ +#![cfg(test)] + +use anyhow::Result; +use std::{ + io::Write, + path::{Path, PathBuf}, +}; + +use oci_spec::LinuxCpu; + +use crate::utils::{create_temp_dir, TempDir}; + +pub fn setup(testname: &str, cgroup_file: &str) -> (TempDir, PathBuf) { + let tmp = create_temp_dir(testname).expect("create temp directory for test"); + let cgroup_file = set_fixture(&tmp, cgroup_file, "") + .unwrap_or_else(|_| panic!("set test fixture for {}", cgroup_file)); + + (tmp, cgroup_file) +} + +pub fn set_fixture(temp_dir: &Path, filename: &str, val: &str) -> Result { + let full_path = temp_dir.join(filename); + + std::fs::OpenOptions::new() + .create(true) + .write(true) + .truncate(true) + .open(&full_path)? 
+ .write_all(val.as_bytes())?; + + Ok(full_path) +} + +pub struct LinuxCpuBuilder { + resource: LinuxCpu, +} + +impl LinuxCpuBuilder { + pub fn new() -> Self { + Self { + resource: LinuxCpu { + shares: None, + quota: None, + period: None, + realtime_runtime: None, + realtime_period: None, + cpus: None, + mems: None, + }, + } + } + + pub fn with_shares(mut self, shares: u64) -> Self { + self.resource.shares = Some(shares); + self + } + + pub fn with_quota(mut self, quota: i64) -> Self { + self.resource.quota = Some(quota); + self + } + + pub fn with_period(mut self, period: u64) -> Self { + self.resource.period = Some(period); + self + } + + pub fn with_realtime_runtime(mut self, runtime: i64) -> Self { + self.resource.realtime_runtime = Some(runtime); + self + } + + pub fn with_realtime_period(mut self, period: u64) -> Self { + self.resource.realtime_period = Some(period); + self + } + + pub fn with_cpus(mut self, cpus: String) -> Self { + self.resource.cpus = Some(cpus); + self + } + + pub fn with_mems(mut self, mems: String) -> Self { + self.resource.mems = Some(mems); + self + } + + pub fn build(self) -> LinuxCpu { + self.resource + } +} diff --git a/src/cgroups/blkio.rs b/src/cgroups/v1/blkio.rs similarity index 63% rename from src/cgroups/blkio.rs rename to src/cgroups/v1/blkio.rs index feb319ab32..3480084c11 100644 --- a/src/cgroups/blkio.rs +++ b/src/cgroups/v1/blkio.rs @@ -1,12 +1,7 @@ -use std::{ - fs::{self, OpenOptions}, - io::Write, - path::Path, -}; - -use crate::{ - cgroups::Controller, -}; +use std::path::Path; + +use crate::cgroups::{common, v1::Controller}; +use anyhow::Result; use oci_spec::{LinuxBlockIo, LinuxResources}; const CGROUP_BLKIO_THROTTLE_READ_BPS: &str = "blkio.throttle.read_bps_device"; @@ -17,55 +12,52 @@ const CGROUP_BLKIO_THROTTLE_WRITE_IOPS: &str = "blkio.throttle.write_iops_device pub struct Blkio {} impl Controller for Blkio { - fn apply( - linux_resources: &LinuxResources, - cgroup_root: &Path, - pid: nix::unistd::Pid, - ) -> anyhow::Result<()> { - match &linux_resources.block_io { - None => return Ok(()), - Some(block_io) => { - fs::create_dir_all(cgroup_root)?; - Self::apply(cgroup_root, block_io)?; - } - } + type Resource = LinuxBlockIo; - OpenOptions::new() - .create(false) - .write(true) - .truncate(false) - .open(cgroup_root.join("cgroup.procs"))? 
- .write_all(pid.to_string().as_bytes())?; + fn apply(linux_resources: &LinuxResources, cgroup_root: &Path) -> Result<()> { + log::debug!("Apply blkio cgroup config"); + + if let Some(blkio) = Self::needs_to_handle(linux_resources) { + Self::apply(cgroup_root, blkio)?; + } Ok(()) } + + fn needs_to_handle(linux_resources: &LinuxResources) -> Option<&Self::Resource> { + if let Some(blkio) = &linux_resources.block_io { + return Some(blkio); + } + + None + } } impl Blkio { - fn apply(root_path: &Path, blkio: &LinuxBlockIo) -> anyhow::Result<()> { + fn apply(root_path: &Path, blkio: &LinuxBlockIo) -> Result<()> { for trbd in &blkio.blkio_throttle_read_bps_device { - Self::write_file( + common::write_cgroup_file_str( &root_path.join(CGROUP_BLKIO_THROTTLE_READ_BPS), &format!("{}:{} {}", trbd.major, trbd.minor, trbd.rate), )?; } for twbd in &blkio.blkio_throttle_write_bps_device { - Self::write_file( + common::write_cgroup_file_str( &root_path.join(CGROUP_BLKIO_THROTTLE_WRITE_BPS), &format!("{}:{} {}", twbd.major, twbd.minor, twbd.rate), )?; } for trid in &blkio.blkio_throttle_read_iops_device { - Self::write_file( + common::write_cgroup_file_str( &root_path.join(CGROUP_BLKIO_THROTTLE_READ_IOPS), &format!("{}:{} {}", trid.major, trid.minor, trid.rate), )?; } for twid in &blkio.blkio_throttle_write_iops_device { - Self::write_file( + common::write_cgroup_file_str( &root_path.join(CGROUP_BLKIO_THROTTLE_WRITE_IOPS), &format!("{}:{} {}", twid.major, twid.minor, twid.rate), )?; @@ -73,24 +65,14 @@ impl Blkio { Ok(()) } - - fn write_file(file_path: &Path, data: &str) -> anyhow::Result<()> { - fs::OpenOptions::new() - .create(false) - .write(true) - .truncate(false) - .open(file_path)? - .write_all(data.as_bytes())?; - - Ok(()) - } } #[cfg(test)] mod tests { - use std::path::PathBuf; + use std::fs; use super::*; + use crate::cgroups::test::setup; use oci_spec::{LinuxBlockIo, LinuxThrottleDevice}; struct BlockIoBuilder { @@ -137,40 +119,9 @@ mod tests { } } - fn setup(testname: &str, throttle_type: &str) -> (PathBuf, PathBuf) { - let tmp = create_temp_dir(testname).expect("create temp directory for test"); - let throttle_file = set_fixture(&tmp, throttle_type, "") - .unwrap_or_else(|_| panic!("set fixture for {}", throttle_type)); - - (tmp, throttle_file) - } - - fn set_fixture( - temp_dir: &std::path::Path, - filename: &str, - val: &str, - ) -> anyhow::Result { - let full_path = temp_dir.join(filename); - - std::fs::OpenOptions::new() - .create(true) - .write(true) - .truncate(true) - .open(&full_path)? 
- .write_all(val.as_bytes())?; - - Ok(full_path) - } - - fn create_temp_dir(test_name: &str) -> anyhow::Result { - std::fs::create_dir_all(std::env::temp_dir().join(test_name))?; - Ok(std::env::temp_dir().join(test_name)) - } - #[test] fn test_set_blkio_read_bps() { - let (test_root, throttle) = - setup("test_set_blkio_read_bps", CGROUP_BLKIO_THROTTLE_READ_BPS); + let (tmp, throttle) = setup("test_set_blkio_read_bps", CGROUP_BLKIO_THROTTLE_READ_BPS); let blkio = BlockIoBuilder::new() .with_read_bps(vec![LinuxThrottleDevice { @@ -180,7 +131,7 @@ mod tests { }]) .build(); - Blkio::apply(&test_root, &blkio).expect("apply blkio"); + Blkio::apply(&tmp, &blkio).expect("apply blkio"); let content = fs::read_to_string(throttle) .unwrap_or_else(|_| panic!("read {} content", CGROUP_BLKIO_THROTTLE_READ_BPS)); @@ -189,8 +140,7 @@ mod tests { #[test] fn test_set_blkio_write_bps() { - let (test_root, throttle) = - setup("test_set_blkio_write_bps", CGROUP_BLKIO_THROTTLE_WRITE_BPS); + let (tmp, throttle) = setup("test_set_blkio_write_bps", CGROUP_BLKIO_THROTTLE_WRITE_BPS); let blkio = BlockIoBuilder::new() .with_write_bps(vec![LinuxThrottleDevice { @@ -200,7 +150,7 @@ mod tests { }]) .build(); - Blkio::apply(&test_root, &blkio).expect("apply blkio"); + Blkio::apply(&tmp, &blkio).expect("apply blkio"); let content = fs::read_to_string(throttle) .unwrap_or_else(|_| panic!("read {} content", CGROUP_BLKIO_THROTTLE_WRITE_BPS)); @@ -209,8 +159,7 @@ mod tests { #[test] fn test_set_blkio_read_iops() { - let (test_root, throttle) = - setup("test_set_blkio_read_iops", CGROUP_BLKIO_THROTTLE_READ_IOPS); + let (tmp, throttle) = setup("test_set_blkio_read_iops", CGROUP_BLKIO_THROTTLE_READ_IOPS); let blkio = BlockIoBuilder::new() .with_read_iops(vec![LinuxThrottleDevice { @@ -220,7 +169,7 @@ mod tests { }]) .build(); - Blkio::apply(&test_root, &blkio).expect("apply blkio"); + Blkio::apply(&tmp, &blkio).expect("apply blkio"); let content = fs::read_to_string(throttle) .unwrap_or_else(|_| panic!("read {} content", CGROUP_BLKIO_THROTTLE_READ_IOPS)); @@ -229,7 +178,7 @@ mod tests { #[test] fn test_set_blkio_write_iops() { - let (test_root, throttle) = setup( + let (tmp, throttle) = setup( "test_set_blkio_write_iops", CGROUP_BLKIO_THROTTLE_WRITE_IOPS, ); @@ -242,7 +191,7 @@ mod tests { }]) .build(); - Blkio::apply(&test_root, &blkio).expect("apply blkio"); + Blkio::apply(&tmp, &blkio).expect("apply blkio"); let content = fs::read_to_string(throttle) .unwrap_or_else(|_| panic!("read {} content", CGROUP_BLKIO_THROTTLE_WRITE_IOPS)); diff --git a/src/cgroups/v1/controller.rs b/src/cgroups/v1/controller.rs new file mode 100644 index 0000000000..9aaa8fcae4 --- /dev/null +++ b/src/cgroups/v1/controller.rs @@ -0,0 +1,25 @@ +use std::{fs, path::Path}; + +use anyhow::Result; +use nix::unistd::Pid; + +use oci_spec::LinuxResources; + +use crate::cgroups::common::{self, CGROUP_PROCS}; + +pub trait Controller { + type Resource; + + /// Adds a new task specified by its pid to the cgroup + fn add_task(pid: Pid, cgroup_path: &Path) -> Result<()> { + fs::create_dir_all(cgroup_path)?; + common::write_cgroup_file(cgroup_path.join(CGROUP_PROCS), pid)?; + Ok(()) + } + + /// Applies resource restrictions to the cgroup + fn apply(linux_resources: &LinuxResources, cgroup_root: &Path) -> Result<()>; + + /// Checks if the controller needs to handle this request + fn needs_to_handle(linux_resources: &LinuxResources) -> Option<&Self::Resource>; +} diff --git a/src/cgroups/v1/controller_type.rs b/src/cgroups/v1/controller_type.rs new file mode 100644 
index 0000000000..6fbf0f37f7 --- /dev/null +++ b/src/cgroups/v1/controller_type.rs @@ -0,0 +1,50 @@ +use std::fmt::Display; + +#[derive(Hash, PartialEq, Eq, Debug, Clone)] +pub enum ControllerType { + Cpu, + CpuAcct, + CpuSet, + Devices, + HugeTlb, + Pids, + Memory, + Blkio, + NetworkPriority, + NetworkClassifier, + Freezer, +} + +impl Display for ControllerType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let print = match *self { + Self::Cpu => "cpu", + Self::CpuAcct => "cpuacct", + Self::CpuSet => "cpuset", + Self::Devices => "devices", + Self::HugeTlb => "hugetlb", + Self::Pids => "pids", + Self::Memory => "memory", + Self::Blkio => "blkio", + Self::NetworkPriority => "net_prio", + Self::NetworkClassifier => "net_cls", + Self::Freezer => "freezer", + }; + + write!(f, "{}", print) + } +} + +pub const CONTROLLERS: &[ControllerType] = &[ + ControllerType::Cpu, + ControllerType::CpuAcct, + ControllerType::CpuSet, + ControllerType::Devices, + ControllerType::HugeTlb, + ControllerType::Memory, + ControllerType::Pids, + ControllerType::Blkio, + ControllerType::NetworkPriority, + ControllerType::NetworkClassifier, + ControllerType::Freezer, +]; diff --git a/src/cgroups/v1/cpu.rs b/src/cgroups/v1/cpu.rs new file mode 100644 index 0000000000..006bc09cf5 --- /dev/null +++ b/src/cgroups/v1/cpu.rs @@ -0,0 +1,171 @@ +use std::path::Path; + +use anyhow::Result; +use oci_spec::{LinuxCpu, LinuxResources}; + +use crate::cgroups::common; + +use super::Controller; + +const CGROUP_CPU_SHARES: &str = "cpu.shares"; +const CGROUP_CPU_QUOTA: &str = "cpu.cfs_quota_us"; +const CGROUP_CPU_PERIOD: &str = "cpu.cfs_period_us"; +const CGROUP_CPU_RT_RUNTIME: &str = "cpu.rt_runtime_us"; +const CGROUP_CPU_RT_PERIOD: &str = "cpu.rt_period_us"; + +pub struct Cpu {} + +impl Controller for Cpu { + type Resource = LinuxCpu; + + fn apply(linux_resources: &LinuxResources, cgroup_root: &Path) -> Result<()> { + log::debug!("Apply Cpu cgroup config"); + + if let Some(cpu) = Self::needs_to_handle(linux_resources) { + Self::apply(cgroup_root, cpu)?; + } + + Ok(()) + } + + fn needs_to_handle(linux_resources: &LinuxResources) -> Option<&Self::Resource> { + if let Some(cpu) = &linux_resources.cpu { + if cpu.shares.is_some() + || cpu.period.is_some() + || cpu.quota.is_some() + || cpu.realtime_period.is_some() + || cpu.realtime_runtime.is_some() + { + return Some(cpu); + } + } + + None + } +} + +impl Cpu { + fn apply(root_path: &Path, cpu: &LinuxCpu) -> Result<()> { + if let Some(cpu_shares) = cpu.shares { + if cpu_shares != 0 { + common::write_cgroup_file(root_path.join(CGROUP_CPU_SHARES), cpu_shares)?; + } + } + + if let Some(cpu_period) = cpu.period { + if cpu_period != 0 { + common::write_cgroup_file(root_path.join(CGROUP_CPU_PERIOD), cpu_period)?; + } + } + + if let Some(cpu_quota) = cpu.quota { + if cpu_quota != 0 { + common::write_cgroup_file(root_path.join(CGROUP_CPU_QUOTA), cpu_quota)?; + } + } + + if let Some(rt_runtime) = cpu.realtime_runtime { + if rt_runtime != 0 { + common::write_cgroup_file(root_path.join(CGROUP_CPU_RT_RUNTIME), rt_runtime)?; + } + } + + if let Some(rt_period) = cpu.realtime_period { + if rt_period != 0 { + common::write_cgroup_file(root_path.join(CGROUP_CPU_RT_PERIOD), rt_period)?; + } + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::cgroups::test::{set_fixture, setup, LinuxCpuBuilder}; + use std::fs; + + #[test] + fn test_set_shares() { + // arrange + let (tmp, shares) = setup("test_set_shares", CGROUP_CPU_SHARES); + let _ = set_fixture(&tmp, 
CGROUP_CPU_SHARES, "") + .unwrap_or_else(|_| panic!("set test fixture for {}", CGROUP_CPU_SHARES)); + let cpu = LinuxCpuBuilder::new().with_shares(2048).build(); + + // act + Cpu::apply(&tmp, &cpu).expect("apply cpu"); + + // assert + let content = fs::read_to_string(shares) + .unwrap_or_else(|_| panic!("read {} file content", CGROUP_CPU_SHARES)); + assert_eq!(content, 2048.to_string()); + } + + #[test] + fn test_set_quota() { + // arrange + const QUOTA: i64 = 200000; + let (tmp, max) = setup("test_set_quota", CGROUP_CPU_QUOTA); + let cpu = LinuxCpuBuilder::new().with_quota(QUOTA).build(); + + // act + Cpu::apply(&tmp, &cpu).expect("apply cpu"); + + // assert + let content = fs::read_to_string(max) + .unwrap_or_else(|_| panic!("read {} file content", CGROUP_CPU_QUOTA)); + assert_eq!(content, QUOTA.to_string()); + } + + #[test] + fn test_set_period() { + // arrange + const PERIOD: u64 = 100000; + let (tmp, max) = setup("test_set_period", CGROUP_CPU_PERIOD); + let cpu = LinuxCpuBuilder::new().with_period(PERIOD).build(); + + // act + Cpu::apply(&tmp, &cpu).expect("apply cpu"); + + // assert + let content = fs::read_to_string(max) + .unwrap_or_else(|_| panic!("read {} file content", CGROUP_CPU_PERIOD)); + assert_eq!(content, PERIOD.to_string()); + } + + #[test] + fn test_set_rt_runtime() { + // arrange + const RUNTIME: i64 = 100000; + let (tmp, max) = setup("test_set_rt_runtime", CGROUP_CPU_RT_RUNTIME); + let cpu = LinuxCpuBuilder::new() + .with_realtime_runtime(RUNTIME) + .build(); + + // act + Cpu::apply(&tmp, &cpu).expect("apply cpu"); + + // assert + let content = fs::read_to_string(max) + .unwrap_or_else(|_| panic!("read {} file content", CGROUP_CPU_RT_RUNTIME)); + assert_eq!(content, RUNTIME.to_string()); + } + + #[test] + fn test_set_rt_period() { + // arrange + const PERIOD: u64 = 100000; + let (tmp, max) = setup("test_set_rt_period", CGROUP_CPU_RT_PERIOD); + let cpu = LinuxCpuBuilder::new().with_realtime_period(PERIOD).build(); + + // act + Cpu::apply(&tmp, &cpu).expect("apply cpu"); + + // assert + let content = fs::read_to_string(max) + .unwrap_or_else(|_| panic!("read {} file content", CGROUP_CPU_RT_PERIOD)); + assert_eq!(content, PERIOD.to_string()); + } +} diff --git a/src/cgroups/v1/cpuacct.rs b/src/cgroups/v1/cpuacct.rs new file mode 100644 index 0000000000..2632847e24 --- /dev/null +++ b/src/cgroups/v1/cpuacct.rs @@ -0,0 +1,43 @@ +use std::path::Path; + +use anyhow::Result; +use oci_spec::LinuxResources; + +use super::Controller; + +pub struct CpuAcct {} + +impl Controller for CpuAcct { + type Resource = (); + + fn apply(_linux_resources: &LinuxResources, _cgroup_path: &Path) -> Result<()> { + Ok(()) + } + + // apply never needs to be called, for accounting only + fn needs_to_handle(_linux_resources: &LinuxResources) -> Option<&Self::Resource> { + None + } +} + +#[cfg(test)] +mod tests { + use std::fs; + + use nix::unistd::Pid; + + use super::*; + use crate::cgroups::{common::CGROUP_PROCS, test::setup}; + + #[test] + fn test_add_task() { + let (tmp, procs) = setup("test_cpuacct_apply", CGROUP_PROCS); + let pid = Pid::from_raw(1000); + + CpuAcct::add_task(pid, &tmp).expect("apply cpuacct"); + + let content = fs::read_to_string(&procs) + .unwrap_or_else(|_| panic!("read {} file content", CGROUP_PROCS)); + assert_eq!(content, "1000"); + } +} diff --git a/src/cgroups/v1/cpuset.rs b/src/cgroups/v1/cpuset.rs new file mode 100644 index 0000000000..b7d6693115 --- /dev/null +++ b/src/cgroups/v1/cpuset.rs @@ -0,0 +1,126 @@ +use std::{fs, path::Path}; + +use anyhow::{bail, Result}; +use 
nix::unistd; +use oci_spec::{LinuxCpu, LinuxResources}; +use unistd::Pid; + +use crate::cgroups::common::{self, CGROUP_PROCS}; + +use super::{util, Controller, ControllerType}; + +const CGROUP_CPUSET_CPUS: &str = "cpuset.cpus"; +const CGROUP_CPUSET_MEMS: &str = "cpuset.mems"; + +pub struct CpuSet {} + +impl Controller for CpuSet { + type Resource = LinuxCpu; + + fn add_task(pid: Pid, cgroup_path: &Path) -> Result<()> { + fs::create_dir_all(cgroup_path)?; + + Self::ensure_not_empty(cgroup_path, CGROUP_CPUSET_CPUS)?; + Self::ensure_not_empty(cgroup_path, CGROUP_CPUSET_MEMS)?; + + common::write_cgroup_file(cgroup_path.join(CGROUP_PROCS), pid)?; + Ok(()) + } + + fn apply(linux_resources: &LinuxResources, cgroup_path: &Path) -> Result<()> { + log::debug!("Apply CpuSet cgroup config"); + + if let Some(cpuset) = Self::needs_to_handle(linux_resources) { + Self::apply(cgroup_path, cpuset)?; + } + + Ok(()) + } + + fn needs_to_handle(linux_resources: &LinuxResources) -> Option<&Self::Resource> { + if let Some(cpuset) = &linux_resources.cpu { + if cpuset.cpus.is_some() || cpuset.mems.is_some() { + return Some(cpuset); + } + } + + None + } +} + +impl CpuSet { + fn apply(cgroup_path: &Path, cpuset: &LinuxCpu) -> Result<()> { + if let Some(cpus) = &cpuset.cpus { + common::write_cgroup_file_str(cgroup_path.join(CGROUP_CPUSET_CPUS), cpus)?; + } + + if let Some(mems) = &cpuset.mems { + common::write_cgroup_file_str(cgroup_path.join(CGROUP_CPUSET_MEMS), mems)?; + } + + Ok(()) + } + + // if a task is moved into the cgroup and a value has not been set for cpus and mems + // Errno 28 (no space left on device) will be returned. Therefore we set the value from the parent if required. + fn ensure_not_empty(cgroup_path: &Path, interface_file: &str) -> Result<()> { + let mut current = util::get_subsystem_mount_point(&ControllerType::CpuSet)?; + let relative_cgroup_path = cgroup_path.strip_prefix(&current)?; + + for component in relative_cgroup_path.components() { + let parent_value = fs::read_to_string(current.join(interface_file))?; + if parent_value.trim().is_empty() { + bail!("cpuset parent value is empty") + } + + current.push(component); + let child_path = current.join(interface_file); + let child_value = fs::read_to_string(&child_path)?; + // the file can contain a newline character.
Need to trim it away, + // otherwise it is not considered empty and value will not be written + if child_value.trim().is_empty() { + common::write_cgroup_file_str(&child_path, &parent_value)?; + } + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use std::fs; + + use super::*; + use crate::cgroups::test::{setup, LinuxCpuBuilder}; + + #[test] + fn test_set_cpus() { + // arrange + let (tmp, cpus) = setup("test_set_cpus", CGROUP_CPUSET_CPUS); + let cpuset = LinuxCpuBuilder::new().with_cpus("1-3".to_owned()).build(); + + // act + CpuSet::apply(&tmp, &cpuset).expect("apply cpuset"); + + // assert + let content = fs::read_to_string(&cpus) + .unwrap_or_else(|_| panic!("read {} file content", CGROUP_CPUSET_CPUS)); + assert_eq!(content, "1-3"); + } + + #[test] + fn test_set_mems() { + // arrange + let (tmp, mems) = setup("test_set_mems", CGROUP_CPUSET_MEMS); + let cpuset = LinuxCpuBuilder::new().with_mems("1-3".to_owned()).build(); + + // act + CpuSet::apply(&tmp, &cpuset).expect("apply cpuset"); + + // assert + let content = fs::read_to_string(&mems) + .unwrap_or_else(|_| panic!("read {} file content", CGROUP_CPUSET_MEMS)); + assert_eq!(content, "1-3"); + } +} diff --git a/src/cgroups/v1/devices.rs b/src/cgroups/v1/devices.rs new file mode 100644 index 0000000000..3e5f12705c --- /dev/null +++ b/src/cgroups/v1/devices.rs @@ -0,0 +1,226 @@ +use std::path::Path; + +use anyhow::Result; + +use crate::cgroups::common; +use crate::{cgroups::v1::Controller, rootfs::default_devices}; +use oci_spec::{LinuxDeviceCgroup, LinuxDeviceType, LinuxResources}; + +pub struct Devices {} + +impl Controller for Devices { + type Resource = (); + + fn apply(linux_resources: &LinuxResources, cgroup_root: &Path) -> Result<()> { + log::debug!("Apply Devices cgroup config"); + + for d in &linux_resources.devices { + Self::apply_device(d, cgroup_root)?; + } + + for d in [ + default_devices().iter().map(|d| d.into()).collect(), + Self::default_allow_devices(), + ] + .concat() + { + Self::apply_device(&d, &cgroup_root)?; + } + + Ok(()) + } + + // always needs to be called due to default devices + fn needs_to_handle(_linux_resources: &LinuxResources) -> Option<&Self::Resource> { + Some(&()) + } +} + +impl Devices { + fn apply_device(device: &LinuxDeviceCgroup, cgroup_root: &Path) -> Result<()> { + let path = if device.allow { + cgroup_root.join("devices.allow") + } else { + cgroup_root.join("devices.deny") + }; + + common::write_cgroup_file_str(path, &device.to_string())?; + Ok(()) + } + + fn default_allow_devices() -> Vec { + vec![ + LinuxDeviceCgroup { + allow: true, + typ: LinuxDeviceType::C, + major: None, + minor: None, + access: "m".to_string(), + }, + LinuxDeviceCgroup { + allow: true, + typ: LinuxDeviceType::B, + major: None, + minor: None, + access: "m".to_string(), + }, + // /dev/console + LinuxDeviceCgroup { + allow: true, + typ: LinuxDeviceType::C, + major: Some(5), + minor: Some(1), + access: "rwm".to_string(), + }, + // /dev/pts + LinuxDeviceCgroup { + allow: true, + typ: LinuxDeviceType::C, + major: Some(136), + minor: None, + access: "rwm".to_string(), + }, + LinuxDeviceCgroup { + allow: true, + typ: LinuxDeviceType::C, + major: Some(5), + minor: Some(2), + access: "rwm".to_string(), + }, + // tun/tap + LinuxDeviceCgroup { + allow: true, + typ: LinuxDeviceType::C, + major: Some(10), + minor: Some(200), + access: "rwm".to_string(), + }, + ] + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::cgroups::test::set_fixture; + use crate::utils::create_temp_dir; + use oci_spec::{LinuxDeviceCgroup, 
LinuxDeviceType}; + use std::fs::read_to_string; + + #[test] + fn test_set_default_devices() { + let tmp = + create_temp_dir("test_set_default_devices").expect("create temp directory for test"); + + Devices::default_allow_devices().iter().for_each(|d| { + // NOTE: We reset the fixtures every iteration because files aren't appended + // so what happens in the tests is you get strange overwrites which can contain + // remaining bytes from the last iteration. Resetting the files more appropriately + // mocks the behavior of cgroup files. + set_fixture(&tmp, "devices.allow", "").expect("create allowed devices list"); + set_fixture(&tmp, "devices.deny", "").expect("create denied devices list"); + + Devices::apply_device(&d, &tmp).expect("Apply default device"); + println!("Device: {}", d.to_string()); + if d.allow { + let allowed_content = + read_to_string(tmp.join("devices.allow")).expect("read to string"); + assert_eq!(allowed_content, d.to_string()); + } else { + let denied_content = + read_to_string(tmp.join("devices.deny")).expect("read to string"); + assert_eq!(denied_content, d.to_string()); + } + }); + } + + #[test] + fn test_set_mock_devices() { + let tmp = create_temp_dir("test_set_mock_devices").expect("create temp directory for test"); + [ + LinuxDeviceCgroup { + allow: true, + typ: LinuxDeviceType::C, + major: Some(10), + minor: None, + access: "rwm".to_string(), + }, + LinuxDeviceCgroup { + allow: true, + typ: LinuxDeviceType::A, + major: None, + minor: Some(200), + access: "rwm".to_string(), + }, + LinuxDeviceCgroup { + allow: false, + typ: LinuxDeviceType::P, + major: Some(10), + minor: Some(200), + access: "m".to_string(), + }, + LinuxDeviceCgroup { + allow: false, + typ: LinuxDeviceType::U, + major: None, + minor: None, + access: "rw".to_string(), + }, + ] + .iter() + .for_each(|d| { + set_fixture(&tmp, "devices.allow", "").expect("create allowed devices list"); + set_fixture(&tmp, "devices.deny", "").expect("create denied devices list"); + + Devices::apply_device(&d, &tmp).expect("Apply default device"); + println!("Device: {}", d.to_string()); + if d.allow { + let allowed_content = + read_to_string(tmp.join("devices.allow")).expect("read to string"); + assert_eq!(allowed_content, d.to_string()); + } else { + let denied_content = + read_to_string(tmp.join("devices.deny")).expect("read to string"); + assert_eq!(denied_content, d.to_string()); + } + }); + } + + quickcheck! 
{ + fn property_test_apply_device(device: LinuxDeviceCgroup) -> bool { + let tmp = create_temp_dir("property_test_apply_device").expect("create temp directory for test"); + set_fixture(&tmp, "devices.allow", "").expect("create allowed devices list"); + set_fixture(&tmp, "devices.deny", "").expect("create denied devices list"); + Devices::apply_device(&device, &tmp).expect("Apply default device"); + if device.allow { + let allowed_content = + read_to_string(tmp.join("devices.allow")).expect("read to string"); + allowed_content == device.to_string() + } else { + let denied_content = + read_to_string(tmp.join("devices.deny")).expect("read to string"); + denied_content == device.to_string() + } + } + + fn property_test_apply_multiple_devices(devices: Vec) -> bool { + let tmp = create_temp_dir("property_test_apply_multiple_devices").expect("create temp directory for test"); + devices.iter() + .map(|device| { + set_fixture(&tmp, "devices.allow", "").expect("create allowed devices list"); + set_fixture(&tmp, "devices.deny", "").expect("create denied devices list"); + Devices::apply_device(&device, &tmp).expect("Apply default device"); + if device.allow { + let allowed_content = + read_to_string(tmp.join("devices.allow")).expect("read to string"); + allowed_content == device.to_string() + } else { + let denied_content = + read_to_string(tmp.join("devices.deny")).expect("read to string"); + denied_content == device.to_string() + } + }) + .all(|is_ok| is_ok) + } + } +} diff --git a/src/cgroups/v1/freezer.rs b/src/cgroups/v1/freezer.rs new file mode 100644 index 0000000000..4a4dd090cf --- /dev/null +++ b/src/cgroups/v1/freezer.rs @@ -0,0 +1,258 @@ +use std::io::prelude::*; +use std::{ + fs::{create_dir_all, OpenOptions}, + path::Path, + thread, time, +}; + +use anyhow::{Result, *}; + +use crate::cgroups::common; +use crate::cgroups::v1::Controller; +use oci_spec::{FreezerState, LinuxResources}; + +const CGROUP_FREEZER_STATE: &str = "freezer.state"; +const FREEZER_STATE_THAWED: &str = "THAWED"; +const FREEZER_STATE_FROZEN: &str = "FROZEN"; +const FREEZER_STATE_FREEZING: &str = "FREEZING"; + +pub struct Freezer {} + +impl Controller for Freezer { + type Resource = FreezerState; + + fn apply(linux_resources: &LinuxResources, cgroup_root: &Path) -> Result<()> { + log::debug!("Apply Freezer cgroup config"); + create_dir_all(&cgroup_root)?; + + if let Some(freezer_state) = Self::needs_to_handle(linux_resources) { + Self::apply(freezer_state, cgroup_root)?; + } + + Ok(()) + } + + fn needs_to_handle(linux_resources: &LinuxResources) -> Option<&Self::Resource> { + if let Some(freezer_state) = &linux_resources.freezer { + return Some(freezer_state); + } + + None + } +} + +impl Freezer { + fn apply(freezer_state: &FreezerState, cgroup_root: &Path) -> Result<()> { + match freezer_state { + FreezerState::Undefined => {} + FreezerState::Thawed => { + common::write_cgroup_file( + cgroup_root.join(CGROUP_FREEZER_STATE), + FREEZER_STATE_THAWED, + )?; + } + FreezerState::Frozen => { + let r = || -> Result<()> { + // We should do our best to retry if FREEZING is seen until it becomes FROZEN. + // Add sleep between retries occasionally helped when system is extremely slow. 
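+ // Attempt cadence in the loop below (up to 1000 iterations): on every 50th attempt the state is briefly written back to THAWED and the thread sleeps 10ms, on every 25th attempt it only sleeps, otherwise it retries immediately, giving a slow kernel time to finish the transition.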
+ // see: + // https://github.com/opencontainers/runc/blob/b9ee9c6314599f1b4a7f497e1f1f856fe433d3b7/libcontainer/cgroups/fs/freezer.go#L42 + for i in 0..1000 { + if i % 50 == 49 { + let _ = common::write_cgroup_file( + cgroup_root.join(CGROUP_FREEZER_STATE), + FREEZER_STATE_THAWED, + ); + thread::sleep(time::Duration::from_millis(10)); + } + + common::write_cgroup_file( + cgroup_root.join(CGROUP_FREEZER_STATE), + FREEZER_STATE_FROZEN, + )?; + + if i % 25 == 24 { + thread::sleep(time::Duration::from_millis(10)); + } + + let r = Self::read_freezer_state(cgroup_root)?; + match r.trim() { + FREEZER_STATE_FREEZING => { + continue; + } + FREEZER_STATE_FROZEN => { + if i > 1 { + log::debug!("frozen after {} retries", i) + } + return Ok(()); + } + _ => { + // should not reach here. + bail!("unexpected state {} while freezing", r.trim()); + } + } + } + bail!("unable to freeze"); + }(); + + if r.is_err() { + // Freezing failed, and it is bad and dangerous to leave the cgroup in FROZEN or + // FREEZING, so try to thaw it back. + let _ = common::write_cgroup_file( + cgroup_root.join(CGROUP_FREEZER_STATE), + FREEZER_STATE_THAWED, + ); + } + return r; + } + } + Ok(()) + } + + fn read_freezer_state(cgroup_root: &Path) -> Result<String> { + let path = cgroup_root.join(CGROUP_FREEZER_STATE); + let mut content = String::new(); + OpenOptions::new() + .create(false) + .read(true) + .open(path)? + .read_to_string(&mut content)?; + Ok(content) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::cgroups::common::CGROUP_PROCS; + use crate::cgroups::test::set_fixture; + use crate::utils::create_temp_dir; + use nix::unistd::Pid; + use oci_spec::FreezerState; + + #[test] + fn test_set_freezer_state() { + let tmp = + create_temp_dir("test_set_freezer_state").expect("create temp directory for test"); + set_fixture(&tmp, CGROUP_FREEZER_STATE, "").expect("Set fixture for freezer state"); + + // set Frozen state. + { + let freezer_state = FreezerState::Frozen; + Freezer::apply(&freezer_state, &tmp).expect("Set freezer state"); + + let state_content = + std::fs::read_to_string(tmp.join(CGROUP_FREEZER_STATE)).expect("Read to string"); + assert_eq!(FREEZER_STATE_FROZEN, state_content); + } + + // set Thawed state. + { + let freezer_state = FreezerState::Thawed; + Freezer::apply(&freezer_state, &tmp).expect("Set freezer state"); + + let state_content = + std::fs::read_to_string(tmp.join(CGROUP_FREEZER_STATE)).expect("Read to string"); + assert_eq!(FREEZER_STATE_THAWED, state_content); + } + + // set Undefined state. + { + let old_state_content = + std::fs::read_to_string(tmp.join(CGROUP_FREEZER_STATE)).expect("Read to string"); + let freezer_state = FreezerState::Undefined; + Freezer::apply(&freezer_state, &tmp).expect("Set freezer state"); + + let state_content = + std::fs::read_to_string(tmp.join(CGROUP_FREEZER_STATE)).expect("Read to string"); + assert_eq!(old_state_content, state_content); + } + } + + #[test] + fn test_add_and_apply() { + let tmp = create_temp_dir("test_add_task").expect("create temp directory for test"); + set_fixture(&tmp, CGROUP_FREEZER_STATE, "").expect("set fixture for freezer state"); + set_fixture(&tmp, CGROUP_PROCS, "").expect("set fixture for proc file"); + + // set Thawed state.
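+ // Controller::apply expects a full LinuxResources value, so each case below constructs one with only the freezer field populated.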
+ { + let linux_resources = LinuxResources { + devices: vec![], + disable_oom_killer: false, + oom_score_adj: None, + memory: None, + cpu: None, + pids: None, + block_io: None, + hugepage_limits: vec![], + network: None, + freezer: Some(FreezerState::Thawed), + }; + + let pid = Pid::from_raw(1000); + Freezer::add_task(pid, &tmp).expect("freezer add task"); + ::apply(&linux_resources, &tmp).expect("freezer apply"); + let state_content = + std::fs::read_to_string(tmp.join(CGROUP_FREEZER_STATE)).expect("read to string"); + assert_eq!(FREEZER_STATE_THAWED, state_content); + let pid_content = + std::fs::read_to_string(tmp.join(CGROUP_PROCS)).expect("read to string"); + assert_eq!(pid_content, "1000"); + } + + // set Frozen state. + { + let linux_resources = LinuxResources { + devices: vec![], + disable_oom_killer: false, + oom_score_adj: None, + memory: None, + cpu: None, + pids: None, + block_io: None, + hugepage_limits: vec![], + network: None, + freezer: Some(FreezerState::Frozen), + }; + + let pid = Pid::from_raw(1001); + Freezer::add_task(pid, &tmp).expect("freezer add task"); + ::apply(&linux_resources, &tmp).expect("freezer apply"); + let state_content = + std::fs::read_to_string(tmp.join(CGROUP_FREEZER_STATE)).expect("read to string"); + assert_eq!(FREEZER_STATE_FROZEN, state_content); + let pid_content = + std::fs::read_to_string(tmp.join(CGROUP_PROCS)).expect("read to string"); + assert_eq!(pid_content, "1001"); + } + + // set Undefined state. + { + let linux_resources = LinuxResources { + devices: vec![], + disable_oom_killer: false, + oom_score_adj: None, + memory: None, + cpu: None, + pids: None, + block_io: None, + hugepage_limits: vec![], + network: None, + freezer: Some(FreezerState::Undefined), + }; + + let pid = Pid::from_raw(1002); + let old_state_content = + std::fs::read_to_string(tmp.join(CGROUP_FREEZER_STATE)).expect("read to string"); + Freezer::add_task(pid, &tmp).expect("freezer add task"); + ::apply(&linux_resources, &tmp).expect("freezer apply"); + let state_content = + std::fs::read_to_string(tmp.join(CGROUP_FREEZER_STATE)).expect("read to string"); + assert_eq!(old_state_content, state_content); + let pid_content = + std::fs::read_to_string(tmp.join(CGROUP_PROCS)).expect("read to string"); + assert_eq!(pid_content, "1002"); + } + } +} diff --git a/src/cgroups/v1/hugetlb.rs b/src/cgroups/v1/hugetlb.rs new file mode 100644 index 0000000000..3e35512855 --- /dev/null +++ b/src/cgroups/v1/hugetlb.rs @@ -0,0 +1,122 @@ +use std::path::Path; + +use anyhow::{bail, Result}; + +use crate::cgroups::{common, v1::Controller}; +use oci_spec::{LinuxHugepageLimit, LinuxResources}; + +pub struct Hugetlb {} + +impl Controller for Hugetlb { + type Resource = Vec; + + fn apply(linux_resources: &LinuxResources, cgroup_root: &std::path::Path) -> Result<()> { + log::debug!("Apply Hugetlb cgroup config"); + + if let Some(hugepage_limits) = Self::needs_to_handle(linux_resources) { + for hugetlb in hugepage_limits { + Self::apply(cgroup_root, hugetlb)? 
+ } + } + + Ok(()) + } + + fn needs_to_handle(linux_resources: &LinuxResources) -> Option<&Self::Resource> { + if !linux_resources.hugepage_limits.is_empty() { + return Some(&linux_resources.hugepage_limits); + } + + None + } +} + +impl Hugetlb { + fn apply(root_path: &Path, hugetlb: &LinuxHugepageLimit) -> Result<()> { + let page_size: String = hugetlb + .page_size + .chars() + .take_while(|c| c.is_digit(10)) + .collect(); + let page_size: u64 = page_size.parse()?; + if !Self::is_power_of_two(page_size) { + bail!("page size must be in the format of 2^(integer)"); + } + + common::write_cgroup_file( + root_path.join(format!("hugetlb.{}.limit_in_bytes", hugetlb.page_size)), + hugetlb.limit, + )?; + Ok(()) + } + + fn is_power_of_two(number: u64) -> bool { + (number != 0) && (number & (number - 1)) == 0 + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::cgroups::test::set_fixture; + use crate::utils::create_temp_dir; + use oci_spec::LinuxHugepageLimit; + use std::fs::read_to_string; + + #[test] + fn test_set_hugetlb() { + let page_file_name = "hugetlb.2MB.limit_in_bytes"; + let tmp = create_temp_dir("test_set_hugetlb").expect("create temp directory for test"); + set_fixture(&tmp, page_file_name, "0").expect("Set fixture for 2 MB page size"); + + let hugetlb = LinuxHugepageLimit { + page_size: "2MB".to_owned(), + limit: 16384, + }; + Hugetlb::apply(&tmp, &hugetlb).expect("apply hugetlb"); + let content = read_to_string(tmp.join(page_file_name)).expect("Read hugetlb file content"); + assert_eq!(hugetlb.limit.to_string(), content); + } + + #[test] + fn test_set_hugetlb_with_invalid_page_size() { + let tmp = create_temp_dir("test_set_hugetlb_with_invalid_page_size") + .expect("create temp directory for test"); + + let hugetlb = LinuxHugepageLimit { + page_size: "3MB".to_owned(), + limit: 16384, + }; + + let result = Hugetlb::apply(&tmp, &hugetlb); + assert!( + result.is_err(), + "page size that is not a power of two should be an error" + ); + } + + quickcheck! 
{ + fn property_test_set_hugetlb(hugetlb: LinuxHugepageLimit) -> bool { + let page_file_name = format!("hugetlb.{:?}.limit_in_bytes", hugetlb.page_size); + let tmp = create_temp_dir("property_test_set_hugetlb").expect("create temp directory for test"); + set_fixture(&tmp, &page_file_name, "0").expect("Set fixture for page size"); + + let result = Hugetlb::apply(&tmp, &hugetlb); + + let page_size: String = hugetlb + .page_size + .chars() + .take_while(|c| c.is_digit(10)) + .collect(); + let page_size: u64 = page_size.parse().expect("parse page size"); + + if Hugetlb::is_power_of_two(page_size) && page_size != 1 { + let content = + read_to_string(tmp.join(page_file_name)).expect("Read hugetlb file content"); + hugetlb.limit.to_string() == content + } else { + result.is_err() + } + } + } +} diff --git a/src/cgroups/v1/manager.rs b/src/cgroups/v1/manager.rs new file mode 100644 index 0000000000..13e9cc9ad6 --- /dev/null +++ b/src/cgroups/v1/manager.rs @@ -0,0 +1,173 @@ +use std::fs; +use std::path::Path; +use std::{collections::HashMap, path::PathBuf}; + +use anyhow::bail; +use anyhow::Result; +use nix::unistd::Pid; + +use procfs::process::Process; + +use super::ControllerType as CtrlType; +use super::{ + blkio::Blkio, controller_type::CONTROLLERS, cpu::Cpu, cpuacct::CpuAcct, cpuset::CpuSet, + devices::Devices, freezer::Freezer, hugetlb::Hugetlb, memory::Memory, + network_classifier::NetworkClassifier, network_priority::NetworkPriority, pids::Pids, util, + Controller, +}; + +use crate::cgroups::common::CGROUP_PROCS; +use crate::utils; +use crate::{cgroups::common::CgroupManager, utils::PathBufExt}; +use oci_spec::{FreezerState, LinuxResources}; +pub struct Manager { + subsystems: HashMap, +} + +impl Manager { + /// Constructs a new cgroup manager with cgroups_path being relative to the root of the subsystem + pub fn new(cgroup_path: PathBuf) -> Result { + let mut subsystems = HashMap::::new(); + for subsystem in CONTROLLERS { + if let Ok(subsystem_path) = Self::get_subsystem_path(&cgroup_path, subsystem) { + subsystems.insert(subsystem.clone(), subsystem_path); + } else { + log::warn!("Cgroup {} not supported on this system", subsystem); + } + } + + Ok(Manager { subsystems }) + } + + fn get_subsystem_path(cgroup_path: &Path, subsystem: &CtrlType) -> Result { + log::debug!("Get path for subsystem: {}", subsystem); + let mount_point = util::get_subsystem_mount_point(subsystem)?; + + let cgroup = Process::myself()? + .cgroups()? + .into_iter() + .find(|c| c.controllers.contains(&subsystem.to_string())) + .unwrap(); + + let p = if cgroup_path.to_string_lossy().into_owned().is_empty() { + mount_point.join_absolute_path(Path::new(&cgroup.pathname))? + } else if cgroup_path.is_absolute() { + mount_point.join_absolute_path(&cgroup_path)? 
+ } else { + mount_point.join(cgroup_path) + }; + + Ok(p) + } + + fn get_required_controllers( + &self, + linux_resources: &LinuxResources, + ) -> Result<HashMap<&CtrlType, &PathBuf>> { + let mut required_controllers = HashMap::new(); + + for controller in CONTROLLERS { + let required = match controller { + CtrlType::Cpu => Cpu::needs_to_handle(linux_resources).is_some(), + CtrlType::CpuAcct => CpuAcct::needs_to_handle(linux_resources).is_some(), + CtrlType::CpuSet => CpuSet::needs_to_handle(linux_resources).is_some(), + CtrlType::Devices => Devices::needs_to_handle(linux_resources).is_some(), + CtrlType::HugeTlb => Hugetlb::needs_to_handle(linux_resources).is_some(), + CtrlType::Memory => Memory::needs_to_handle(linux_resources).is_some(), + CtrlType::Pids => Pids::needs_to_handle(linux_resources).is_some(), + CtrlType::Blkio => Blkio::needs_to_handle(linux_resources).is_some(), + CtrlType::NetworkPriority => { + NetworkPriority::needs_to_handle(linux_resources).is_some() + } + CtrlType::NetworkClassifier => { + NetworkClassifier::needs_to_handle(linux_resources).is_some() + } + CtrlType::Freezer => Freezer::needs_to_handle(linux_resources).is_some(), + }; + + if required { + if let Some(subsystem_path) = self.subsystems.get(controller) { + required_controllers.insert(controller, subsystem_path); + } else { + bail!("Cgroup {} is required to fulfill the request, but is not supported by this system", controller); + } + } + } + + Ok(required_controllers) + } +} + +impl CgroupManager for Manager { + fn add_task(&self, pid: Pid) -> Result<()> { + for subsys in &self.subsystems { + match subsys.0 { + CtrlType::Cpu => Cpu::add_task(pid, subsys.1)?, + CtrlType::CpuAcct => CpuAcct::add_task(pid, subsys.1)?, + CtrlType::CpuSet => CpuSet::add_task(pid, subsys.1)?, + CtrlType::Devices => Devices::add_task(pid, subsys.1)?, + CtrlType::HugeTlb => Hugetlb::add_task(pid, subsys.1)?, + CtrlType::Memory => Memory::add_task(pid, subsys.1)?, + CtrlType::Pids => Pids::add_task(pid, subsys.1)?, + CtrlType::Blkio => Blkio::add_task(pid, subsys.1)?, + CtrlType::NetworkPriority => NetworkPriority::add_task(pid, subsys.1)?, + CtrlType::NetworkClassifier => NetworkClassifier::add_task(pid, subsys.1)?, + CtrlType::Freezer => Freezer::add_task(pid, subsys.1)?, + } + } + + Ok(()) + } + + fn apply(&self, linux_resources: &LinuxResources) -> Result<()> { + for subsys in self.get_required_controllers(linux_resources)? { + match subsys.0 { + CtrlType::Cpu => Cpu::apply(linux_resources, &subsys.1)?, + CtrlType::CpuAcct => CpuAcct::apply(linux_resources, &subsys.1)?, + CtrlType::CpuSet => CpuSet::apply(linux_resources, &subsys.1)?, + CtrlType::Devices => Devices::apply(linux_resources, &subsys.1)?, + CtrlType::HugeTlb => Hugetlb::apply(linux_resources, &subsys.1)?, + CtrlType::Memory => Memory::apply(linux_resources, &subsys.1)?, + CtrlType::Pids => Pids::apply(linux_resources, &subsys.1)?, + CtrlType::Blkio => Blkio::apply(linux_resources, &subsys.1)?, + CtrlType::NetworkPriority => NetworkPriority::apply(linux_resources, &subsys.1)?, + CtrlType::NetworkClassifier => { + NetworkClassifier::apply(linux_resources, &subsys.1)?
+ } + CtrlType::Freezer => Freezer::apply(linux_resources, &subsys.1)?, + } + } + + Ok(()) + } + + fn remove(&self) -> Result<()> { + for cgroup_path in &self.subsystems { + if cgroup_path.1.exists() { + log::debug!("remove cgroup {:?}", cgroup_path.1); + let procs_path = cgroup_path.1.join(CGROUP_PROCS); + let procs = fs::read_to_string(&procs_path)?; + + for line in procs.lines() { + let pid: i32 = line.parse()?; + let _ = nix::sys::signal::kill(Pid::from_raw(pid), nix::sys::signal::SIGKILL); + } + + utils::delete_with_retry(cgroup_path.1)?; + } + } + + Ok(()) + } + + fn freeze(&self, state: FreezerState) -> Result<()> { + let linux_resources = LinuxResources { + freezer: Some(state), + ..Default::default() + }; + Freezer::apply( + &linux_resources, + &self.subsystems.get(&CtrlType::Freezer).unwrap(), + ) + } +} diff --git a/src/cgroups/memory.rs b/src/cgroups/v1/memory.rs similarity index 58% rename from src/cgroups/memory.rs rename to src/cgroups/v1/memory.rs index 696f7455d7..641fd0eead 100644 --- a/src/cgroups/memory.rs +++ b/src/cgroups/v1/memory.rs @@ -1,15 +1,11 @@ use std::io::{prelude::*, Write}; -use std::{ - fs::{create_dir_all, OpenOptions}, - path::Path, -}; +use std::{fs::OpenOptions, path::Path}; use anyhow::{Result, *}; -use nix::{errno::Errno, unistd::Pid}; +use nix::errno::Errno; -use crate::{ - cgroups::Controller, -}; +use crate::cgroups::common::{self}; +use crate::cgroups::v1::Controller; use oci_spec::{LinuxMemory, LinuxResources}; const CGROUP_MEMORY_SWAP_LIMIT: &str = "memory.memsw.limit_in_bytes"; @@ -26,28 +22,35 @@ const CGROUP_KERNEL_TCP_MEMORY_LIMIT: &str = "memory.kmem.tcp.limit_in_bytes"; pub struct Memory {} impl Controller for Memory { - fn apply(linux_resources: &LinuxResources, cgroup_root: &Path, pid: Pid) -> Result<()> { + type Resource = LinuxMemory; + + fn apply(linux_resources: &LinuxResources, cgroup_root: &Path) -> Result<()> { log::debug!("Apply Memory cgroup config"); - create_dir_all(&cgroup_root)?; - if let Some(memory) = &linux_resources.memory { + if let Some(memory) = Self::needs_to_handle(linux_resources) { let reservation = memory.reservation.unwrap_or(0); Self::apply(&memory, cgroup_root)?; if reservation != 0 { - Self::set(reservation, &cgroup_root.join(CGROUP_MEMORY_RESERVATION))?; + common::write_cgroup_file( + cgroup_root.join(CGROUP_MEMORY_RESERVATION), + reservation, + )?; } if linux_resources.disable_oom_killer { - Self::set(0, &cgroup_root.join(CGROUP_MEMORY_OOM_CONTROL))?; + common::write_cgroup_file(cgroup_root.join(CGROUP_MEMORY_OOM_CONTROL), 0)?; } else { - Self::set(1, &cgroup_root.join(CGROUP_MEMORY_OOM_CONTROL))?; + common::write_cgroup_file(cgroup_root.join(CGROUP_MEMORY_OOM_CONTROL), 1)?; } if let Some(swappiness) = memory.swappiness { if swappiness <= 100 { - Self::set(swappiness, &cgroup_root.join(CGROUP_MEMORY_SWAPPINESS))?; + common::write_cgroup_file( + cgroup_root.join(CGROUP_MEMORY_SWAPPINESS), + swappiness, + )?; } else { // invalid swappiness value return Err(anyhow!( @@ -61,21 +64,26 @@ impl Controller for Memory { // neither are implemented by runc. Tests pass without this, but // kept in per the spec. 
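// Note: CGROUP_KERNEL_MEMORY_LIMIT and CGROUP_KERNEL_TCP_MEMORY_LIMIT map to memory.kmem.limit_in_bytes and memory.kmem.tcp.limit_in_bytes; on kernels that do not expose these files the writes below will return an error.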
if let Some(kmem) = memory.kernel { - Self::set(kmem, &cgroup_root.join(CGROUP_KERNEL_MEMORY_LIMIT))?; + common::write_cgroup_file(cgroup_root.join(CGROUP_KERNEL_MEMORY_LIMIT), kmem)?; } if let Some(tcp_mem) = memory.kernel_tcp { - Self::set(tcp_mem, &cgroup_root.join(CGROUP_KERNEL_TCP_MEMORY_LIMIT))?; + common::write_cgroup_file( + cgroup_root.join(CGROUP_KERNEL_TCP_MEMORY_LIMIT), + tcp_mem, + )?; } - - OpenOptions::new() - .create(false) - .write(true) - .truncate(false) - .open(cgroup_root.join("cgroup.procs"))? - .write_all(pid.to_string().as_bytes())?; } + Ok(()) } + + fn needs_to_handle(linux_resources: &LinuxResources) -> Option<&Self::Resource> { + if let Some(memory) = &linux_resources.memory { + return Some(memory); + } + + None + } } impl Memory { @@ -176,15 +184,12 @@ impl Memory { } } - fn set_swap(val: i64, cgroup_root: &Path) -> Result<()> { - if val == 0 { + fn set_swap(swap: i64, cgroup_root: &Path) -> Result<()> { + if swap == 0 { return Ok(()); } - let path = cgroup_root.join(CGROUP_MEMORY_SWAP_LIMIT); - - Self::set(val, &path)?; - + common::write_cgroup_file(cgroup_root.join(CGROUP_MEMORY_SWAP_LIMIT), swap)?; Ok(()) } @@ -239,24 +244,11 @@ impl Memory { #[cfg(test)] mod tests { use super::*; + use crate::cgroups::common::CGROUP_PROCS; + use crate::cgroups::test::set_fixture; + use crate::utils::create_temp_dir; use oci_spec::LinuxMemory; - fn set_fixture(temp_dir: &std::path::Path, filename: &str, val: &str) -> Result<()> { - std::fs::OpenOptions::new() - .create(true) - .write(true) - .truncate(true) - .open(temp_dir.join(filename))? - .write_all(val.as_bytes())?; - - Ok(()) - } - - fn create_temp_dir(test_name: &str) -> Result { - std::fs::create_dir_all(std::env::temp_dir().join(test_name))?; - Ok(std::env::temp_dir().join(test_name)) - } - #[test] fn test_set_memory() { let limit = 1024; @@ -349,4 +341,134 @@ mod tests { assert_eq!(swap.to_string(), swap_content); } } + + quickcheck! 
{ + fn property_test_set_memory(linux_memory: LinuxMemory, disable_oom_killer: bool) -> bool { + let tmp = + create_temp_dir("property_test_set_memory").expect("create temp directory for test"); + set_fixture(&tmp, CGROUP_MEMORY_USAGE, "0").expect("Set fixure for memory usage"); + set_fixture(&tmp, CGROUP_MEMORY_MAX_USAGE, "0").expect("Set fixure for max memory usage"); + set_fixture(&tmp, CGROUP_MEMORY_LIMIT, "0").expect("Set fixure for memory limit"); + set_fixture(&tmp, CGROUP_MEMORY_SWAP_LIMIT, "0").expect("Set fixure for swap limit"); + set_fixture(&tmp, CGROUP_MEMORY_SWAPPINESS, "0").expect("Set fixure for swappiness"); + set_fixture(&tmp, CGROUP_MEMORY_RESERVATION, "0").expect("Set fixture for memory reservation"); + set_fixture(&tmp, CGROUP_MEMORY_OOM_CONTROL, "0").expect("Set fixture for oom control"); + set_fixture(&tmp, CGROUP_KERNEL_MEMORY_LIMIT, "0").expect("Set fixture for kernel memory limit"); + set_fixture(&tmp, CGROUP_KERNEL_TCP_MEMORY_LIMIT, "0").expect("Set fixture for kernel tcp memory limit"); + set_fixture(&tmp, CGROUP_PROCS, "").expect("set fixture for proc file"); + + + // clone to avoid use of moved value later on + let memory_limits = linux_memory.clone(); + + let linux_resources = LinuxResources { + devices: vec![], + disable_oom_killer, + oom_score_adj: None, // current unused + memory: Some(linux_memory), + cpu: None, + pids: None, + block_io: None, + hugepage_limits: vec![], + network: None, + freezer: None, + }; + + let result = ::apply(&linux_resources, &tmp); + + if result.is_err() { + if let Some(swappiness) = memory_limits.swappiness { + // error is expected if swappiness is greater than 100 + if swappiness > 100 { + return true; + } + } else { + // useful for debugging + println!("Some unexpected error: {:?}", result.unwrap_err()); + // any other error should be considered unexpected + return false; + } + } + + // check memory reservation + let reservation_content = std::fs::read_to_string(tmp.join(CGROUP_MEMORY_RESERVATION)).expect("read memory reservation"); + let reservation_check = match memory_limits.reservation { + Some(reservation) => { + reservation_content == reservation.to_string() + } + None => reservation_content == "0", + }; + + // check kernel memory limit + let kernel_content = std::fs::read_to_string(tmp.join(CGROUP_KERNEL_MEMORY_LIMIT)).expect("read kernel memory limit"); + let kernel_check = match memory_limits.kernel { + Some(kernel) => { + kernel_content == kernel.to_string() + } + None => kernel_content == "0", + }; + + // check kernel tcp memory limit + let kernel_tcp_content = std::fs::read_to_string(tmp.join(CGROUP_KERNEL_TCP_MEMORY_LIMIT)).expect("read kernel tcp memory limit"); + let kernel_tcp_check = match memory_limits.kernel_tcp { + Some(kernel_tcp) => { + kernel_tcp_content == kernel_tcp.to_string() + } + None => kernel_tcp_content == "0", + }; + + // check swappiness + let swappiness_content = std::fs::read_to_string(tmp.join(CGROUP_MEMORY_SWAPPINESS)).expect("read swappiness"); + let swappiness_check = match memory_limits.swappiness { + Some(swappiness) if swappiness <= 100 => { + swappiness_content == swappiness.to_string() + } + None => swappiness_content == "0", + // everything else is a failure + _ => false, + }; + + // check limit and swap + let limit_content = std::fs::read_to_string(tmp.join(CGROUP_MEMORY_LIMIT)).expect("read memory limit"); + let swap_content = std::fs::read_to_string(tmp.join(CGROUP_MEMORY_SWAP_LIMIT)).expect("read swap memory limit"); + let limit_swap_check = match memory_limits.limit { + 
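+ // Expected pairing: an explicit swap value must be written verbatim; with no swap value, memsw mirrors an unlimited limit (-1) and is otherwise left at the fixture default of 0.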
Some(limit) => { + match memory_limits.swap { + Some(swap) => { + limit_content == limit.to_string() + && swap_content == swap.to_string() + } + None => { + if limit == -1 { + limit_content == limit.to_string() + && swap_content == "-1" + } else { + limit_content == limit.to_string() + && swap_content == "0" + } + } + } + } + None => { + match memory_limits.swap { + Some(swap) => { + limit_content == "0" + && swap_content == swap.to_string() + } + None => limit_content == "0" && swap_content == "0" + } + } + }; + + // useful for debugging + println!("reservation_check: {:?}", reservation_check); + println!("kernel_check: {:?}", kernel_check); + println!("kernel_tcp_check: {:?}", kernel_tcp_check); + println!("swappiness_check: {:?}", swappiness_check); + println!("limit_swap_check: {:?}", limit_swap_check); + + // combine all the checks + reservation_check && kernel_check && kernel_tcp_check && swappiness_check && limit_swap_check + } + } } diff --git a/src/cgroups/v1/mod.rs b/src/cgroups/v1/mod.rs new file mode 100644 index 0000000000..ff18551439 --- /dev/null +++ b/src/cgroups/v1/mod.rs @@ -0,0 +1,18 @@ +mod blkio; +mod controller; +mod controller_type; +mod cpu; +mod cpuacct; +mod cpuset; +mod devices; +mod freezer; +mod hugetlb; +pub mod manager; +mod memory; +mod network_classifier; +mod network_priority; +mod pids; +pub mod util; +pub use controller::Controller; +pub use controller_type::ControllerType; +pub use manager::Manager; diff --git a/src/cgroups/v1/network_classifier.rs b/src/cgroups/v1/network_classifier.rs new file mode 100644 index 0000000000..551fc67269 --- /dev/null +++ b/src/cgroups/v1/network_classifier.rs @@ -0,0 +1,67 @@ +use std::path::Path; + +use anyhow::Result; + +use crate::cgroups::common; +use crate::cgroups::v1::Controller; +use oci_spec::{LinuxNetwork, LinuxResources}; + +pub struct NetworkClassifier {} + +impl Controller for NetworkClassifier { + type Resource = LinuxNetwork; + + fn apply(linux_resources: &LinuxResources, cgroup_root: &Path) -> Result<()> { + log::debug!("Apply NetworkClassifier cgroup config"); + + if let Some(network) = Self::needs_to_handle(linux_resources) { + Self::apply(cgroup_root, network)?; + } + + Ok(()) + } + + fn needs_to_handle(linux_resources: &LinuxResources) -> Option<&Self::Resource> { + if let Some(network) = &linux_resources.network { + return Some(network); + } + + None + } +} + +impl NetworkClassifier { + fn apply(root_path: &Path, network: &LinuxNetwork) -> Result<()> { + if let Some(class_id) = network.class_id { + common::write_cgroup_file(root_path.join("net_cls.classid"), class_id)?; + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::cgroups::test::set_fixture; + use crate::utils::create_temp_dir; + + #[test] + fn test_apply_network_classifier() { + let tmp = create_temp_dir("test_apply_network_classifier") + .expect("create temp directory for test"); + set_fixture(&tmp, "net_cls.classid", "0").expect("set fixture for classID"); + + let id = 0x100001; + let network = LinuxNetwork { + class_id: Some(id), + priorities: vec![], + }; + + NetworkClassifier::apply(&tmp, &network).expect("apply network classID"); + + let content = + std::fs::read_to_string(tmp.join("net_cls.classid")).expect("Read classID contents"); + assert_eq!(id.to_string(), content); + } +} diff --git a/src/cgroups/network_priority.rs b/src/cgroups/v1/network_priority.rs similarity index 54% rename from src/cgroups/network_priority.rs rename to src/cgroups/v1/network_priority.rs index 291ca9df5a..63683bc3cf 100644 --- 
a/src/cgroups/network_priority.rs +++ b/src/cgroups/v1/network_priority.rs @@ -1,52 +1,39 @@ -use std::io::Write; -use std::{ - fs::{create_dir_all, OpenOptions}, - path::Path, -}; +use std::path::Path; use anyhow::Result; -use nix::unistd::Pid; -use crate::cgroups::Controller; +use crate::cgroups::common; +use crate::cgroups::v1::Controller; use oci_spec::{LinuxNetwork, LinuxResources}; pub struct NetworkPriority {} impl Controller for NetworkPriority { - fn apply(linux_resources: &LinuxResources, cgroup_root: &Path, pid: Pid) -> Result<()> { + type Resource = LinuxNetwork; + + fn apply(linux_resources: &LinuxResources, cgroup_root: &Path) -> Result<()> { log::debug!("Apply NetworkPriority cgroup config"); - create_dir_all(&cgroup_root)?; - if let Some(network) = linux_resources.network.as_ref() { + if let Some(network) = Self::needs_to_handle(linux_resources) { Self::apply(cgroup_root, network)?; - - OpenOptions::new() - .create(false) - .write(true) - .truncate(true) - .open(cgroup_root.join("cgroup.procs"))? - .write_all(pid.to_string().as_bytes())?; } Ok(()) } + + fn needs_to_handle(linux_resources: &LinuxResources) -> Option<&Self::Resource> { + if let Some(network) = &linux_resources.network { + return Some(network); + } + + None + } } impl NetworkPriority { fn apply(root_path: &Path, network: &LinuxNetwork) -> Result<()> { let priorities: String = network.priorities.iter().map(|p| p.to_string()).collect(); - Self::write_file(&root_path.join("net_prio.ifpriomap"), &priorities.trim())?; - - Ok(()) - } - - fn write_file(file_path: &Path, data: &str) -> Result<()> { - OpenOptions::new() - .create(false) - .write(true) - .truncate(true) - .open(file_path)? - .write_all(data.as_bytes())?; + common::write_cgroup_file_str(root_path.join("net_prio.ifpriomap"), &priorities.trim())?; Ok(()) } @@ -54,27 +41,11 @@ impl NetworkPriority { #[cfg(test)] mod tests { - use std::path::PathBuf; - use super::*; + use crate::cgroups::test::set_fixture; + use crate::utils::create_temp_dir; use oci_spec::LinuxInterfacePriority; - fn set_fixture(temp_dir: &std::path::Path, filename: &str, val: &str) -> Result<()> { - std::fs::OpenOptions::new() - .create(true) - .write(true) - .truncate(true) - .open(temp_dir.join(filename))? - .write_all(val.as_bytes())?; - - Ok(()) - } - - fn create_temp_dir(test_name: &str) -> Result { - std::fs::create_dir_all(std::env::temp_dir().join(test_name))?; - Ok(std::env::temp_dir().join(test_name)) - } - #[test] fn test_apply_network_priorites() { let tmp = create_temp_dir("test_apply_network_priorites") diff --git a/src/cgroups/pids.rs b/src/cgroups/v1/pids.rs similarity index 51% rename from src/cgroups/pids.rs rename to src/cgroups/v1/pids.rs index df6d5b9c9f..025bed7d9a 100644 --- a/src/cgroups/pids.rs +++ b/src/cgroups/v1/pids.rs @@ -1,38 +1,32 @@ -use std::{ - fs::{self, OpenOptions}, - io::Write, - path::Path, -}; +use std::path::Path; use anyhow::Result; -use crate::{ - cgroups::Controller, -}; +use crate::cgroups::{common, v1::Controller}; use oci_spec::{LinuxPids, LinuxResources}; pub struct Pids {} impl Controller for Pids { - fn apply( - linux_resources: &LinuxResources, - cgroup_root: &std::path::Path, - pid: nix::unistd::Pid, - ) -> anyhow::Result<()> { - fs::create_dir_all(cgroup_root)?; - - for pids in &linux_resources.pids { - Self::apply(cgroup_root, pids)? 
+ type Resource = LinuxPids; + + fn apply(linux_resources: &LinuxResources, cgroup_root: &Path) -> Result<()> { + log::debug!("Apply pids cgroup config"); + + if let Some(pids) = &linux_resources.pids { + Self::apply(cgroup_root, pids)?; } - OpenOptions::new() - .create(false) - .write(true) - .truncate(false) - .open(cgroup_root.join("cgroup.procs"))? - .write_all(pid.to_string().as_bytes())?; Ok(()) } + + fn needs_to_handle(linux_resources: &LinuxResources) -> Option<&Self::Resource> { + if let Some(pids) = &linux_resources.pids { + return Some(pids); + } + + None + } } impl Pids { @@ -43,18 +37,7 @@ impl Pids { "max".to_string() }; - Self::write_file(&root_path.join("pids.max"), &limit)?; - Ok(()) - } - - fn write_file(file_path: &Path, data: &str) -> Result<()> { - fs::OpenOptions::new() - .create(false) - .write(true) - .truncate(true) - .open(file_path)? - .write_all(data.as_bytes())?; - + common::write_cgroup_file_str(&root_path.join("pids.max"), &limit)?; Ok(()) } } @@ -62,24 +45,10 @@ impl Pids { #[cfg(test)] mod tests { use super::*; + use crate::cgroups::test::set_fixture; + use crate::utils::create_temp_dir; use oci_spec::LinuxPids; - fn set_fixture(temp_dir: &std::path::Path, filename: &str, val: &str) -> Result<()> { - std::fs::OpenOptions::new() - .create(true) - .write(true) - .truncate(true) - .open(temp_dir.join(filename))? - .write_all(val.as_bytes())?; - - Ok(()) - } - - fn create_temp_dir(test_name: &str) -> Result<PathBuf> { - std::fs::create_dir_all(std::env::temp_dir().join(test_name))?; - Ok(std::env::temp_dir().join(test_name)) - } - #[test] fn test_set_pids() { let pids_file_name = "pids.max"; diff --git a/src/cgroups/v1/util.rs b/src/cgroups/v1/util.rs new file mode 100644 index 0000000000..389bbddba3 --- /dev/null +++ b/src/cgroups/v1/util.rs @@ -0,0 +1,53 @@ +use std::{collections::HashMap, path::PathBuf}; + +use anyhow::{anyhow, Result}; +use procfs::process::Process; + +use super::{controller_type::CONTROLLERS, ControllerType}; + +pub fn list_subsystem_mount_points() -> Result<HashMap<ControllerType, PathBuf>> { + let mut mount_paths = HashMap::with_capacity(CONTROLLERS.len()); + + for controller in CONTROLLERS { + if let Ok(mount_point) = get_subsystem_mount_point(controller) { + mount_paths.insert(controller.to_owned(), mount_point); + } + } + + Ok(mount_paths) +} + +pub fn get_subsystem_mount_point(subsystem: &ControllerType) -> Result<PathBuf> { + let subsystem = subsystem.to_string(); + Process::myself()? + .mountinfo()? + .into_iter() + .find(|m| { + if m.fs_type == "cgroup" { + // Some systems mount net_prio and net_cls in the same directory + // other systems mount them in their own directories. This + // should handle both cases.
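// On many distros net_cls and net_prio share one hierarchy, e.g.
// /sys/fs/cgroup/net_cls,net_prio, while cpu and cpuacct are co-mounted as
// /sys/fs/cgroup/cpu,cpuacct. A minimal sketch of the suffix check used for
// the shared network hierarchy (hypothetical helper, std only):
fn is_shared_net_mount(mount_point: &std::path::Path) -> bool {
    // Path::ends_with compares whole components, so "net_cls,net_prio" is one component.
    mount_point.ends_with("net_cls,net_prio") || mount_point.ends_with("net_prio,net_cls")
}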
+ if subsystem == "net_cls" { + return m.mount_point.ends_with("net_cls,net_prio") + || m.mount_point.ends_with("net_prio,net_cls") + || m.mount_point.ends_with("net_cls"); + } else if subsystem == "net_prio" { + return m.mount_point.ends_with("net_cls,net_prio") + || m.mount_point.ends_with("net_prio,net_cls") + || m.mount_point.ends_with("net_prio"); + } + + if subsystem == "cpu" { + return m.mount_point.ends_with("cpu,cpuacct") + || m.mount_point.ends_with("cpu"); + } + if subsystem == "cpuacct" { + return m.mount_point.ends_with("cpu,cpuacct") + || m.mount_point.ends_with("cpuacct"); + } + } + m.mount_point.ends_with(&subsystem) + }) + .map(|m| m.mount_point) + .ok_or_else(|| anyhow!("could not find mountpoint for {}", subsystem)) +} diff --git a/src/cgroups/v2/controller.rs b/src/cgroups/v2/controller.rs new file mode 100644 index 0000000000..e8751f92e5 --- /dev/null +++ b/src/cgroups/v2/controller.rs @@ -0,0 +1,8 @@ +use anyhow::Result; +use std::path::Path; + +use oci_spec::LinuxResources; + +pub trait Controller { + fn apply(linux_resources: &LinuxResources, cgroup_path: &Path) -> Result<()>; +} diff --git a/src/cgroups/controller_type.rs b/src/cgroups/v2/controller_type.rs similarity index 50% rename from src/cgroups/controller_type.rs rename to src/cgroups/v2/controller_type.rs index 933c593b17..c5cfb72012 100644 --- a/src/cgroups/controller_type.rs +++ b/src/cgroups/v2/controller_type.rs @@ -1,25 +1,23 @@ -use std::string::ToString; - pub enum ControllerType { - Devices, + Cpu, + CpuSet, + Io, + Memory, HugeTlb, Pids, - Memory, - Blkio, - NetworkPriority, - NetworkClassifier, + Freezer, } impl ToString for ControllerType { fn to_string(&self) -> String { match self { - Self::Devices => "devices".into(), + Self::Cpu => "cpu".into(), + Self::CpuSet => "cpuset".into(), + Self::Io => "io".into(), + Self::Memory => "memory".into(), Self::HugeTlb => "hugetlb".into(), Self::Pids => "pids".into(), - Self::Memory => "memory".into(), - Self::Blkio => "blkio".into(), - Self::NetworkPriority => "net_prio".into(), - Self::NetworkClassifier => "net_cls".into(), + Self::Freezer => "freezer".into(), } } } diff --git a/src/cgroups/v2/cpu.rs b/src/cgroups/v2/cpu.rs new file mode 100644 index 0000000000..184be42ba0 --- /dev/null +++ b/src/cgroups/v2/cpu.rs @@ -0,0 +1,231 @@ +use anyhow::{bail, Result}; +use std::path::Path; + +use crate::cgroups::common; +use oci_spec::{LinuxCpu, LinuxResources}; + +use super::controller::Controller; + +const CGROUP_CPU_WEIGHT: &str = "cpu.weight"; +const CGROUP_CPU_MAX: &str = "cpu.max"; +const DEFAULT_PERIOD: &str = "100000"; +const UNRESTRICTED_QUOTA: &str = "max"; + +pub struct Cpu {} + +impl Controller for Cpu { + fn apply(linux_resources: &LinuxResources, path: &Path) -> Result<()> { + if let Some(cpu) = &linux_resources.cpu { + Self::apply(path, cpu)?; + } + + Ok(()) + } +} + +impl Cpu { + fn apply(path: &Path, cpu: &LinuxCpu) -> Result<()> { + if Self::is_realtime_requested(cpu) { + bail!("realtime is not supported on cgroup v2 yet"); + } + + if let Some(mut shares) = cpu.shares { + shares = Self::convert_shares_to_cgroup2(shares); + if shares != 0 { + // will result in Erno 34 (numerical result out of range) otherwise + common::write_cgroup_file(path.join(CGROUP_CPU_WEIGHT), shares)?; + } + } + + // if quota is unrestricted set to 'max' + let mut quota_string = UNRESTRICTED_QUOTA.to_owned(); + if let Some(quota) = cpu.quota { + if quota > 0 { + quota_string = quota.to_string(); + } + } + + let mut period_string: String = DEFAULT_PERIOD.to_owned(); + if 
let Some(period) = cpu.period { + if period > 0 { + period_string = period.to_string(); + } + } + + // format is 'quota period' + // the kernel default is 'max 100000' + // 250000 250000 -> 1 CPU worth of runtime every 250ms + // 10000 50000 -> 20% of one CPU every 50ms + let max = quota_string + " " + &period_string; + common::write_cgroup_file_str(path.join(CGROUP_CPU_MAX), &max)?; + + Ok(()) + } + + fn convert_shares_to_cgroup2(shares: u64) -> u64 { + if shares == 0 { + return 0; + } + + 1 + ((shares - 2) * 9999) / 262142 + } + + fn is_realtime_requested(cpu: &LinuxCpu) -> bool { + if cpu.realtime_period.is_some() { + return true; + } + + if cpu.realtime_runtime.is_some() { + return true; + } + + false + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::cgroups::test::{set_fixture, setup, LinuxCpuBuilder}; + use crate::utils::create_temp_dir; + use std::fs; + + #[test] + fn test_set_shares() { + // arrange + let (tmp, weight) = setup("test_set_shares", CGROUP_CPU_WEIGHT); + let _ = set_fixture(&tmp, CGROUP_CPU_MAX, "") + .unwrap_or_else(|_| panic!("set test fixture for {}", CGROUP_CPU_MAX)); + let cpu = LinuxCpuBuilder::new().with_shares(22000).build(); + + // act + Cpu::apply(&tmp, &cpu).expect("apply cpu"); + + // assert + let content = fs::read_to_string(weight) + .unwrap_or_else(|_| panic!("read {} file content", CGROUP_CPU_WEIGHT)); + assert_eq!(content, 840.to_string()); + } + + #[test] + fn test_set_positive_quota() { + // arrange + const QUOTA: i64 = 200000; + let (tmp, max) = setup("test_set_positive_quota", CGROUP_CPU_MAX); + let cpu = LinuxCpuBuilder::new().with_quota(QUOTA).build(); + + // act + Cpu::apply(&tmp, &cpu).expect("apply cpu"); + + // assert + let content = fs::read_to_string(max) + .unwrap_or_else(|_| panic!("read {} file content", CGROUP_CPU_MAX)); + assert_eq!(content, format!("{} {}", QUOTA, DEFAULT_PERIOD)) + } + + #[test] + fn test_set_zero_quota() { + // arrange + let (tmp, max) = setup("test_set_zero_quota", CGROUP_CPU_MAX); + let cpu = LinuxCpuBuilder::new().with_quota(0).build(); + + // act + Cpu::apply(&tmp, &cpu).expect("apply cpu"); + + // assert + let content = fs::read_to_string(max) + .unwrap_or_else(|_| panic!("read {} file content", CGROUP_CPU_MAX)); + assert_eq!( + content, + format!("{} {}", UNRESTRICTED_QUOTA, DEFAULT_PERIOD) + ) + } + + #[test] + fn test_set_positive_period() { + // arrange + const PERIOD: u64 = 100000; + let (tmp, max) = setup("test_set_positive_period", CGROUP_CPU_MAX); + let cpu = LinuxCpuBuilder::new().with_period(PERIOD).build(); + + // act + Cpu::apply(&tmp, &cpu).expect("apply cpu"); + + // assert + let content = fs::read_to_string(max) + .unwrap_or_else(|_| panic!("read {} file content", CGROUP_CPU_MAX)); + assert_eq!(content, format!("{} {}", UNRESTRICTED_QUOTA, PERIOD)) + } + + #[test] + fn test_set_zero_period() { + // arrange + let (tmp, max) = setup("test_set_zero_period", CGROUP_CPU_MAX); + let cpu = LinuxCpuBuilder::new().with_period(0).build(); + + // act + Cpu::apply(&tmp, &cpu).expect("apply cpu"); + + // assert + let content = fs::read_to_string(max) + .unwrap_or_else(|_| panic!("read {} file content", CGROUP_CPU_MAX)); + assert_eq!( + content, + format!("{} {}", UNRESTRICTED_QUOTA, DEFAULT_PERIOD) + ); + } + + #[test] + fn test_set_quota_and_period() { + // arrange + const QUOTA: i64 = 200000; + const PERIOD: u64 = 100000; + let (tmp, max) = setup("test_set_quota_and_period", CGROUP_CPU_MAX); + let cpu = LinuxCpuBuilder::new() + .with_quota(QUOTA) + .with_period(PERIOD) + .build(); + + // act 
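// Spot-checking the shares-to-weight conversion defined above (hypothetical
// helper): cgroup v1 cpu.shares spans [2, 262144] and cgroup v2 cpu.weight
// spans [1, 10000], so the formula pins both endpoints and scales linearly
// in between.
fn weight_conversion_spot_check() {
    let to_weight = |shares: u64| 1 + ((shares - 2) * 9999) / 262142;
    assert_eq!(to_weight(2), 1); // v1 minimum maps to v2 minimum
    assert_eq!(to_weight(1024), 39); // the common v1 default
    assert_eq!(to_weight(22000), 840); // the value expected by test_set_shares
    assert_eq!(to_weight(262144), 10000); // v1 maximum maps to v2 maximum
}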
+ Cpu::apply(&tmp, &cpu).expect("apply cpu"); + + // assert + let content = fs::read_to_string(max) + .unwrap_or_else(|_| panic!("read {} file content", CGROUP_CPU_MAX)); + assert_eq!(content, format!("{} {}", QUOTA, PERIOD)); + } + + #[test] + fn test_realtime_runtime_not_supported() { + // arrange + let tmp = create_temp_dir("test_realtime_runtime_not_supported") + .expect("create temp directory for test"); + let cpu = LinuxCpuBuilder::new().with_realtime_runtime(5).build(); + + // act + let result = Cpu::apply(&tmp, &cpu); + + // assert + assert!( + result.is_err(), + "realtime runtime is not supported and should return an error" + ); + } + + #[test] + fn test_realtime_period_not_supported() { + // arrange + let tmp = create_temp_dir("test_realtime_period_not_supported") + .expect("create temp directory for test"); + let cpu = LinuxCpuBuilder::new().with_realtime_period(5).build(); + + // act + let result = Cpu::apply(&tmp, &cpu); + + // assert + assert!( + result.is_err(), + "realtime period is not supported and should return an error" + ); + } +} diff --git a/src/cgroups/v2/cpuset.rs b/src/cgroups/v2/cpuset.rs new file mode 100644 index 0000000000..b477080b22 --- /dev/null +++ b/src/cgroups/v2/cpuset.rs @@ -0,0 +1,74 @@ +use anyhow::Result; +use std::path::Path; + +use crate::cgroups::common; +use oci_spec::{LinuxCpu, LinuxResources}; + +use super::controller::Controller; + +const CGROUP_CPUSET_CPUS: &str = "cpuset.cpus"; +const CGROUP_CPUSET_MEMS: &str = "cpuset.mems"; + +pub struct CpuSet {} + +impl Controller for CpuSet { + fn apply(linux_resources: &LinuxResources, cgroup_path: &Path) -> Result<()> { + if let Some(cpuset) = &linux_resources.cpu { + Self::apply(cgroup_path, cpuset)?; + } + + Ok(()) + } +} + +impl CpuSet { + fn apply(path: &Path, cpuset: &LinuxCpu) -> Result<()> { + if let Some(cpus) = &cpuset.cpus { + common::write_cgroup_file_str(path.join(CGROUP_CPUSET_CPUS), cpus)?; + } + + if let Some(mems) = &cpuset.mems { + common::write_cgroup_file_str(path.join(CGROUP_CPUSET_MEMS), mems)?; + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use std::fs; + + use super::*; + use crate::cgroups::test::{setup, LinuxCpuBuilder}; + + #[test] + fn test_set_cpus() { + // arrange + let (tmp, cpus) = setup("test_set_cpus", CGROUP_CPUSET_CPUS); + let cpuset = LinuxCpuBuilder::new().with_cpus("1-3".to_owned()).build(); + + // act + CpuSet::apply(&tmp, &cpuset).expect("apply cpuset"); + + // assert + let content = fs::read_to_string(&cpus) + .unwrap_or_else(|_| panic!("read {} file content", CGROUP_CPUSET_CPUS)); + assert_eq!(content, "1-3"); + } + + #[test] + fn test_set_mems() { + // arrange + let (tmp, mems) = setup("test_set_mems", CGROUP_CPUSET_MEMS); + let cpuset = LinuxCpuBuilder::new().with_mems("1-3".to_owned()).build(); + + // act + CpuSet::apply(&tmp, &cpuset).expect("apply cpuset"); + + // assert + let content = fs::read_to_string(&mems) + .unwrap_or_else(|_| panic!("read {} file content", CGROUP_CPUSET_MEMS)); + assert_eq!(content, "1-3"); + } +} diff --git a/src/cgroups/v2/freezer.rs b/src/cgroups/v2/freezer.rs new file mode 100644 index 0000000000..5827f813d0 --- /dev/null +++ b/src/cgroups/v2/freezer.rs @@ -0,0 +1,193 @@ +use anyhow::{bail, Result}; +use std::{ + fs::OpenOptions, + io::{BufRead, BufReader, Read, Seek, SeekFrom, Write}, + path::Path, + str, thread, + time::Duration, +}; + +use oci_spec::{FreezerState, LinuxResources}; + +use super::controller::Controller; + +const CGROUP_FREEZE: &str = "cgroup.freeze"; +const CGROUP_EVENTS: &str = "cgroup.events"; + +pub 
struct Freezer {} + +impl Controller for Freezer { + fn apply(linux_resources: &LinuxResources, cgroup_path: &Path) -> Result<()> { + if let Some(freezer_state) = linux_resources.freezer { + Self::apply(freezer_state, cgroup_path)?; + } + + Ok(()) + } +} + +impl Freezer { + fn apply(freezer_state: FreezerState, path: &Path) -> Result<()> { + let state_str = match freezer_state { + FreezerState::Undefined => return Ok(()), + FreezerState::Frozen => "1", + FreezerState::Thawed => "0", + }; + + match OpenOptions::new() + .create(false) + .write(true) + .open(path.join(CGROUP_FREEZE)) + { + Err(e) => { + if let FreezerState::Frozen = freezer_state { + bail!("freezer not supported {}", e); + } + return Ok(()); + } + Ok(mut file) => file.write_all(state_str.as_bytes())?, + }; + + // confirm that the cgroup did actually change states. + let actual_state = Self::read_freezer_state(path)?; + if !actual_state.eq(&freezer_state) { + bail!( + "expected \"cgroup.freeze\" to be in state {:?} but was in {:?}", + freezer_state, + actual_state + ); + } + + Ok(()) + } + + fn read_freezer_state(path: &Path) -> Result { + let mut buf = [0; 1]; + OpenOptions::new() + .create(false) + .read(true) + .open(path.join(CGROUP_FREEZE))? + .read_exact(&mut buf)?; + + let state = str::from_utf8(&buf)?; + match state { + "0" => Ok(FreezerState::Thawed), + "1" => Self::wait_frozen(path), + _ => bail!("unknown \"cgroup.freeze\" state: {}", state), + } + } + + // wait_frozen polls cgroup.events until it sees "frozen 1" in it. + fn wait_frozen(path: &Path) -> Result { + let f = OpenOptions::new() + .create(false) + .read(true) + .open(path.join(CGROUP_EVENTS))?; + let mut f = BufReader::new(f); + + let wait_time = Duration::from_millis(10); + let max_iter = 1000; + let mut iter = 0; + let mut line = String::new(); + + loop { + if iter == max_iter { + bail!( + "timeout of {} ms reached waiting for the cgroup to freeze", + wait_time.as_millis() * max_iter + ); + } + line.clear(); + let num_bytes = f.read_line(&mut line)?; + if num_bytes == 0 { + break; + } + if line.starts_with("frozen ") { + if line.starts_with("frozen 1") { + if iter > 1 { + log::debug!("frozen after {} retries", iter) + } + return Ok(FreezerState::Frozen); + } + iter += 1; + thread::sleep(wait_time); + f.seek(SeekFrom::Start(0))?; + } + } + + Ok(FreezerState::Undefined) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::cgroups::test::set_fixture; + use crate::utils::create_temp_dir; + use oci_spec::FreezerState; + use std::sync::Arc; + + #[test] + fn test_set_freezer_state() { + let tmp = Arc::new( + create_temp_dir("test_set_freezer_state").expect("create temp directory for test"), + ); + set_fixture(&tmp, CGROUP_FREEZE, "").expect("Set fixure for freezer state"); + set_fixture(&tmp, CGROUP_EVENTS, "populated 0\nfrozen 0") + .expect("Set fixure for freezer state"); + + // set Frozen state. + { + // use another thread to update events file async. + let p = Arc::clone(&tmp); + thread::spawn(move || { + thread::sleep(Duration::from_millis(100)); + set_fixture(&p, CGROUP_EVENTS, "populated 0\nfrozen 1") + .expect("Set fixure for freezer state"); + }); + let freezer_state = FreezerState::Frozen; + Freezer::apply(freezer_state, &tmp).expect("Set freezer state"); + + let state_content = + std::fs::read_to_string(tmp.join(CGROUP_FREEZE)).expect("Read to string"); + assert_eq!("1", state_content); + } + + // set Thawed state. 
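// cgroup.events holds one "key value" pair per line, e.g. "populated 1\nfrozen 1",
// which is why the wait loop above re-reads the file until the frozen line
// reports 1. A minimal parse of one snapshot (hypothetical helper):
fn events_report_frozen(events: &str) -> bool {
    events.lines().any(|line| line.trim() == "frozen 1")
}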
+ { + let freezer_state = FreezerState::Thawed; + Freezer::apply(freezer_state, &tmp).expect("Set freezer state"); + + let state_content = + std::fs::read_to_string(tmp.join(CGROUP_FREEZE)).expect("Read to string"); + assert_eq!("0", state_content); + } + + // set Undefined state. + { + let old_state_content = + std::fs::read_to_string(tmp.join(CGROUP_FREEZE)).expect("Read to string"); + let freezer_state = FreezerState::Undefined; + Freezer::apply(freezer_state, &tmp).expect("Set freezer state"); + + let state_content = + std::fs::read_to_string(tmp.join(CGROUP_FREEZE)).expect("Read to string"); + assert_eq!(old_state_content, state_content); + } + } + + #[test] + fn test_set_freezer_state_error() { + let tmp = create_temp_dir("test_set_freezer_state_error") + .expect("create temp directory for test"); + set_fixture(&tmp, CGROUP_FREEZE, "").expect("Set fixure for freezer state"); + set_fixture(&tmp, CGROUP_EVENTS, "").expect("Set fixure for freezer state"); + + // events file does not contain "frozen 1" + { + let freezer_state = FreezerState::Frozen; + let r = Freezer::apply(freezer_state, &tmp); + assert!(r.is_err()); + } + } +} diff --git a/src/cgroups/v2/hugetlb.rs b/src/cgroups/v2/hugetlb.rs new file mode 100644 index 0000000000..fe0cf9dab6 --- /dev/null +++ b/src/cgroups/v2/hugetlb.rs @@ -0,0 +1,117 @@ +use anyhow::{bail, Result}; +use std::path::Path; + +use super::controller::Controller; +use crate::cgroups::common; +use oci_spec::{LinuxHugepageLimit, LinuxResources}; + +pub struct HugeTlb {} + +impl Controller for HugeTlb { + fn apply(linux_resources: &LinuxResources, cgroup_root: &std::path::Path) -> Result<()> { + log::debug!("Apply hugetlb cgroup v2 config"); + if let Some(hugepage_limits) = Self::needs_to_handle(linux_resources) { + for hugetlb in hugepage_limits { + Self::apply(cgroup_root, hugetlb)? 
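// The page-size validation below keeps only the leading digits ("2MB" -> 2)
// and then uses the classic bit trick n & (n - 1), which clears the lowest
// set bit and is therefore zero exactly when n is a power of two
// (hypothetical helper mirroring is_power_of_two):
fn power_of_two(n: u64) -> bool {
    n != 0 && (n & (n - 1)) == 0 // power_of_two(2) is true, power_of_two(3) is false
}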
+ } + } + Ok(()) + } +} + +impl HugeTlb { + fn apply(root_path: &Path, hugetlb: &LinuxHugepageLimit) -> Result<()> { + let page_size: String = hugetlb + .page_size + .chars() + .take_while(|c| c.is_digit(10)) + .collect(); + let page_size: u64 = page_size.parse()?; + if !Self::is_power_of_two(page_size) { + bail!("page size must be in the format of 2^(integer)"); + } + + common::write_cgroup_file( + root_path.join(format!("hugetlb.{}.limit_in_bytes", hugetlb.page_size)), + hugetlb.limit, + )?; + Ok(()) + } + + fn needs_to_handle(linux_resources: &LinuxResources) -> Option<&Vec> { + if !linux_resources.hugepage_limits.is_empty() { + return Some(&linux_resources.hugepage_limits); + } + + None + } + + fn is_power_of_two(number: u64) -> bool { + (number != 0) && (number & (number - 1)) == 0 + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::cgroups::test::set_fixture; + use crate::utils::create_temp_dir; + use oci_spec::LinuxHugepageLimit; + use std::fs::read_to_string; + + #[test] + fn test_set_hugetlb() { + let page_file_name = "hugetlb.2MB.limit_in_bytes"; + let tmp = create_temp_dir("test_set_hugetlbv2").expect("create temp directory for test"); + set_fixture(&tmp, page_file_name, "0").expect("Set fixture for 2 MB page size"); + + let hugetlb = LinuxHugepageLimit { + page_size: "2MB".to_owned(), + limit: 16384, + }; + HugeTlb::apply(&tmp, &hugetlb).expect("apply hugetlb"); + let content = read_to_string(tmp.join(page_file_name)).expect("Read hugetlb file content"); + assert_eq!(hugetlb.limit.to_string(), content); + } + + #[test] + fn test_set_hugetlb_with_invalid_page_size() { + let tmp = create_temp_dir("test_set_hugetlbv2_with_invalid_page_size") + .expect("create temp directory for test"); + + let hugetlb = LinuxHugepageLimit { + page_size: "3MB".to_owned(), + limit: 16384, + }; + + let result = HugeTlb::apply(&tmp, &hugetlb); + assert!( + result.is_err(), + "page size that is not a power of two should be an error" + ); + } + + quickcheck! 
{ + fn property_test_set_hugetlb(hugetlb: LinuxHugepageLimit) -> bool { + let page_file_name = format!("hugetlb.{:?}.limit_in_bytes", hugetlb.page_size); + let tmp = create_temp_dir("property_test_set_hugetlbv2").expect("create temp directory for test"); + set_fixture(&tmp, &page_file_name, "0").expect("Set fixture for page size"); + let result = HugeTlb::apply(&tmp, &hugetlb); + + let page_size: String = hugetlb + .page_size + .chars() + .take_while(|c| c.is_digit(10)) + .collect(); + let page_size: u64 = page_size.parse().expect("parse page size"); + + if HugeTlb::is_power_of_two(page_size) && page_size != 1 { + let content = + read_to_string(tmp.join(page_file_name)).expect("Read hugetlb file content"); + hugetlb.limit.to_string() == content + } else { + result.is_err() + } + } + } +} diff --git a/src/cgroups/v2/io.rs b/src/cgroups/v2/io.rs new file mode 100644 index 0000000000..67ef510ab9 --- /dev/null +++ b/src/cgroups/v2/io.rs @@ -0,0 +1,237 @@ +use std::path::{Path, PathBuf}; + +use anyhow::{bail, Result}; + +use crate::cgroups::common; + +use super::controller::Controller; +use oci_spec::{LinuxBlockIo, LinuxResources}; + +const CGROUP_BFQ_IO_WEIGHT: &str = "io.bfq.weight"; +const CGROUP_IO_WEIGHT: &str = "io.weight"; + +pub struct Io {} + +impl Controller for Io { + fn apply(linux_resource: &LinuxResources, cgroup_root: &Path) -> Result<()> { + log::debug!("Apply io cgrup v2 config"); + if let Some(io) = &linux_resource.block_io { + Self::apply(cgroup_root, io)?; + } + Ok(()) + } +} + +impl Io { + fn io_max_path(path: &Path) -> PathBuf { + path.join("io.max") + } + + // linux kernel doc: https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html#io + fn apply(root_path: &Path, blkio: &LinuxBlockIo) -> Result<()> { + for wd in &blkio.blkio_weight_device { + common::write_cgroup_file( + root_path.join(CGROUP_BFQ_IO_WEIGHT), + &format!("{}:{} {}", wd.major, wd.minor, wd.weight.unwrap()), + )?; + } + if let Some(leaf_weight) = blkio.blkio_leaf_weight { + if leaf_weight > 0 { + bail!("cannot set leaf_weight with cgroupv2"); + } + } + if let Some(io_weight) = blkio.blkio_weight { + if io_weight > 0 { + common::write_cgroup_file( + root_path.join(CGROUP_IO_WEIGHT), + format!("{}", io_weight), + )?; + } + } + + for trbd in &blkio.blkio_throttle_read_bps_device { + common::write_cgroup_file( + Self::io_max_path(root_path), + &format!("{}:{} rbps={}", trbd.major, trbd.minor, trbd.rate), + )?; + } + + for twbd in &blkio.blkio_throttle_write_bps_device { + common::write_cgroup_file( + Self::io_max_path(root_path), + format!("{}:{} wbps={}", twbd.major, twbd.minor, twbd.rate), + )?; + } + for trid in &blkio.blkio_throttle_read_iops_device { + common::write_cgroup_file( + Self::io_max_path(root_path), + format!("{}:{} riops={}", trid.major, trid.minor, trid.rate), + )?; + } + for twid in &blkio.blkio_throttle_write_iops_device { + common::write_cgroup_file( + Self::io_max_path(root_path), + format!("{}:{} wiops={}", twid.major, twid.minor, twid.rate), + )?; + } + Ok(()) + } +} +#[cfg(test)] +mod test { + use super::*; + use crate::cgroups::test::setup; + use oci_spec::{LinuxBlockIo, LinuxThrottleDevice, LinuxWeightDevice}; + use std::fs; + struct BlockIoBuilder { + block_io: LinuxBlockIo, + } + impl BlockIoBuilder { + fn new() -> Self { + let block_io = LinuxBlockIo { + blkio_weight: Some(0), + blkio_leaf_weight: Some(0), + blkio_weight_device: vec![], + blkio_throttle_read_bps_device: vec![], + blkio_throttle_write_bps_device: vec![], + blkio_throttle_read_iops_device: vec![], + 
blkio_throttle_write_iops_device: vec![], + }; + + Self { block_io } + } + fn with_write_weight_device(mut self, throttle: Vec) -> Self { + self.block_io.blkio_weight_device = throttle; + self + } + fn with_write_io_weight(mut self, iow: u16) -> Self { + self.block_io.blkio_weight = Some(iow); + self + } + + fn with_read_bps(mut self, throttle: Vec) -> Self { + self.block_io.blkio_throttle_read_bps_device = throttle; + self + } + + fn with_write_bps(mut self, throttle: Vec) -> Self { + self.block_io.blkio_throttle_write_bps_device = throttle; + self + } + + fn with_read_iops(mut self, throttle: Vec) -> Self { + self.block_io.blkio_throttle_read_iops_device = throttle; + self + } + + fn with_write_iops(mut self, throttle: Vec) -> Self { + self.block_io.blkio_throttle_write_iops_device = throttle; + self + } + + fn build(self) -> LinuxBlockIo { + self.block_io + } + } + + #[test] + fn test_set_io_read_bps() { + let (tmp, throttle) = setup("test_set_io_read_bps", "io.max"); + + let blkio = BlockIoBuilder::new() + .with_read_bps(vec![LinuxThrottleDevice { + major: 8, + minor: 0, + rate: 102400, + }]) + .build(); + + Io::apply(&tmp, &blkio).expect("apply blkio"); + let content = fs::read_to_string(throttle).unwrap_or_else(|_| panic!("read rbps content")); + + assert_eq!("8:0 rbps=102400", content); + } + + #[test] + fn test_set_io_write_bps() { + let (tmp, throttle) = setup("test_set_io_write_bps", "io.max"); + + let blkio = BlockIoBuilder::new() + .with_write_bps(vec![LinuxThrottleDevice { + major: 8, + minor: 0, + rate: 102400, + }]) + .build(); + + Io::apply(&tmp, &blkio).expect("apply blkio"); + let content = fs::read_to_string(throttle).unwrap_or_else(|_| panic!("read rbps content")); + + assert_eq!("8:0 wbps=102400", content); + } + + #[test] + fn test_set_io_read_iops() { + let (tmp, throttle) = setup("test_set_io_read_iops", "io.max"); + + let blkio = BlockIoBuilder::new() + .with_read_iops(vec![LinuxThrottleDevice { + major: 8, + minor: 0, + rate: 102400, + }]) + .build(); + + Io::apply(&tmp, &blkio).expect("apply blkio"); + let content = fs::read_to_string(throttle).unwrap_or_else(|_| panic!("read riops content")); + + assert_eq!("8:0 riops=102400", content); + } + + #[test] + fn test_set_io_write_iops() { + let (tmp, throttle) = setup("test_set_io_write_iops", "io.max"); + + let blkio = BlockIoBuilder::new() + .with_write_iops(vec![LinuxThrottleDevice { + major: 8, + minor: 0, + rate: 102400, + }]) + .build(); + + Io::apply(&tmp, &blkio).expect("apply blkio"); + let content = fs::read_to_string(throttle).unwrap_or_else(|_| panic!("read wiops content")); + + assert_eq!("8:0 wiops=102400", content); + } + + #[test] + fn test_set_ioweight_device() { + let (tmp, throttle) = setup("test_set_io_weight_device", CGROUP_BFQ_IO_WEIGHT); + let blkio = BlockIoBuilder::new() + .with_write_weight_device(vec![LinuxWeightDevice { + major: 8, + minor: 0, + weight: Some(80), + leaf_weight: Some(0), + }]) + .build(); + Io::apply(&tmp, &blkio).expect("apply blkio"); + let content = + fs::read_to_string(throttle).unwrap_or_else(|_| panic!("read bfq_io_weight content")); + + assert_eq!("8:0 80", content); + } + + #[test] + fn test_set_ioweight() { + let (tmp, throttle) = setup("test_set_io_weight", CGROUP_IO_WEIGHT); + let blkio = BlockIoBuilder::new().with_write_io_weight(100).build(); + Io::apply(&tmp, &blkio).expect("apply blkio"); + let content = + fs::read_to_string(throttle).unwrap_or_else(|_| panic!("read bfq_io_weight content")); + + assert_eq!("100", content); + } +} diff --git 
a/src/cgroups/v2/manager.rs b/src/cgroups/v2/manager.rs new file mode 100644 index 0000000000..92da1a37f8 --- /dev/null +++ b/src/cgroups/v2/manager.rs @@ -0,0 +1,157 @@ +use std::{ + fs::{self}, + os::unix::fs::PermissionsExt, + path::{Path, PathBuf}, +}; + +use anyhow::{bail, Result}; + +use nix::unistd::Pid; +use oci_spec::{FreezerState, LinuxResources}; + +use super::{ + cpu::Cpu, cpuset::CpuSet, freezer::Freezer, hugetlb::HugeTlb, io::Io, memory::Memory, + pids::Pids, +}; +use crate::{ + cgroups::v2::controller::Controller, + cgroups::{ + common::{self, CgroupManager, CGROUP_PROCS}, + v2::controller_type::ControllerType, + }, + utils::PathBufExt, +}; + +const CGROUP_CONTROLLERS: &str = "cgroup.controllers"; +const CGROUP_SUBTREE_CONTROL: &str = "cgroup.subtree_control"; + +const CONTROLLER_TYPES: &[ControllerType] = &[ + ControllerType::Cpu, + ControllerType::CpuSet, + ControllerType::HugeTlb, + ControllerType::Io, + ControllerType::Memory, + ControllerType::Pids, + ControllerType::Freezer, +]; + +pub struct Manager { + root_path: PathBuf, + cgroup_path: PathBuf, + full_path: PathBuf, +} + +impl Manager { + /// Constructs a new cgroup manager with root path being the mount point + /// of a cgroup v2 fs and cgroup path being a relative path from the root + pub fn new(root_path: PathBuf, cgroup_path: PathBuf) -> Result { + let full_path = root_path.join_absolute_path(&cgroup_path)?; + + Ok(Self { + root_path, + cgroup_path, + full_path, + }) + } + + fn create_unified_cgroup(&self, pid: Pid) -> Result<()> { + let controllers: Vec = self + .get_available_controllers()? + .iter() + .map(|c| format!("{}{}", "+", c.to_string())) + .collect(); + + Self::write_controllers(&self.root_path, &controllers)?; + + let mut current_path = self.root_path.clone(); + let mut components = self.cgroup_path.components().skip(1).peekable(); + while let Some(component) = components.next() { + current_path = current_path.join(component); + if !current_path.exists() { + fs::create_dir(¤t_path)?; + fs::metadata(¤t_path)?.permissions().set_mode(0o755); + } + + // last component cannot have subtree_control enabled due to internal process constraint + // if this were set, writing to the cgroups.procs file will fail with Erno 16 (device or resource busy) + if components.peek().is_some() { + Self::write_controllers(¤t_path, &controllers)?; + } + } + + common::write_cgroup_file(&self.full_path.join(CGROUP_PROCS), pid)?; + Ok(()) + } + + fn get_available_controllers(&self) -> Result> { + let controllers_path = self.root_path.join(CGROUP_CONTROLLERS); + if !controllers_path.exists() { + bail!( + "cannot get available controllers. 
{:?} does not exist", + controllers_path + ) + } + + let mut controllers = Vec::new(); + for controller in fs::read_to_string(&controllers_path)?.split_whitespace() { + match controller { + "cpu" => controllers.push(ControllerType::Cpu), + "cpuset" => controllers.push(ControllerType::CpuSet), + "hugetlb" => controllers.push(ControllerType::HugeTlb), + "io" => controllers.push(ControllerType::Io), + "memory" => controllers.push(ControllerType::Memory), + "pids" => controllers.push(ControllerType::Pids), + "freezer" => controllers.push(ControllerType::Freezer), + tpe => log::warn!("Controller {} is not yet implemented.", tpe), + } + } + + Ok(controllers) + } + + fn write_controllers(path: &Path, controllers: &[String]) -> Result<()> { + for controller in controllers { + common::write_cgroup_file_str(path.join(CGROUP_SUBTREE_CONTROL), controller)?; + } + + Ok(()) + } +} + +impl CgroupManager for Manager { + fn add_task(&self, pid: Pid) -> Result<()> { + self.create_unified_cgroup(pid)?; + Ok(()) + } + + fn apply(&self, linux_resources: &LinuxResources) -> Result<()> { + for controller in CONTROLLER_TYPES { + match controller { + ControllerType::Cpu => Cpu::apply(linux_resources, &self.full_path)?, + ControllerType::CpuSet => CpuSet::apply(linux_resources, &self.full_path)?, + ControllerType::HugeTlb => HugeTlb::apply(linux_resources, &self.full_path)?, + ControllerType::Io => Io::apply(linux_resources, &self.full_path)?, + ControllerType::Memory => Memory::apply(linux_resources, &self.full_path)?, + ControllerType::Pids => Pids::apply(linux_resources, &self.full_path)?, + ControllerType::Freezer => Freezer::apply(linux_resources, &self.full_path)?, + } + } + + Ok(()) + } + + fn remove(&self) -> Result<()> { + log::debug!("remove cgroup {:?}", self.full_path); + fs::remove_dir_all(&self.full_path)?; + + Ok(()) + } + + fn freeze(&self, state: FreezerState) -> Result<()> { + let linux_resources = LinuxResources { + freezer: Some(state), + ..Default::default() + }; + Freezer::apply(&linux_resources, &self.full_path) + } +} diff --git a/src/cgroups/v2/memory.rs b/src/cgroups/v2/memory.rs new file mode 100644 index 0000000000..a83f7391f8 --- /dev/null +++ b/src/cgroups/v2/memory.rs @@ -0,0 +1,24 @@ +use anyhow::Result; +use std::path::Path; + +use oci_spec::{LinuxMemory, LinuxResources}; + +use super::controller::Controller; + +pub struct Memory {} + +impl Controller for Memory { + fn apply(linux_resources: &LinuxResources, cgroup_path: &Path) -> Result<()> { + if let Some(memory) = &linux_resources.memory { + Self::apply(cgroup_path, memory)?; + } + + Ok(()) + } +} + +impl Memory { + fn apply(_: &Path, _: &LinuxMemory) -> Result<()> { + Ok(()) + } +} diff --git a/src/cgroups/v2/mod.rs b/src/cgroups/v2/mod.rs new file mode 100644 index 0000000000..f86f1b8a0a --- /dev/null +++ b/src/cgroups/v2/mod.rs @@ -0,0 +1,13 @@ +mod controller; +mod controller_type; +mod cpu; +mod cpuset; +mod freezer; +mod hugetlb; +mod io; +pub mod manager; +mod memory; +mod pids; +pub mod systemd_manager; +pub mod util; +pub use systemd_manager::SystemDCGroupManager; diff --git a/src/cgroups/v2/pids.rs b/src/cgroups/v2/pids.rs new file mode 100644 index 0000000000..9c7faf171f --- /dev/null +++ b/src/cgroups/v2/pids.rs @@ -0,0 +1,68 @@ +use std::path::Path; + +use anyhow::Result; + +use crate::cgroups::common; + +use super::controller::Controller; +use oci_spec::{LinuxPids, LinuxResources}; + +pub struct Pids {} + +impl Controller for Pids { + fn apply(linux_resource: &LinuxResources, cgroup_root: &std::path::Path) -> 
Result<()> { + log::debug!("Apply pids cgroup v2 config"); + if let Some(pids) = &linux_resource.pids { + Self::apply(cgroup_root, pids)?; + } + Ok(()) + } +} + +impl Pids { + fn apply(root_path: &Path, pids: &LinuxPids) -> Result<()> { + let limit = if pids.limit > 0 { + pids.limit.to_string() + } else { + "max".to_string() + }; + common::write_cgroup_file(&root_path.join("pids.max"), &limit) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::cgroups::test::set_fixture; + use crate::utils::create_temp_dir; + use oci_spec::LinuxPids; + + #[test] + fn test_set_pids() { + let pids_file_name = "pids.max"; + let tmp = create_temp_dir("v2_test_set_pids").expect("create temp directory for test"); + set_fixture(&tmp, pids_file_name, "1000").expect("Set fixture for 1000 pids"); + + let pids = LinuxPids { limit: 1000 }; + + Pids::apply(&tmp, &pids).expect("apply pids"); + let content = + std::fs::read_to_string(tmp.join(pids_file_name)).expect("Read pids contents"); + assert_eq!(pids.limit.to_string(), content); + } + + #[test] + fn test_set_pids_max() { + let pids_file_name = "pids.max"; + let tmp = create_temp_dir("v2_test_set_pids_max").expect("create temp directory for test"); + set_fixture(&tmp, pids_file_name, "0").expect("set fixture for 0 pids"); + + let pids = LinuxPids { limit: 0 }; + + Pids::apply(&tmp, &pids).expect("apply pids"); + + let content = + std::fs::read_to_string(tmp.join(pids_file_name)).expect("Read pids contents"); + assert_eq!("max".to_string(), content); + } +} diff --git a/src/cgroups/v2/systemd_manager.rs b/src/cgroups/v2/systemd_manager.rs new file mode 100644 index 0000000000..998424a3c7 --- /dev/null +++ b/src/cgroups/v2/systemd_manager.rs @@ -0,0 +1,316 @@ +use std::{ + fs::{self}, + os::unix::fs::PermissionsExt, +}; + +use anyhow::{anyhow, bail, Result}; +use nix::unistd::Pid; +use oci_spec::{FreezerState, LinuxResources}; +use std::path::{Path, PathBuf}; + +use super::{ + cpu::Cpu, cpuset::CpuSet, freezer::Freezer, hugetlb::HugeTlb, io::Io, memory::Memory, + pids::Pids, +}; +use crate::cgroups::common; +use crate::cgroups::common::CgroupManager; +use crate::cgroups::v2::controller::Controller; +use crate::cgroups::v2::controller_type::ControllerType; +use crate::utils::PathBufExt; + +const CGROUP_PROCS: &str = "cgroup.procs"; +const CGROUP_CONTROLLERS: &str = "cgroup.controllers"; +const CGROUP_SUBTREE_CONTROL: &str = "cgroup.subtree_control"; + +// v2 systemd only supports cpu, io, memory and pids. +const CONTROLLER_TYPES: &[ControllerType] = &[ + ControllerType::Cpu, + ControllerType::Io, + ControllerType::Memory, + ControllerType::Pids, +]; + +/// SystemDCGroupManager is a driver for managing cgroups via systemd. +pub struct SystemDCGroupManager { + root_path: PathBuf, + cgroups_path: PathBuf, + full_path: PathBuf, +} + +/// Represents the systemd cgroups path: +/// It should be of the form [slice]:[scope_prefix]:[name]. +/// The slice is the "parent" and should be expanded properly, +/// see expand_slice below. +struct CgroupsPath { + parent: String, + scope: String, + name: String, +} + +impl SystemDCGroupManager { + pub fn new(root_path: PathBuf, cgroups_path: PathBuf) -> Result { + // TODO: create the systemd unit using a dbus client. 
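// Higher-level runtimes usually pass cgroupsPath as "[slice]:[prefix]:[name]",
// e.g. "system.slice:docker:1234", which ends up as the unit path
// "/system.slice/docker-1234.scope"; the default "/youki/<id>" becomes
// "/machine.slice/youki-<id>.scope". A minimal sketch of that split
// (hypothetical helper, not the exact parser used below):
fn split_cgroups_path(raw: &str) -> (&str, &str, &str) {
    let mut parts = raw.splitn(3, ':');
    let slice = parts.next().unwrap_or(""); // parent slice
    let prefix = parts.next().unwrap_or(""); // scope prefix
    let name = parts.next().unwrap_or(""); // container name
    (slice, prefix, name)
}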
+ let destructured_path = Self::destructure_cgroups_path(cgroups_path)?; + let cgroups_path = Self::construct_cgroups_path(destructured_path)?; + let full_path = root_path.join_absolute_path(&cgroups_path)?; + + Ok(SystemDCGroupManager { + root_path, + cgroups_path, + full_path, + }) + } + + fn destructure_cgroups_path(cgroups_path: PathBuf) -> Result { + // cgroups path may never be empty as it is defaulted to `/youki` + // see 'get_cgroup_path' under utils.rs. + // if cgroups_path was provided it should be of the form [slice]:[scope_prefix]:[name], + // for example: "system.slice:docker:1234". + let mut parent = ""; + let scope; + let name; + if cgroups_path.starts_with("/youki") { + scope = "youki"; + name = cgroups_path + .strip_prefix("/youki/")? + .to_str() + .ok_or_else(|| anyhow!("Failed to parse cgroupsPath field."))?; + } else { + let parts = cgroups_path + .to_str() + .ok_or_else(|| anyhow!("Failed to parse cgroupsPath field."))? + .split(':') + .collect::>(); + parent = parts[0]; + scope = parts[1]; + name = parts[2]; + } + + Ok(CgroupsPath { + parent: parent.to_owned(), + scope: scope.to_owned(), + name: name.to_owned(), + }) + } + + /// get_unit_name returns the unit (scope) name from the path provided by the user + /// for example: foo:docker:bar returns in '/docker-bar.scope' + fn get_unit_name(cgroups_path: CgroupsPath) -> String { + // By default we create a scope unless specified explicitly. + if !cgroups_path.name.ends_with(".slice") { + return format!("{}-{}.scope", cgroups_path.scope, cgroups_path.name); + } + cgroups_path.name + } + + // systemd represents slice hierarchy using `-`, so we need to follow suit when + // generating the path of slice. For example, 'test-a-b.slice' becomes + // '/test.slice/test-a.slice/test-a-b.slice'. + fn expand_slice(slice: &str) -> Result { + let suffix = ".slice"; + if slice.len() <= suffix.len() || !slice.ends_with(suffix) { + bail!("invalid slice name: {}", slice); + } + if slice.contains('/') { + bail!("invalid slice name: {}", slice); + } + let mut path = "".to_owned(); + let mut prefix = "".to_owned(); + let slice_name = slice.trim_end_matches(suffix); + // if input was -.slice, we should just return root now + if slice_name == "-" { + return Ok(Path::new("/").to_path_buf()); + } + for component in slice_name.split('-') { + if component.is_empty() { + anyhow!("Invalid slice name: {}", slice); + } + // Append the component to the path and to the prefix. + path = format!("{}/{}{}{}", path, prefix, component, suffix); + prefix = format!("{}{}-", prefix, component); + } + Ok(Path::new(&path).to_path_buf()) + } + + // get_cgroups_path generates a cgroups path from the one provided by the user via cgroupsPath. + // an example of the final path: "/machine.slice/docker-foo.scope" + fn construct_cgroups_path(cgroups_path: CgroupsPath) -> Result { + // the root slice is under 'machine.slice'. + let mut slice = Path::new("/machine.slice").to_path_buf(); + // if the user provided a '.slice' (as in a branch of a tree) + // we need to "unpack it". + if !cgroups_path.parent.is_empty() { + slice = Self::expand_slice(&cgroups_path.parent)?; + } + let unit_name = Self::get_unit_name(cgroups_path); + let cgroups_path = slice.join(unit_name); + Ok(cgroups_path) + } + + /// create_unified_cgroup verifies sure that *each level* in the downward path from the root cgroup + /// down to the cgroup_path provided by the user is a valid cgroup hierarchy, + /// containing the attached controllers and that it contains the container pid. 
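// Before a leaf cgroup can use a controller, every ancestor has to enable it
// for its children through cgroup.subtree_control, e.g. by writing "+cpu",
// "+memory" and so on at each level; the leaf that will hold the container
// process is skipped because of the internal process constraint noted below.
// A minimal sketch of that write (hypothetical helper, std only):
fn enable_controllers(dir: &std::path::Path, controllers: &[&str]) -> std::io::Result<()> {
    use std::io::Write;
    let mut file = std::fs::OpenOptions::new()
        .write(true)
        .open(dir.join("cgroup.subtree_control"))?;
    for controller in controllers {
        // one "+<name>" write per controller, as in the write_controllers helpers
        file.write_all(format!("+{}", controller).as_bytes())?;
    }
    Ok(())
}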
+ fn create_unified_cgroup(&self, pid: Pid) -> Result<()> { + let controllers: Vec = self + .get_available_controllers(&self.root_path)? + .into_iter() + .map(|c| format!("{}{}", "+", c.to_string())) + .collect(); + + // Write the controllers to the root_path. + Self::write_controllers(&self.root_path, &controllers)?; + + let mut current_path = self.root_path.clone(); + let mut components = self.cgroups_path.components().skip(1).peekable(); + // Verify that *each level* in the downward path from the root cgroup + // down to the cgroup_path provided by the user is a valid cgroup hierarchy. + // containing the attached controllers. + while let Some(component) = components.next() { + current_path = current_path.join(component); + if !current_path.exists() { + fs::create_dir(¤t_path)?; + fs::metadata(¤t_path)?.permissions().set_mode(0o755); + } + + // last component cannot have subtree_control enabled due to internal process constraint + // if this were set, writing to the cgroups.procs file will fail with Erno 16 (device or resource busy) + if components.peek().is_some() { + Self::write_controllers(¤t_path, &controllers)?; + } + } + + common::write_cgroup_file(self.full_path.join(CGROUP_PROCS), pid) + } + + fn get_available_controllers>( + &self, + cgroups_path: P, + ) -> Result> { + let controllers_path = self.root_path.join(cgroups_path).join(CGROUP_CONTROLLERS); + if !controllers_path.exists() { + bail!( + "cannot get available controllers. {:?} does not exist", + controllers_path + ) + } + + let mut controllers = Vec::new(); + for controller in fs::read_to_string(&controllers_path)?.split_whitespace() { + match controller { + "cpu" => controllers.push(ControllerType::Cpu), + "io" => controllers.push(ControllerType::Io), + "memory" => controllers.push(ControllerType::Memory), + "pids" => controllers.push(ControllerType::Pids), + _ => continue, + } + } + + Ok(controllers) + } + + fn write_controllers(path: &Path, controllers: &[String]) -> Result<()> { + for controller in controllers { + common::write_cgroup_file_str(path.join(CGROUP_SUBTREE_CONTROL), controller)?; + } + + Ok(()) + } +} + +impl CgroupManager for SystemDCGroupManager { + fn add_task(&self, pid: Pid) -> Result<()> { + // Dont attach any pid to the cgroup if -1 is specified as a pid + if pid.as_raw() == -1 { + return Ok(()); + } + + self.create_unified_cgroup(pid)?; + Ok(()) + } + + fn apply(&self, linux_resources: &LinuxResources) -> Result<()> { + for controller in CONTROLLER_TYPES { + match controller { + ControllerType::Cpu => Cpu::apply(linux_resources, &self.full_path)?, + ControllerType::CpuSet => CpuSet::apply(linux_resources, &self.full_path)?, + ControllerType::HugeTlb => HugeTlb::apply(linux_resources, &self.full_path)?, + ControllerType::Io => Io::apply(linux_resources, &self.full_path)?, + ControllerType::Memory => Memory::apply(linux_resources, &self.full_path)?, + ControllerType::Pids => Pids::apply(linux_resources, &self.full_path)?, + ControllerType::Freezer => Freezer::apply(linux_resources, &self.full_path)?, + } + } + + Ok(()) + } + + fn remove(&self) -> Result<()> { + Ok(()) + } + + fn freeze(&self, state: FreezerState) -> Result<()> { + let linux_resources = LinuxResources { + freezer: Some(state), + ..Default::default() + }; + Freezer::apply(&linux_resources, &self.full_path) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn expand_slice_works() -> Result<()> { + assert_eq!( + SystemDCGroupManager::expand_slice("test-a-b.slice")?, + 
PathBuf::from("/test.slice/test-a.slice/test-a-b.slice"), + ); + + Ok(()) + } + + #[test] + fn get_cgroups_path_works_with_a_complex_slice() -> Result<()> { + let cgroups_path = SystemDCGroupManager::destructure_cgroups_path(PathBuf::from( + "test-a-b.slice:docker:foo", + )) + .expect(""); + + assert_eq!( + SystemDCGroupManager::construct_cgroups_path(cgroups_path)?, + PathBuf::from("/test.slice/test-a.slice/test-a-b.slice/docker-foo.scope"), + ); + + Ok(()) + } + + #[test] + fn get_cgroups_path_works_with_a_simple_slice() -> Result<()> { + let cgroups_path = SystemDCGroupManager::destructure_cgroups_path(PathBuf::from( + "machine.slice:libpod:foo", + )) + .expect(""); + + assert_eq!( + SystemDCGroupManager::construct_cgroups_path(cgroups_path)?, + PathBuf::from("/machine.slice/libpod-foo.scope"), + ); + + Ok(()) + } + + #[test] + fn get_cgroups_path_works_with_scope() -> Result<()> { + let cgroups_path = + SystemDCGroupManager::destructure_cgroups_path(PathBuf::from(":docker:foo")).expect(""); + + assert_eq!( + SystemDCGroupManager::construct_cgroups_path(cgroups_path)?, + PathBuf::from("/machine.slice/docker-foo.scope"), + ); + + Ok(()) + } +} diff --git a/src/cgroups/v2/util.rs b/src/cgroups/v2/util.rs new file mode 100644 index 0000000000..e8712b519d --- /dev/null +++ b/src/cgroups/v2/util.rs @@ -0,0 +1,13 @@ +use std::path::PathBuf; + +use anyhow::{anyhow, Result}; +use procfs::process::Process; + +pub fn get_unified_mount_point() -> Result { + Process::myself()? + .mountinfo()? + .into_iter() + .find(|m| m.fs_type == "cgroup2") + .map(|m| m.mount_point) + .ok_or_else(|| anyhow!("could not find mountpoint for unified")) +} diff --git a/src/command/linux.rs b/src/command/linux.rs index 0f570e33ec..cc055c1724 100644 --- a/src/command/linux.rs +++ b/src/command/linux.rs @@ -1,7 +1,12 @@ -use std::{any::Any, path::Path}; +//! Implements Command trait for Linux systems +use std::ffi::{CStr, OsStr}; +use std::os::unix::ffi::OsStrExt; +use std::sync::Arc; +use std::{any::Any, mem, path::Path, ptr}; use anyhow::{bail, Result}; -use caps::{errors::CapsError, CapSet, CapsHashSet}; +use caps::{errors::CapsError, CapSet, Capability, CapsHashSet}; +use libc::{c_char, uid_t}; use nix::{ errno::Errno, unistd::{fchdir, pivot_root, sethostname}, @@ -19,39 +24,78 @@ use nix::{sched::unshare, sys::stat::Mode}; use oci_spec::LinuxRlimit; -use super::Command; +use super::Syscall; use crate::capabilities; +/// Empty structure to implement Command trait for #[derive(Clone)] -pub struct LinuxCommand; +pub struct LinuxSyscall; -impl Command for LinuxCommand { +impl LinuxSyscall { + unsafe fn from_raw_buf<'a, T>(p: *const c_char) -> T + where + T: From<&'a OsStr>, + { + T::from(OsStr::from_bytes(CStr::from_ptr(p).to_bytes())) + } + + /// Reads data from the `c_passwd` and returns it as a `User`. 
+ unsafe fn passwd_to_user(passwd: libc::passwd) -> Arc { + let name: Arc = Self::from_raw_buf(passwd.pw_name); + name + } +} + +impl Syscall for LinuxSyscall { + /// To enable dynamic typing, + /// see https://doc.rust-lang.org/std/any/index.html for more information fn as_any(&self) -> &dyn Any { self } + /// Function to set given path as root path inside process fn pivot_rootfs(&self, path: &Path) -> Result<()> { + // open the path as directory and read only let newroot = open(path, OFlag::O_DIRECTORY | OFlag::O_RDONLY, Mode::empty())?; + // make the given path as the root directory for the container + // see https://man7.org/linux/man-pages/man2/pivot_root.2.html, specially the notes + // pivot root usually changes the root directory to first argument, and then mounts the original root + // directory at second argument. Giving same path for both stacks mapping of the original root directory + // above the new directory at the same path, then the call to umount unmounts the original root directory from + // this path. This is done, as otherwise, we will need to create a separate temporary directory under the new root path + // so we can move the original root there, and then unmount that. This way saves the creation of the temporary + // directory to put original root directory. pivot_root(path, path)?; + // Unmount the original root directory which was stacked on top of new root directory + // MNT_DETACH makes the mount point unavailable to new accesses, but waits till the original mount point + // to be free of activity to actually unmount + // see https://man7.org/linux/man-pages/man2/umount2.2.html for more information umount2("/", MntFlags::MNT_DETACH)?; + // Change directory to root fchdir(newroot)?; Ok(()) } + /// Set namespace for process fn set_ns(&self, rawfd: i32, nstype: CloneFlags) -> Result<()> { nix::sched::setns(rawfd, nstype)?; Ok(()) } + /// set uid and gid for process fn set_id(&self, uid: Uid, gid: Gid) -> Result<()> { if let Err(e) = prctl::set_keep_capabilities(true) { bail!("set keep capabilities returned {}", e); }; + // args : real *id, effective *id, saved set *id respectively unistd::setresgid(gid, gid, gid)?; unistd::setresuid(uid, uid, uid)?; + // if not the root user, reset capabilities to effective capabilities, + // which are used by kernel to perform checks + // see https://man7.org/linux/man-pages/man7/capabilities.7.html for more information if uid != Uid::from_raw(0) { capabilities::reset_effective(self)?; } @@ -61,15 +105,43 @@ impl Command for LinuxCommand { Ok(()) } + /// Disassociate parts of execution context + // see https://man7.org/linux/man-pages/man2/unshare.2.html for more information fn unshare(&self, flags: CloneFlags) -> Result<()> { unshare(flags)?; Ok(()) } + /// Set capabilities for container process fn set_capability(&self, cset: CapSet, value: &CapsHashSet) -> Result<(), CapsError> { - caps::set(None, cset, value) + match cset { + // caps::set cannot set capabilities in bounding set, + // so we do it differently + CapSet::Bounding => { + // get all capabilities + let all = caps::all(); + // the difference will give capabilities + // which are to be unset + // for each such =, drop that capability + // after this, only those which are to be set will remain set + for c in all.difference(value) { + match c { + Capability::CAP_PERFMON + | Capability::CAP_CHECKPOINT_RESTORE + | Capability::CAP_BPF => { + log::warn!("{:?} is not supported.", c); + continue; + } + _ => caps::drop(None, CapSet::Bounding, *c)?, + } + } + Ok(()) + } + _ => 
caps::set(None, cset, value), + } } + /// Sets hostname for process fn set_hostname(&self, hostname: &str) -> Result<()> { if let Err(e) = sethostname(hostname) { bail!("Failed to set {} as hostname. {:?}", hostname, e) @@ -77,6 +149,7 @@ impl Command for LinuxCommand { Ok(()) } + /// Sets resource limit for process fn set_rlimit(&self, rlimit: &LinuxRlimit) -> Result<()> { let rlim = &libc::rlimit { rlim_cur: rlimit.soft, @@ -88,4 +161,38 @@ impl Command for LinuxCommand { } Ok(()) } + + // taken from https://crates.io/crates/users + fn get_pwuid(&self, uid: uid_t) -> Option> { + let mut passwd = unsafe { mem::zeroed::() }; + let mut buf = vec![0; 2048]; + let mut result = ptr::null_mut::(); + + loop { + let r = unsafe { + libc::getpwuid_r(uid, &mut passwd, buf.as_mut_ptr(), buf.len(), &mut result) + }; + + if r != libc::ERANGE { + break; + } + + let newsize = buf.len().checked_mul(2)?; + buf.resize(newsize, 0); + } + + if result.is_null() { + // There is no such user, or an error has occurred. + // errno gets set if there’s an error. + return None; + } + + if result != &mut passwd { + // The result of getpwuid_r should be its input passwd. + return None; + } + + let user = unsafe { Self::passwd_to_user(result.read()) }; + Some(user) + } } diff --git a/src/command/mod.rs b/src/command/mod.rs index cf06f9de63..543997e8ec 100644 --- a/src/command/mod.rs +++ b/src/command/mod.rs @@ -1,6 +1,10 @@ -#[allow(clippy::module_inception)] -mod command; +//! Contains a wrapper of syscalls for unit tests +//! This provides a uniform interface for rest of Youki +//! to call syscalls required for container management + pub mod linux; +#[allow(clippy::module_inception)] +pub mod syscall; pub mod test; -pub use command::Command; +pub use syscall::Syscall; diff --git a/src/command/command.rs b/src/command/syscall.rs similarity index 51% rename from src/command/command.rs rename to src/command/syscall.rs index 517ed6aee1..f3bba8727b 100644 --- a/src/command/command.rs +++ b/src/command/syscall.rs @@ -1,4 +1,7 @@ -use std::{any::Any, path::Path}; +//! An interface trait so that rest of Youki can call +//! necessary functions without having to worry about their +//! 
implementation details +use std::{any::Any, ffi::OsStr, path::Path, sync::Arc}; use anyhow::Result; use caps::{errors::CapsError, CapSet, CapsHashSet}; @@ -9,7 +12,11 @@ use nix::{ use oci_spec::LinuxRlimit; -pub trait Command { +use crate::command::{linux::LinuxSyscall, test::TestHelperSyscall}; + +/// This specifies various kernel/other functionalities required for +/// container management +pub trait Syscall { fn as_any(&self) -> &dyn Any; fn pivot_rootfs(&self, path: &Path) -> Result<()>; fn set_ns(&self, rawfd: i32, nstype: CloneFlags) -> Result<()>; @@ -18,4 +25,13 @@ pub trait Command { fn set_capability(&self, cset: CapSet, value: &CapsHashSet) -> Result<(), CapsError>; fn set_hostname(&self, hostname: &str) -> Result<()>; fn set_rlimit(&self, rlimit: &LinuxRlimit) -> Result<()>; + fn get_pwuid(&self, uid: u32) -> Option>; +} + +pub fn create_syscall() -> Box { + if cfg!(test) { + Box::new(TestHelperSyscall::default()) + } else { + Box::new(LinuxSyscall) + } } diff --git a/src/command/test.rs b/src/command/test.rs index fe5540d14b..eaa7c3ac00 100644 --- a/src/command/test.rs +++ b/src/command/test.rs @@ -1,21 +1,21 @@ -use std::{any::Any, cell::RefCell}; +use std::{any::Any, cell::RefCell, ffi::OsStr, sync::Arc}; use caps::{errors::CapsError, CapSet, CapsHashSet}; use nix::sched::CloneFlags; use oci_spec::LinuxRlimit; -use super::Command; +use super::Syscall; #[derive(Clone)] -pub struct TestHelperCommand { +pub struct TestHelperSyscall { set_ns_args: RefCell>, unshare_args: RefCell>, set_capability_args: RefCell>, } -impl Default for TestHelperCommand { +impl Default for TestHelperSyscall { fn default() -> Self { - TestHelperCommand { + TestHelperSyscall { set_ns_args: RefCell::new(vec![]), unshare_args: RefCell::new(vec![]), set_capability_args: RefCell::new(vec![]), @@ -23,7 +23,7 @@ impl Default for TestHelperCommand { } } -impl Command for TestHelperCommand { +impl Syscall for TestHelperSyscall { fn as_any(&self) -> &dyn Any { self } @@ -60,9 +60,13 @@ impl Command for TestHelperCommand { fn set_rlimit(&self, _rlimit: &LinuxRlimit) -> anyhow::Result<()> { todo!() } + + fn get_pwuid(&self, _: u32) -> Option> { + todo!() + } } -impl TestHelperCommand { +impl TestHelperSyscall { pub fn get_setns_args(&self) -> Vec<(i32, CloneFlags)> { self.set_ns_args.borrow_mut().clone() } diff --git a/src/container/builder.rs b/src/container/builder.rs new file mode 100644 index 0000000000..70838572e5 --- /dev/null +++ b/src/container/builder.rs @@ -0,0 +1,132 @@ +use crate::command::linux::LinuxSyscall; +use std::path::PathBuf; + +use super::{init_builder::InitContainerBuilder, tenant_builder::TenantContainerBuilder}; +pub struct ContainerBuilder { + /// Id of the container + pub(super) container_id: String, + /// Root directory for container state + pub(super) root_path: PathBuf, + /// Interface to operating system primitives + pub(super) syscall: LinuxSyscall, + /// File which will be used to communicate the pid of the + /// container process to the higher level runtime + pub(super) pid_file: Option, + /// Socket to communicate the file descriptor of the ptty + pub(super) console_socket: Option, +} + +/// Builder that can be used to configure the common properties of +/// either a init or a tenant container +/// +/// # Example +/// +/// ```no_run +/// use youki::container::builder::ContainerBuilder; +/// +/// ContainerBuilder::new("74f1a4cb3801".to_owned()) +/// .with_root_path("/run/containers/youki") +/// .with_pid_file("/var/run/docker.pid") +/// 
.with_console_socket("/var/run/docker/sock.tty") +/// .as_init("/var/run/docker/bundle") +/// .build(); +/// ``` +impl ContainerBuilder { + /// Generates the base configuration for a container which can be + /// transformed into either a init container or a tenant container + /// + /// # Example + /// + /// ```no_run + /// use youki::container::builder::ContainerBuilder; + /// + /// let builder = ContainerBuilder::new("74f1a4cb3801".to_owned()); + /// ``` + pub fn new(container_id: String) -> Self { + let root_path = PathBuf::from("/run/youki"); + + Self { + container_id, + root_path, + syscall: LinuxSyscall, + pid_file: None, + console_socket: None, + } + } + + /// Transforms this builder into a tenant builder + /// # Example + /// + /// ```no_run + /// # use youki::container::builder::ContainerBuilder; + /// + /// ContainerBuilder::new("74f1a4cb3801".to_owned()) + /// .as_tenant() + /// .with_container_args(vec!["sleep".to_owned(), "9001".to_owned()]) + /// .build(); + /// ``` + #[allow(clippy::wrong_self_convention)] + pub fn as_tenant(self) -> TenantContainerBuilder { + TenantContainerBuilder::new(self) + } + + /// Transforms this builder into an init builder + /// # Example + /// + /// ```no_run + /// # use youki::container::builder::ContainerBuilder; + /// + /// ContainerBuilder::new("74f1a4cb3801".to_owned()) + /// .as_init("/var/run/docker/bundle") + /// .with_systemd(false) + /// .build(); + /// ``` + #[allow(clippy::wrong_self_convention)] + pub fn as_init>(self, bundle: P) -> InitContainerBuilder { + InitContainerBuilder::new(self, bundle.into()) + } + + /// Sets the root path which will be used to store the container state + /// # Example + /// + /// ```no_run + /// # use youki::container::builder::ContainerBuilder; + /// + /// ContainerBuilder::new("74f1a4cb3801".to_owned()) + /// .with_root_path("/run/containers/youki"); + /// ``` + pub fn with_root_path>(mut self, path: P) -> Self { + self.root_path = path.into(); + self + } + + /// Sets the pid file which will be used to write the pid of the container + /// process + /// # Example + /// + /// ```no_run + /// # use youki::container::builder::ContainerBuilder; + /// + /// ContainerBuilder::new("74f1a4cb3801".to_owned()) + /// .with_pid_file("/var/run/docker.pid"); + /// ``` + pub fn with_pid_file>(mut self, path: P) -> Self { + self.pid_file = Some(path.into()); + self + } + + /// Sets the console socket, which will be used to send the file descriptor + /// of the pseudoterminal + /// # Example + /// + /// ```no_run + /// # use youki::container::builder::ContainerBuilder; + /// + /// ContainerBuilder::new("74f1a4cb3801".to_owned()) + /// .with_console_socket("/var/run/docker/sock.tty"); + /// ``` + pub fn with_console_socket>(mut self, path: P) -> Self { + self.console_socket = Some(path.into()); + self + } +} diff --git a/src/container/builder_impl.rs b/src/container/builder_impl.rs new file mode 100644 index 0000000000..d93a14ed8a --- /dev/null +++ b/src/container/builder_impl.rs @@ -0,0 +1,145 @@ +use std::path::PathBuf; + +use anyhow::{Context, Result}; +use nix::{ + sched, + unistd::{Gid, Uid}, +}; +use oci_spec::Spec; + +use crate::{ + cgroups, + command::{linux::LinuxSyscall, Syscall}, + namespaces::Namespaces, + notify_socket::NotifyListener, + process::{fork, setup_init_process, Process}, + rootless::Rootless, + stdio::FileDescriptor, + tty, utils, +}; + +use super::{Container, ContainerStatus}; + +pub(super) struct ContainerBuilderImpl { + /// Flag indicating if an init or a tenant container should be created + pub 
init: bool, + /// Interface to operating system primitives + pub syscall: LinuxSyscall, + /// Flag indicating if systemd should be used for cgroup management + pub use_systemd: bool, + /// Id of the container + pub container_id: String, + /// Directory where the state of the container will be stored + pub container_dir: PathBuf, + /// OCI complient runtime spec + pub spec: Spec, + /// Root filesystem of the container + pub rootfs: PathBuf, + /// File which will be used to communicate the pid of the + /// container process to the higher level runtime + pub pid_file: Option, + /// Socket to communicate the file descriptor of the ptty + pub console_socket: Option, + /// Options for rootless containers + pub rootless: Option, + /// Socket to communicate container start + pub notify_socket: NotifyListener, + /// Container state + pub container: Option, +} + +impl ContainerBuilderImpl { + pub(super) fn create(&mut self) -> Result<()> { + if let Process::Parent(_) = self.run_container()? { + if self.init { + std::process::exit(0); + } + } + + Ok(()) + } + + fn run_container(&mut self) -> Result { + prctl::set_dumpable(false).unwrap(); + + let linux = self.spec.linux.as_ref().unwrap(); + let namespaces: Namespaces = linux.namespaces.clone().into(); + + let cgroups_path = utils::get_cgroup_path(&linux.cgroups_path, &self.container_id); + let cmanager = cgroups::common::create_cgroup_manager(&cgroups_path, self.use_systemd)?; + + // first fork, which creates process, which will later create actual container process + match fork::fork_first( + self.init, + &self.pid_file, + &self.rootless, + linux, + self.container.as_ref(), + cmanager, + )? { + // In the parent process, which called run_container + Process::Parent(parent) => Ok(Process::Parent(parent)), + // in child process + Process::Child(child) => { + // set limits and namespaces to the process + for rlimit in self.spec.process.rlimits.iter() { + self.syscall + .set_rlimit(rlimit) + .context("failed to set rlimit")?; + } + self.syscall + .set_id(Uid::from_raw(0), Gid::from_raw(0)) + .context("failed to become root")?; + + let without = sched::CloneFlags::CLONE_NEWUSER; + namespaces + .apply_unshare(without) + .context("could not unshare namespaces")?; + + // set up tty if specified + if let Some(csocketfd) = &self.console_socket { + tty::setup_console(csocketfd)?; + } + + // set namespaces + namespaces.apply_setns()?; + + // fork second time, which will later create container + match fork::fork_init(child)? { + Process::Child(_child) => unreachable!(), + // This is actually the child process after fork + Process::Init(mut init) => { + // prepare process + if self.init { + setup_init_process( + &self.spec, + &self.syscall, + self.rootfs.clone(), + &namespaces, + )?; + } + + init.ready()?; + self.notify_socket.wait_for_container_start()?; + // actually run the command / program to be run in container + let args: &Vec = &self.spec.process.args; + let envs: &Vec = &self.spec.process.env; + utils::do_exec(&args[0], args, envs)?; + + if let Some(container) = &self.container { + // the command / program is done executing + container + .refresh_state()? 
+ .update_status(ContainerStatus::Stopped) + .save()?; + } + + Ok(Process::Init(init)) + } + Process::Parent(_) => unreachable!(), + } + } + _ => unreachable!(), + } + } +} diff --git a/src/container/container.rs b/src/container/container.rs index 2fbce6198a..72bbe12bf4 100644 --- a/src/container/container.rs +++ b/src/container/container.rs @@ -1,15 +1,25 @@ +use std::ffi::OsString; use std::fs; use std::path::{Path, PathBuf}; use anyhow::Result; +use chrono::DateTime; use nix::unistd::Pid; + +use chrono::Utc; +use oci_spec::Spec; use procfs::process::Process; +use crate::command::syscall::create_syscall; + use crate::container::{ContainerStatus, State}; +/// Structure representing the container data #[derive(Debug)] pub struct Container { + // State of the container pub state: State, + // indicated the directory for the root path in the container pub root: PathBuf, } @@ -36,16 +46,20 @@ impl Container { pub fn status(&self) -> ContainerStatus { self.state.status } - - pub fn refresh_status(&self) -> Result { + pub fn refresh_status(&mut self) -> Result { let new_status = match self.pid() { Some(pid) => { + // Note that Process::new does not spawn a new process + // but instead creates a new Process structure, and fill + // it with information about the process with given pid if let Ok(proc) = Process::new(pid.as_raw()) { use procfs::process::ProcState; match proc.stat.state().unwrap() { ProcState::Zombie | ProcState::Dead => ContainerStatus::Stopped, _ => match self.status() { - ContainerStatus::Creating | ContainerStatus::Created => self.status(), + ContainerStatus::Creating + | ContainerStatus::Created + | ContainerStatus::Paused => self.status(), _ => ContainerStatus::Running, }, } @@ -55,11 +69,19 @@ impl Container { } None => ContainerStatus::Stopped, }; - self.update_status(new_status) + Ok(self.update_status(new_status)) + } + + pub fn refresh_state(&self) -> Result { + let state = State::load(&self.root)?; + Ok(Self { + state, + root: self.root.clone(), + }) } pub fn save(&self) -> Result<()> { - log::debug!("Sava container status: {:?} in {:?}", self, self.root); + log::debug!("Save container status: {:?} in {:?}", self, self.root); self.state.save(&self.root) } @@ -75,29 +97,80 @@ impl Container { self.state.status.can_delete() } + pub fn can_exec(&self) -> bool { + self.state.status == ContainerStatus::Running + } + + pub fn can_pause(&self) -> bool { + self.state.status.can_pause() + } + + pub fn can_resume(&self) -> bool { + self.state.status.can_resume() + } + pub fn pid(&self) -> Option { self.state.pid.map(Pid::from_raw) } pub fn set_pid(&self, pid: i32) -> Self { - Self::new( - self.state.id.as_str(), - self.state.status, - Some(pid), - self.state.bundle.as_str(), - &self.root, - ) - .expect("unexpected error") - } - - pub fn update_status(&self, status: ContainerStatus) -> Result { - Self::new( - self.state.id.as_str(), - status, - self.state.pid, - self.state.bundle.as_str(), - &self.root, - ) + let mut new_state = self.state.clone(); + new_state.pid = Some(pid); + + Self { + state: new_state, + root: self.root.clone(), + } + } + + pub fn created(&self) -> Option> { + self.state.created + } + + pub fn set_creator(mut self, uid: u32) -> Self { + self.state.creator = Some(uid); + self + } + + pub fn creator(&self) -> Option { + if let Some(uid) = self.state.creator { + let command = create_syscall(); + let user_name = command.get_pwuid(uid); + if let Some(user_name) = user_name { + return Some((&*user_name).to_owned()); + } + } + + None + } + + pub fn bundle(&self) -> 
String { + self.state.bundle.clone() + } + + pub fn set_systemd(mut self, should_use: bool) -> Self { + self.state.use_systemd = Some(should_use); + self + } + + pub fn systemd(&self) -> Option { + self.state.use_systemd + } + + pub fn update_status(&self, status: ContainerStatus) -> Self { + let created = match (status, self.state.created) { + (ContainerStatus::Created, None) => Some(Utc::now()), + _ => self.state.created, + }; + + let mut new_state = self.state.clone(); + new_state.created = created; + new_state.status = status; + + Self { + state: new_state, + root: self.root.clone(), + } } pub fn load(container_root: PathBuf) -> Result { @@ -107,4 +180,8 @@ impl Container { root: container_root, }) } + + pub fn spec(&self) -> Result { + Spec::load(self.root.join("config.json")) + } } diff --git a/src/container/init_builder.rs b/src/container/init_builder.rs new file mode 100644 index 0000000000..a6d1089ba0 --- /dev/null +++ b/src/container/init_builder.rs @@ -0,0 +1,129 @@ +use anyhow::{bail, Context, Result}; +use nix::unistd; +use oci_spec::Spec; +use rootless::detect_rootless; +use std::{ + fs, + path::{Path, PathBuf}, +}; + +use crate::{ + notify_socket::{NotifyListener, NOTIFY_FILE}, + rootless, tty, utils, +}; + +use super::{ + builder::ContainerBuilder, builder_impl::ContainerBuilderImpl, Container, ContainerStatus, +}; + +// Builder that can be used to configure the properties of a new container +pub struct InitContainerBuilder { + base: ContainerBuilder, + bundle: PathBuf, + use_systemd: bool, +} + +impl InitContainerBuilder { + /// Generates the base configuration for a new container from which + /// configuration methods can be chained + pub(super) fn new(builder: ContainerBuilder, bundle: PathBuf) -> Self { + Self { + base: builder, + bundle, + use_systemd: true, + } + } + + /// Sets if systemd should be used for managing cgroups + pub fn with_systemd(mut self, should_use: bool) -> Self { + self.use_systemd = should_use; + self + } + + /// Creates a new container + pub fn build(self) -> Result<()> { + let container_dir = self.create_container_dir()?; + let spec = self.load_and_safeguard_spec(&container_dir)?; + + unistd::chdir(&*container_dir)?; + let container_state = self + .create_container_state(&container_dir)? + .set_systemd(self.use_systemd); + + let notify_socket: NotifyListener = NotifyListener::new(NOTIFY_FILE)?; + // convert path of root file system of the container to absolute path + let rootfs = fs::canonicalize(&spec.root.path)?; + + // if socket file path is given in commandline options, + // get file descriptors of console socket + let csocketfd = if let Some(console_socket) = &self.base.console_socket { + Some(tty::setup_console_socket( + &container_dir, + console_socket, + "console-socket", + )?) 
+ } else { + None + }; + + let rootless = detect_rootless(&spec)?; + + let mut builder_impl = ContainerBuilderImpl { + init: true, + syscall: self.base.syscall, + container_id: self.base.container_id, + pid_file: self.base.pid_file, + console_socket: csocketfd, + use_systemd: self.use_systemd, + container_dir, + spec, + rootfs, + rootless, + notify_socket, + container: Some(container_state), + }; + + builder_impl.create()?; + Ok(()) + } + + fn create_container_dir(&self) -> Result { + let container_dir = self.base.root_path.join(&self.base.container_id); + log::debug!("container directory will be {:?}", container_dir); + + if container_dir.exists() { + bail!("container {} already exists", self.base.container_id); + } + + utils::create_dir_all(&container_dir)?; + Ok(container_dir) + } + + fn load_and_safeguard_spec(&self, container_dir: &Path) -> Result { + let source_spec_path = self.bundle.join("config.json"); + let target_spec_path = container_dir.join("config.json"); + fs::copy(&source_spec_path, &target_spec_path).with_context(|| { + format!( + "failed to copy {:?} to {:?}", + source_spec_path, target_spec_path + ) + })?; + + let mut spec = oci_spec::Spec::load(&target_spec_path)?; + unistd::chdir(&self.bundle)?; + spec.canonicalize_rootfs()?; + Ok(spec) + } + + fn create_container_state(&self, container_dir: &Path) -> Result { + let container = Container::new( + &self.base.container_id, + ContainerStatus::Creating, + None, + self.bundle.as_path().to_str().unwrap(), + &container_dir, + )?; + container.save()?; + Ok(container) + } +} diff --git a/src/container/mod.rs b/src/container/mod.rs index e0f2d5071b..3c71e82a01 100644 --- a/src/container/mod.rs +++ b/src/container/mod.rs @@ -1,5 +1,15 @@ +//! Container management +/// This crate is responsible for the creation of containers. It provides a builder that can +/// be used to configure and create containers. We distinguish between an init container for which +/// namespaces and cgroups will be created (usually) and a tenant container process that will move +/// into the existing namespaces and cgroups of the initial container process (e.g. used to implement +/// the exec command). +pub mod builder; +mod builder_impl; #[allow(clippy::module_inception)] mod container; +pub mod init_builder; mod state; +pub mod tenant_builder; pub use container::Container; pub use state::{ContainerStatus, State}; diff --git a/src/container/state.rs b/src/container/state.rs index c465ab215e..6b294f5573 100644 --- a/src/container/state.rs +++ b/src/container/state.rs @@ -1,24 +1,29 @@ +//! 
Information about status and state of the container use std::collections::HashMap; +use std::fmt::Display; use std::fs; use std::{fs::File, path::Path}; -use anyhow::Result; +use anyhow::{Context, Result}; +use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; const STATE_FILE_PATH: &str = "state.json"; -#[derive(Serialize, Deserialize, Debug, Copy, Clone)] +/// Indicates status of the container +#[derive(Serialize, Deserialize, Debug, Copy, Clone, PartialEq, Eq)] #[serde(rename_all = "camelCase")] pub enum ContainerStatus { - // StateCreating indicates that the container is being created + // The container is being created Creating, - // StateCreated indicates that the runtime has finished the create operation + // The runtime has finished the create operation Created, - // StateRunning indicates that the container process has executed the - // user-specified program but has not exited + // The container process has executed the user-specified program but has not exited Running, - // StateStopped indicates that the container process has exited + // The container process has exited Stopped, + // The container process has paused + Paused, } impl ContainerStatus { @@ -30,15 +35,38 @@ impl ContainerStatus { use ContainerStatus::*; match self { Creating | Stopped => false, - Created | Running => true, + Created | Running | Paused => true, } } pub fn can_delete(&self) -> bool { matches!(self, ContainerStatus::Stopped) } + + pub fn can_pause(&self) -> bool { + matches!(self, ContainerStatus::Running) + } + + pub fn can_resume(&self) -> bool { + matches!(self, ContainerStatus::Paused) + } +} + +impl Display for ContainerStatus { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let print = match *self { + Self::Creating => "Creating", + Self::Created => "Created", + Self::Running => "Running", + Self::Stopped => "Stopped", + Self::Paused => "Paused", + }; + + write!(f, "{}", print) + } } +/// Stores the state information of the container #[derive(Serialize, Deserialize, Debug, Clone)] #[serde(rename_all = "camelCase")] pub struct State { @@ -55,6 +83,14 @@ pub struct State { pub bundle: String, // Annotations are key values associated with the container. 
pub annotations: HashMap, + // Creation time of the container + #[serde(skip_serializing_if = "Option::is_none")] + pub created: Option>, + // User that created the container + #[serde(skip_serializing_if = "Option::is_none")] + pub creator: Option, + // Specifies if systemd should be used to manage cgroups + pub use_systemd: Option, } impl State { @@ -71,6 +107,9 @@ impl State { pid, bundle: bundle.to_string(), annotations: HashMap::default(), + created: None, + creator: None, + use_systemd: None, } } @@ -90,7 +129,9 @@ impl State { pub fn load(container_root: &Path) -> Result { let state_file_path = container_root.join(STATE_FILE_PATH); - let file = File::open(state_file_path)?; + let file = File::open(&state_file_path).with_context(|| { + format!("failed to open container state file {:?}", state_file_path) + })?; let state: Self = serde_json::from_reader(&file)?; Ok(state) } diff --git a/src/container/tenant_builder.rs b/src/container/tenant_builder.rs new file mode 100644 index 0000000000..139e5a1fff --- /dev/null +++ b/src/container/tenant_builder.rs @@ -0,0 +1,383 @@ +use anyhow::{bail, Context, Result}; +use caps::Capability; +use nix::unistd; +use oci_spec::{ + LinuxCapabilities, LinuxCapabilityType, LinuxNamespace, LinuxNamespaceType, Process, Spec, +}; + +use std::{ + collections::HashMap, + convert::TryFrom, + ffi::{CString, OsString}, + fs, + os::unix::prelude::OsStrExt, + path::{Path, PathBuf}, + str::FromStr, +}; + +use crate::{ + notify_socket::{NotifyListener, NotifySocket}, + rootless::detect_rootless, + stdio::FileDescriptor, + tty, utils, +}; + +use super::{builder::ContainerBuilder, builder_impl::ContainerBuilderImpl, Container}; + +const NAMESPACE_TYPES: &[&str] = &["ipc", "uts", "net", "pid", "mnt", "cgroup"]; +const TENANT_NOTIFY: &str = "tenant-notify-"; +const TENANT_TTY: &str = "tenant-tty-"; + +/// Builder that can be used to configure the properties of a process +/// that will join an existing container sandbox +pub struct TenantContainerBuilder { + base: ContainerBuilder, + env: HashMap, + cwd: Option, + args: Vec, + no_new_privs: Option, + capabilities: Vec, + process: Option, +} + +impl TenantContainerBuilder { + /// Generates the base configuration for a process that will join + /// an existing container sandbox from which configuration methods + /// can be chained + pub(super) fn new(builder: ContainerBuilder) -> Self { + Self { + base: builder, + env: HashMap::new(), + cwd: None, + args: Vec::new(), + no_new_privs: None, + capabilities: Vec::new(), + process: None, + } + } + + /// Sets environment variables for the container + pub fn with_env(mut self, env: HashMap) -> Self { + self.env = env; + self + } + + /// Sets the working directory of the container + pub fn with_cwd>(mut self, path: P) -> Self { + self.cwd = Some(path.into()); + self + } + + /// Sets the command the container will be started with + pub fn with_container_args(mut self, args: Vec) -> Self { + self.args = args; + self + } + + pub fn with_no_new_privs(mut self, no_new_privs: bool) -> Self { + self.no_new_privs = Some(no_new_privs); + self + } + + pub fn with_capabilities(mut self, capabilities: Vec) -> Self { + self.capabilities = capabilities; + self + } + + pub fn with_process>(mut self, path: P) -> Self { + self.process = Some(path.into()); + self + } + + /// Joins an existing container + pub fn build(self) -> Result<()> { + let container_dir = self.lookup_container_dir()?; + let container = self.load_container_state(container_dir.clone())?; + let mut spec = 
self.load_init_spec(&container_dir)?; + self.adapt_spec_for_tenant(&mut spec, &container)?; + log::debug!("{:#?}", spec); + + unistd::chdir(&*container_dir)?; + let (notify_listener, notify_path) = Self::setup_notify_listener(&container_dir)?; + // convert path of root file system of the container to absolute path + let rootfs = fs::canonicalize(&spec.root.path)?; + + // if socket file path is given in commandline options, + // get file descriptors of console socket + let csocketfd = self.setup_tty_socket(&container_dir)?; + + let use_systemd = self.should_use_systemd(&container); + let rootless = detect_rootless(&spec)?; + + let mut builder_impl = ContainerBuilderImpl { + init: false, + syscall: self.base.syscall, + container_id: self.base.container_id, + pid_file: self.base.pid_file, + console_socket: csocketfd, + use_systemd, + container_dir, + spec, + rootfs, + rootless, + notify_socket: notify_listener, + container: None, + }; + + builder_impl.create()?; + + let mut notify_socket = NotifySocket::new(notify_path); + notify_socket.notify_container_start()?; + Ok(()) + } + + fn lookup_container_dir(&self) -> Result { + let container_dir = self.base.root_path.join(&self.base.container_id); + if !container_dir.exists() { + bail!("container {} does not exist", self.base.container_id); + } + + Ok(container_dir) + } + + fn load_init_spec(&self, container_dir: &Path) -> Result { + let spec_path = container_dir.join("config.json"); + + let spec = oci_spec::Spec::load(spec_path).context("failed to load spec")?; + Ok(spec) + } + + fn load_container_state(&self, container_dir: PathBuf) -> Result { + let container = Container::load(container_dir)?.refresh_status()?; + if !container.can_exec() { + bail!( + "Cannot exec as container is in state {}", + container.status() + ); + } + + Ok(container) + } + + fn adapt_spec_for_tenant(&self, spec: &mut Spec, container: &Container) -> Result<()> { + if let Some(ref process) = self.process { + self.set_process(spec, process)?; + } else { + self.set_working_dir(spec)?; + self.set_args(spec)?; + self.set_environment(spec)?; + self.set_no_new_privileges(spec); + self.set_capabilities(spec)?; + } + + if container.pid().is_none() { + bail!("Could not retrieve container init pid"); + } + + let init_process = procfs::process::Process::new(container.pid().unwrap().as_raw())?; + self.set_namespaces(spec, init_process.namespaces()?)?; + + Ok(()) + } + + fn set_process(&self, spec: &mut Spec, process: &Path) -> Result<()> { + if !process.exists() { + bail!( + "Process.json file does not exist at specified path {}", + process.display() + ) + } + + let process = utils::open(process)?; + let process_spec: Process = serde_json::from_reader(process)?; + spec.process = process_spec; + Ok(()) + } + + fn set_working_dir(&self, spec: &mut Spec) -> Result<()> { + if let Some(ref cwd) = self.cwd { + if cwd.is_relative() { + bail!( + "Current working directory must be an absolute path, but is {}", + cwd.display() + ); + } + + spec.process.cwd = cwd.to_string_lossy().to_string(); + } + + Ok(()) + } + + fn set_args(&self, spec: &mut Spec) -> Result<()> { + if self.args.is_empty() { + bail!("Container command was not specified") + } + + spec.process.args = self.args.clone(); + Ok(()) + } + + fn set_environment(&self, spec: &mut Spec) -> Result<()> { + spec.process.env.append( + &mut self + .env + .iter() + .map(|(k, v)| format!("{}={}", k, v)) + .collect(), + ); + + Ok(()) + } + + fn set_no_new_privileges(&self, spec: &mut Spec) { + if let Some(no_new_privs) = self.no_new_privs { + 
spec.process.no_new_privileges = no_new_privs; + } + } + + fn set_capabilities(&self, spec: &mut Spec) -> Result<()> { + if !self.capabilities.is_empty() { + let mut caps: Vec = Vec::with_capacity(self.capabilities.len()); + for cap in &self.capabilities { + caps.push(Capability::from_str(cap)?.into()); + } + + if let Some(ref mut spec_caps) = spec.process.capabilities { + spec_caps.ambient.append(&mut caps.clone()); + spec_caps.bounding.append(&mut caps.clone()); + spec_caps.effective.append(&mut caps.clone()); + spec_caps.inheritable.append(&mut caps.clone()); + spec_caps.permitted.append(&mut caps); + } else { + spec.process.capabilities = Some(LinuxCapabilities { + ambient: caps.clone(), + bounding: caps.clone(), + effective: caps.clone(), + inheritable: caps.clone(), + permitted: caps, + }) + } + } + + Ok(()) + } + + fn set_namespaces(&self, spec: &mut Spec, init_namespaces: Vec) -> Result<()> { + let mut tenant_namespaces = Vec::with_capacity(init_namespaces.len()); + + for ns_type in NAMESPACE_TYPES.iter().copied() { + if let Some(init_ns) = init_namespaces.iter().find(|n| n.ns_type.eq(ns_type)) { + let tenant_ns = LinuxNamespaceType::try_from(ns_type)?; + tenant_namespaces.push(LinuxNamespace { + typ: tenant_ns, + path: Some(init_ns.path.to_string_lossy().to_string()), + }) + } + } + + let mut linux = spec.linux.as_mut().unwrap(); + linux.namespaces = tenant_namespaces; + Ok(()) + } + + fn should_use_systemd(&self, container: &Container) -> bool { + if let Some(use_systemd) = container.systemd() { + return use_systemd; + } + + false + } + + fn setup_notify_listener(container_dir: &Path) -> Result<(NotifyListener, PathBuf)> { + let notify_name = Self::generate_name(&container_dir, TENANT_NOTIFY); + let socket_path = container_dir.join(¬ify_name); + let notify_listener: NotifyListener = NotifyListener::new(¬ify_name)?; + + Ok((notify_listener, socket_path)) + } + + fn setup_tty_socket(&self, container_dir: &Path) -> Result> { + let tty_name = Self::generate_name(&container_dir, TENANT_TTY); + let csocketfd = if let Some(console_socket) = &self.base.console_socket { + Some(tty::setup_console_socket( + container_dir, + console_socket, + &tty_name, + )?) + } else { + None + }; + + Ok(csocketfd) + } + + fn generate_name(dir: &Path, prefix: &str) -> String { + loop { + let rand = fastrand::i32(..); + let name = format!("{}{:x}.sock", prefix, rand); + if !dir.join(&name).exists() { + return name; + } + } + } +} + +// Can be removed once https://github.com/eminence/procfs/pull/135 is available +trait GetNamespace { + fn namespaces(&self) -> Result>; +} + +impl GetNamespace for procfs::process::Process { + /// Describes namespaces to which the process with the corresponding PID belongs. + /// Doc reference: https://man7.org/linux/man-pages/man7/namespaces.7.html + fn namespaces(&self) -> Result> { + let proc_path = PathBuf::from(format!("/proc/{}", self.pid())); + let ns = proc_path.join("ns"); + let mut namespaces = Vec::new(); + for entry in fs::read_dir(ns)? 
{ + let entry = entry?; + let path = entry.path(); + let ns_type = entry.file_name(); + let cstr = CString::new(path.as_os_str().as_bytes()).unwrap(); + + let mut stat = unsafe { std::mem::zeroed() }; + if unsafe { libc::stat(cstr.as_ptr(), &mut stat) } != 0 { + bail!("Unable to stat {:?}", path); + } + + namespaces.push(Namespace { + ns_type, + path, + identifier: stat.st_ino, + device_id: stat.st_dev, + }) + } + + Ok(namespaces) + } +} + +/// Information about a namespace +/// +/// See also the [Process::namespaces()] method +#[derive(Debug, Clone)] +pub struct Namespace { + /// Namespace type + pub ns_type: OsString, + /// Handle to the namespace + pub path: PathBuf, + /// Namespace identifier (inode number) + pub identifier: u64, + /// Device id of the namespace + pub device_id: u64, +} + +impl PartialEq for Namespace { + fn eq(&self, other: &Self) -> bool { + // see https://lore.kernel.org/lkml/87poky5ca9.fsf@xmission.com/ + self.identifier == other.identifier && self.device_id == other.device_id + } +} + +impl Eq for Namespace {} diff --git a/src/create.rs b/src/create.rs index 6098cb624e..d0f6b8471c 100644 --- a/src/create.rs +++ b/src/create.rs @@ -1,182 +1,51 @@ -use std::fs; -use std::path::{Path, PathBuf}; -use std::process; - -use anyhow::{bail, Result}; +//! Handles the creation of a new container +use anyhow::Result; use clap::Clap; -use nix::sched; -use nix::unistd; -use nix::unistd::{Gid, Uid}; +use std::path::PathBuf; -use crate::cgroups; -use crate::container::{Container, ContainerStatus}; -use crate::namespaces::Namespaces; -use crate::notify_socket::NotifyListener; -use crate::process::{fork, Process}; -use crate::rootfs; -use oci_spec; -use crate::stdio::FileDescriptor; -use crate::tty; -use crate::utils; -use crate::{capabilities, command::Command}; +use crate::container::builder::ContainerBuilder; +/// This is the main structure which stores various commandline options given by +/// high-level container runtime #[derive(Clap, Debug)] pub struct Create { + /// File to write pid of the container created + // note that in the end, container is just another process #[clap(short, long)] pid_file: Option, + /// path to the bundle directory, containing config.json and root filesystem #[clap(short, long, default_value = ".")] bundle: PathBuf, + /// Unix socket (file) path , which will receive file descriptor of the writing end of the pseudoterminal #[clap(short, long)] - console_socket: Option, + console_socket: Option, + /// name of the container instance to be started pub container_id: String, } +// One thing to note is that in the end, container is just another process in Linux +// it has specific/different control group, namespace, using which program executing in it +// can be given impression that is is running on a complete system, but on the system which +// it is running, it is just another process, and has attributes such as pid, file descriptors, etc. +// associated with it like any other process. 
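// As a rough illustration, the two builder flows described above can be
// sketched like this with the public API from `youki::container::builder`
// (the container id and all paths below are placeholders, not defaults):
//
//     use youki::container::builder::ContainerBuilder;
//
//     // init container: sets up namespaces/cgroups for a new sandbox (`create`)
//     ContainerBuilder::new("74f1a4cb3801".to_owned())
//         .with_root_path("/run/containers/youki")
//         .with_pid_file("/var/run/youki.pid")
//         .as_init("/var/run/docker/bundle")
//         .with_systemd(false)
//         .build()?;
//
//     // tenant container: joins the existing sandbox (used by `exec`)
//     ContainerBuilder::new("74f1a4cb3801".to_owned())
//         .as_tenant()
//         .with_container_args(vec!["sleep".to_owned(), "9001".to_owned()])
//         .build()?;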
impl Create { - pub fn exec(&self, root_path: PathBuf, command: impl Command) -> Result<()> { - let container_dir = root_path.join(&self.container_id); - if !container_dir.exists() { - fs::create_dir(&container_dir).unwrap(); - } else { - bail!("{} already exists", self.container_id) - } - - unistd::chdir(&self.bundle)?; - - let spec = oci_spec::Spec::load("config.json")?; - fs::copy("config.json", container_dir.join("config.json"))?; - log::debug!("spec: {:?}", spec); - - let container_dir = fs::canonicalize(container_dir)?; - unistd::chdir(&*container_dir)?; - - log::debug!("{:?}", &container_dir); - let container = Container::new( - &self.container_id, - ContainerStatus::Creating, - None, - self.bundle.to_str().unwrap(), - &container_dir, - )?; - container.save()?; - - let mut notify_socket: NotifyListener = NotifyListener::new(&container_dir)?; - - let rootfs = fs::canonicalize(&spec.root.path)?; - - let (csocketfd, _consolefd) = { - if let Some(console_socket) = &self.console_socket { - let (csocketfd, consolefd) = - tty::load_console_sockets(&container_dir, console_socket)?; - (Some(csocketfd), Some(consolefd)) - } else { - (None, None) - } - }; - - let process = run_container( - self.pid_file.as_ref(), - &mut notify_socket, - rootfs, - spec, - csocketfd, - container, - command, - )?; - if let Process::Parent(_) = process { - process::exit(0); + /// Starts a new container process + pub fn exec(&self, root_path: PathBuf, systemd_cgroup: bool) -> Result<()> { + let mut builder = ContainerBuilder::new(self.container_id.clone()); + if let Some(pid_file) = &self.pid_file { + builder = builder.with_pid_file(pid_file); } - Ok(()) - } -} - -fn run_container>( - pid_file: Option
, - notify_socket: &mut NotifyListener, - rootfs: PathBuf, - spec: oci_spec::Spec, - csocketfd: Option, - container: Container, - command: impl Command, -) -> Result { - prctl::set_dumpable(false).unwrap(); - let linux = spec.linux.as_ref().unwrap(); - let namespaces: Namespaces = linux.namespaces.clone().into(); - let cmanager = cgroups::Manager::new(linux.cgroups_path.clone())?; - - match fork::fork_first( - pid_file, - namespaces - .clone_flags - .contains(sched::CloneFlags::CLONE_NEWUSER), - linux, - &container, - &cmanager, - )? { - Process::Parent(parent) => Ok(Process::Parent(parent)), - Process::Child(child) => { - for rlimit in spec.process.rlimits.iter() { - command.set_rlimit(rlimit)? - } - command.set_id(Uid::from_raw(0), Gid::from_raw(0))?; - - let without = sched::CloneFlags::CLONE_NEWUSER; - namespaces.apply_unshare(without)?; - - if let Some(csocketfd) = csocketfd { - tty::ready(csocketfd)?; - } - - namespaces.apply_setns()?; - - match fork::fork_init(child)? { - Process::Child(child) => Ok(Process::Child(child)), - Process::Init(mut init) => { - let spec_args: &Vec = &spec.process.args.clone(); - let envs: &Vec = &spec.process.env.clone(); - init_process(spec, command, rootfs, namespaces)?; - init.ready()?; - notify_socket.wait_for_container_start()?; - - utils::do_exec(&spec_args[0], spec_args, envs)?; - container.update_status(ContainerStatus::Stopped)?.save()?; - - Ok(Process::Init(init)) - } - Process::Parent(_) => unreachable!(), - } + if let Some(console_socket) = &self.console_socket { + builder = builder.with_console_socket(console_socket); } - _ => unreachable!(), - } -} -fn init_process( - spec: oci_spec::Spec, - command: impl Command, - rootfs: PathBuf, - namespaces: Namespaces, -) -> Result<()> { - let proc = spec.process.clone(); - - command.set_hostname(&spec.hostname.as_str())?; - if spec.process.no_new_privileges { - let _ = prctl::set_no_new_privileges(true); - } + builder + .with_root_path(root_path) + .as_init(&self.bundle) + .with_systemd(systemd_cgroup) + .build()?; - rootfs::prepare_rootfs( - &spec, - &rootfs, - namespaces - .clone_flags - .contains(sched::CloneFlags::CLONE_NEWUSER), - )?; - - command.pivot_rootfs(&rootfs)?; - - command.set_id(Uid::from_raw(proc.user.uid), Gid::from_raw(proc.user.gid))?; - capabilities::reset_effective(&command)?; - if let Some(caps) = &proc.capabilities { - capabilities::drop_privileges(&caps, &command)?; + Ok(()) } - Ok(()) } diff --git a/src/dbus/client.rs b/src/dbus/client.rs new file mode 100644 index 0000000000..b0dc4afef1 --- /dev/null +++ b/src/dbus/client.rs @@ -0,0 +1,33 @@ +use anyhow::Result; +use dbus::blocking::Connection; +use std::time::Duration; +use std::vec::Vec; + +/// Client is a wrapper providing higher level API and abatraction around dbus. +/// For more information see https://www.freedesktop.org/wiki/Software/systemd/dbus/ +pub struct Client { + conn: Connection, +} + +impl Client { + pub fn new() -> Result { + let conn = Connection::new_session()?; + Ok(Client { conn }) + } + + /// start_unit starts a specific unit under systemd. See https://www.freedesktop.org/wiki/Software/systemd/dbus + /// for more details. 
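+ /// A minimal usage sketch, assuming a reachable session bus; the unit name
+ /// below is only a placeholder:
+ ///
+ /// ```no_run
+ /// # use youki::dbus::Client;
+ /// let client = Client::new().unwrap();
+ /// client.start_unit("youki-74f1a4cb3801.scope", vec![]).unwrap();
+ /// ```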
+ pub fn start_unit(&self, unit_name: &str, _properties: Vec<&str>) -> Result<()> { + let proxy = self.conn.with_proxy( + "org.freedesktop.systemd1.Manager", + "/", + Duration::from_millis(5000), + ); + let (_job_id,): (i32,) = proxy.method_call( + "org.freedesktop.systemd1.Manager", + "StartTransientUnit", + (unit_name, "replace"), + )?; + Ok(()) + } +} diff --git a/src/dbus/mod.rs b/src/dbus/mod.rs new file mode 100644 index 0000000000..e99ee79b96 --- /dev/null +++ b/src/dbus/mod.rs @@ -0,0 +1,2 @@ +mod client; +pub use client::Client; diff --git a/src/delete.rs b/src/delete.rs new file mode 100644 index 0000000000..0c35b64486 --- /dev/null +++ b/src/delete.rs @@ -0,0 +1,72 @@ +use std::fs; +use std::path::PathBuf; + +use anyhow::{bail, Result}; +use clap::Clap; +use nix::sys::signal::Signal; + +use crate::cgroups; +use crate::container::{Container, ContainerStatus}; +use crate::utils; +use nix::sys::signal as nix_signal; + +#[derive(Clap, Debug)] +pub struct Delete { + container_id: String, + /// forces deletion of the container if it is still running (using SIGKILL) + #[clap(short, long)] + force: bool, +} + +impl Delete { + pub fn exec(&self, root_path: PathBuf, systemd_cgroup: bool) -> Result<()> { + log::debug!("start deleting {}", self.container_id); + // state of container is stored in a directory named as container id inside + // root directory given in commandline options + let container_root = root_path.join(&self.container_id); + if !container_root.exists() { + bail!("{} doesn't exist.", self.container_id) + } + // load container state from json file, and check status of the container + // it might be possible that delete is invoked on a running container. + log::debug!("load the container from {:?}", container_root); + let mut container = Container::load(container_root)?.refresh_status()?; + if container.can_kill() && self.force { + let sig = Signal::SIGKILL; + log::debug!("kill signal {} to {}", sig, container.pid().unwrap()); + nix_signal::kill(container.pid().unwrap(), sig)?; + container = container.update_status(ContainerStatus::Stopped); + container.save()?; + } + log::debug!("container status: {:?}", container.status()); + if container.can_delete() { + if container.root.exists() { + let config_absolute_path = container.root.join("config.json"); + log::debug!("load spec from {:?}", config_absolute_path); + let spec = oci_spec::Spec::load(config_absolute_path)?; + log::debug!("spec: {:?}", spec); + + // remove the directory storing container state + log::debug!("remove dir {:?}", container.root); + fs::remove_dir_all(&container.root)?; + + let cgroups_path = + utils::get_cgroup_path(&spec.linux.unwrap().cgroups_path, container.id()); + + // remove the cgroup created for the container + // check https://man7.org/linux/man-pages/man7/cgroups.7.html + // creating and removing cgroups section for more information on cgroups + let cmanager = + cgroups::common::create_cgroup_manager(cgroups_path, systemd_cgroup)?; + cmanager.remove()?; + } + std::process::exit(0) + } else { + bail!( + "{} could not be deleted because it was {:?}", + container.id(), + container.status() + ) + } + } +} diff --git a/src/exec.rs b/src/exec.rs new file mode 100644 index 0000000000..1efcbb4ab0 --- /dev/null +++ b/src/exec.rs @@ -0,0 +1,86 @@ +use anyhow::Result; +use clap::Clap; +use std::{error::Error, path::PathBuf}; + +use crate::container::builder::ContainerBuilder; + +#[derive(Clap, Debug)] +pub struct Exec { + /// Unix socket (file) path , which will receive file descriptor of the writing end 
of the pseudoterminal + #[clap(long)] + pub console_socket: Option, + #[clap(short, long)] + pub tty: bool, + #[clap(long)] + /// Current working directory of the container + pub cwd: Option, + #[clap(long)] + /// The file to which the pid of the container process should be written to + pub pid_file: Option, + /// Environment variables that should be set in the container + #[clap(short, long, parse(try_from_str = parse_key_val), number_of_values = 1)] + pub env: Vec<(String, String)>, + /// Prevent the process from gaining additional privileges + #[clap(long)] + pub no_new_privs: bool, + /// Path to process.json + #[clap(short, long)] + pub process: Option, + /// Detach from the container process + #[clap(short, long)] + pub detach: bool, + /// Identifier of the container + pub container_id: String, + /// Command that should be executed in the container + #[clap(required = false)] + pub command: Vec, +} + +impl Exec { + pub fn exec(&self, root_path: PathBuf) -> Result<()> { + let mut builder = + ContainerBuilder::new(self.container_id.clone()).with_root_path(root_path); + + if let Some(console_socket) = &self.console_socket { + builder = builder.with_console_socket(console_socket); + } + + if let Some(pid_file) = &self.pid_file { + builder = builder.with_pid_file(pid_file); + } + + let mut builder = builder.as_tenant(); + + if let Some(cwd) = &self.cwd { + builder = builder.with_cwd(cwd); + } + + if !self.env.is_empty() { + let env = self.env.clone().into_iter().collect(); + builder = builder.with_env(env) + } + + builder = builder.with_no_new_privs(self.no_new_privs); + + if let Some(process) = &self.process { + builder = builder.with_process(process); + } + + builder.with_container_args(self.command.clone()).build()?; + + Ok(()) + } +} + +fn parse_key_val(s: &str) -> Result<(T, U), Box> +where + T: std::str::FromStr, + T::Err: Error + Send + Sync + 'static, + U: std::str::FromStr, + U::Err: Error + Send + Sync + 'static, +{ + let pos = s + .find('=') + .ok_or_else(|| format!("invalid KEY=value: no `=` found in `{}`", s))?; + Ok((s[..pos].parse()?, s[pos + 1..].parse()?)) +} diff --git a/src/info.rs b/src/info.rs new file mode 100644 index 0000000000..ae585565c3 --- /dev/null +++ b/src/info.rs @@ -0,0 +1,130 @@ +//! 
Contains functions related to printing information about system running Youki +use std::{fs, path::Path}; + +use anyhow::Result; +use clap::Clap; +use procfs::{CpuInfo, Meminfo}; + +use crate::cgroups; + +#[derive(Clap, Debug)] +pub struct Info {} + +impl Info { + pub fn exec(&self) -> Result<()> { + print_youki(); + print_kernel(); + print_os(); + print_hardware(); + print_cgroups(); + + Ok(()) + } +} + +/// print Version of Youki +pub fn print_youki() { + println!("{:<18}{}", "Version", env!("CARGO_PKG_VERSION")); +} + +/// Print Kernel Release, Version and Architecture +pub fn print_kernel() { + let uname = nix::sys::utsname::uname(); + println!("{:<18}{}", "Kernel-Release", uname.release()); + println!("{:<18}{}", "Kernel-Version", uname.version()); + println!("{:<18}{}", "Architecture", uname.machine()); +} + +/// Prints OS Distribution information +// see https://www.freedesktop.org/software/systemd/man/os-release.html +pub fn print_os() { + if let Some(os) = try_read_os_from("/etc/os-release") { + println!("{:<18}{}", "Operating System", os); + } else if let Some(os) = try_read_os_from("/usr/lib/os-release") { + println!("{:<18}{}", "Operating System", os); + } +} + +/// Helper function to read the OS Distribution info +fn try_read_os_from>(path: P) -> Option { + let os_release = path.as_ref(); + if !os_release.exists() { + return None; + } + + if let Ok(release_content) = fs::read_to_string(path) { + let pretty = find_parameter(&release_content, "PRETTY_NAME"); + + if let Some(pretty) = pretty { + return Some(pretty.trim_matches('"').to_owned()); + } + + let name = find_parameter(&release_content, "NAME"); + let version = find_parameter(&release_content, "VERSION"); + + if let (Some(name), Some(version)) = (name, version) { + return Some(format!( + "{} {}", + name.trim_matches('"'), + version.trim_matches('"') + )); + } + } + + None +} + +/// Helper function to find keyword values in OS info string +fn find_parameter<'a>(content: &'a str, param_name: &str) -> Option<&'a str> { + let param_value = content + .lines() + .find(|l| l.starts_with(param_name)) + .map(|l| l.split_terminator('=').last()); + + if let Some(Some(value)) = param_value { + return Some(value); + } + + None +} + +/// Print Hardware information of system +pub fn print_hardware() { + if let Ok(cpu_info) = CpuInfo::new() { + println!("{:<18}{}", "Cores", cpu_info.num_cores()); + } + + if let Ok(mem_info) = Meminfo::new() { + println!( + "{:<18}{}", + "Total Memory", + mem_info.mem_total / u64::pow(1024, 2) + ); + } +} + +/// Print cgroups info of system +pub fn print_cgroups() { + if let Ok(cgroup_fs) = cgroups::common::get_supported_cgroup_fs() { + let cgroup_fs: Vec = cgroup_fs.into_iter().map(|c| c.to_string()).collect(); + println!("{:<18}{}", "cgroup version", cgroup_fs.join(" and ")); + } + + println!("cgroup mounts"); + if let Ok(v1_mounts) = cgroups::v1::util::list_subsystem_mount_points() { + let mut v1_mounts: Vec = v1_mounts + .iter() + .map(|kv| format!(" {:<16}{}", kv.0, kv.1.display())) + .collect(); + + v1_mounts.sort(); + for cgroup_mount in v1_mounts { + println!("{}", cgroup_mount); + } + } + + let unified = cgroups::v2::util::get_unified_mount_point(); + if let Ok(mount_point) = unified { + println!(" {:<16}{}", "unified", mount_point.display()); + } +} diff --git a/src/kill.rs b/src/kill.rs new file mode 100644 index 0000000000..d413bd92dc --- /dev/null +++ b/src/kill.rs @@ -0,0 +1,47 @@ +//! 
Contains functionality of kill container command +use std::{fs, path::PathBuf}; + +use anyhow::{bail, Result}; +use clap::Clap; +use nix::sys::signal as nix_signal; + +use crate::{ + container::{Container, ContainerStatus}, + signal::ToSignal, +}; + +#[derive(Clap, Debug)] +pub struct Kill { + container_id: String, + signal: String, +} + +impl Kill { + pub fn exec(&self, root_path: PathBuf) -> Result<()> { + // resolves relative paths, symbolic links etc. and get complete path + let root_path = fs::canonicalize(root_path)?; + // state of container is stored in a directory named as container id inside + // root directory given in commandline options + let container_root = root_path.join(&self.container_id); + if !container_root.exists() { + bail!("{} doesn't exist.", self.container_id) + } + + // load container state from json file, and check status of the container + // it might be possible that kill is invoked on a already stopped container etc. + let container = Container::load(container_root)?.refresh_status()?; + if container.can_kill() { + let sig = self.signal.to_signal()?; + log::debug!("kill signal {} to {}", sig, container.pid().unwrap()); + nix_signal::kill(container.pid().unwrap(), sig)?; + container.update_status(ContainerStatus::Stopped).save()?; + std::process::exit(0) + } else { + bail!( + "{} could not be killed because it was {:?}", + container.id(), + container.status() + ) + } + } +} diff --git a/src/lib.rs b/src/lib.rs index 7bc7a06d23..f8ac4d73ca 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,16 +1,31 @@ +#[cfg(test)] +#[macro_use] +extern crate quickcheck; + pub mod capabilities; pub mod cgroups; pub mod command; -pub mod cond; pub mod container; pub mod create; +pub mod dbus; +pub mod delete; +pub mod exec; +pub mod info; +pub mod kill; +pub mod list; pub mod logger; pub mod namespaces; pub mod notify_socket; +pub mod pause; +pub mod pipe; pub mod process; +pub mod resume; pub mod rootfs; +pub mod rootless; pub mod signal; +pub mod spec_json; pub mod start; +pub mod state; pub mod stdio; pub mod tty; pub mod utils; diff --git a/src/list.rs b/src/list.rs new file mode 100644 index 0000000000..7300740c19 --- /dev/null +++ b/src/list.rs @@ -0,0 +1,71 @@ +//! Contains Functionality of list container command +use std::ffi::OsString; +use std::fs; +use std::io; +use std::io::Write; +use std::path::PathBuf; + +use anyhow::Result; +use chrono::{DateTime, Local}; +use clap::Clap; +use tabwriter::TabWriter; + +use crate::container::Container; + +/// Empty struct for list command +#[derive(Clap, Debug)] +pub struct List {} + +impl List { + /// lists all existing containers + pub fn exec(&self, root_path: PathBuf) -> Result<()> { + let root_path = fs::canonicalize(root_path)?; + let mut content = String::new(); + // all containers' data is stored in their respective dir in root directory + // so we iterate through each and print the various info + for container_dir in fs::read_dir(root_path)? 
{ + let container_dir = container_dir?.path(); + let state_file = container_dir.join("state.json"); + if !state_file.exists() { + continue; + } + + let container = Container::load(container_dir)?.refresh_status()?; + let pid = if let Some(pid) = container.pid() { + pid.to_string() + } else { + "".to_owned() + }; + + let user_name = if let Some(creator) = container.creator() { + creator + } else { + OsString::new() + }; + + let created = if let Some(utc) = container.created() { + let local: DateTime = DateTime::from(utc); + local.to_rfc3339_opts(chrono::SecondsFormat::Secs, false) + } else { + "".to_owned() + }; + + content.push_str(&format!( + "{}\t{}\t{}\t{}\t{}\t{}\n", + container.id(), + pid, + container.status(), + container.bundle(), + created, + user_name.to_string_lossy() + )); + } + + let mut tab_writer = TabWriter::new(io::stdout()); + writeln!(&mut tab_writer, "ID\tPID\tSTATUS\tBUNDLE\tCREATED\tCREATOR")?; + write!(&mut tab_writer, "{}", content)?; + tab_writer.flush()?; + + Ok(()) + } +} diff --git a/src/logger.rs b/src/logger.rs index 0464dbb03d..239b8c41c2 100644 --- a/src/logger.rs +++ b/src/logger.rs @@ -1,3 +1,5 @@ +//! Default Youki Logger + use std::env; use std::io::{stderr, Write}; use std::path::PathBuf; @@ -9,67 +11,68 @@ use std::{ use anyhow::Result; use log::{LevelFilter, Log, Metadata, Record}; use once_cell::sync::OnceCell; -use serde_json::json; +/// Public global variables to access logger and logfile pub static YOUKI_LOGGER: OnceCell = OnceCell::new(); pub static LOG_FILE: OnceCell> = OnceCell::new(); -pub fn init(container_id: &str, log_file: Option) -> Result<()> { +/// If in debug mode, default level is debug to get maximum logging +#[cfg(debug_assertions)] +const DEFAULT_LOG_LEVEL: LevelFilter = LevelFilter::Debug; + +/// If not in debug mode, default level is warn to get important logs +#[cfg(not(debug_assertions))] +const DEFAULT_LOG_LEVEL: LevelFilter = LevelFilter::Warn; + +/// Initialize the logger, must be called before accessing the logger +/// Multiple parts might call this at once, but the actual initialization +/// is done only once due to use of OnceCell +pub fn init(log_file: Option) -> Result<()> { + // If file exists, ignore, else create and open the file let _log_file = LOG_FILE.get_or_init(|| -> Option { - if let Ok(docker_root) = env::var("YOUKI_MODE") { - if let Some(log_file_path) = &log_file { - OpenOptions::new() - .create(true) - .write(true) - .truncate(false) - .open(log_file_path) - .expect("fail opening log file "); - }; + // set the log level if specified in env variable or set to default + let level_filter = if let Ok(log_level_str) = env::var("YOUKI_LOG_LEVEL") { + LevelFilter::from_str(&log_level_str).unwrap_or(DEFAULT_LOG_LEVEL) + } else { + DEFAULT_LOG_LEVEL + }; - let mut log_file_path = PathBuf::from(&docker_root); - log_file_path.push(container_id); - log_file_path.push(format!("{}-json.log", container_id)); + // Create a new logger, or get existing if already created + let logger = YOUKI_LOGGER.get_or_init(|| YoukiLogger::new(level_filter.to_level())); - let level_filter = if let Ok(log_level_str) = env::var("YOUKI_LOG_LEVEL") { - LevelFilter::from_str(&log_level_str).unwrap_or(LevelFilter::Warn) - } else { - LevelFilter::Warn - }; - let logger = YOUKI_LOGGER.get_or_init(|| YoukiLogger::new(level_filter.to_level())); - log::set_logger(logger) - .map(|()| log::set_max_level(level_filter)) - .unwrap(); + log::set_logger(logger) + .map(|()| log::set_max_level(level_filter)) + .expect("set logger failed"); + + // Create and 
open log file + log_file.as_ref().map(|log_file_path| { OpenOptions::new() .create(true) .write(true) .truncate(false) .open(log_file_path) - .map_err(|e| eprintln!("{:?}", e)) - .ok() - } else { - log_file.map(|log_file_path| { - OpenOptions::new() - .create(true) - .write(true) - .truncate(false) - .open(log_file_path) - .expect("fail opening log file ") - }) - } + .expect("failed opening log file ") + }) }); Ok(()) } + +/// Youki's custom Logger pub struct YoukiLogger { + /// Indicates level up to which logs are to be printed level: Option, } impl YoukiLogger { + /// Create new logger pub fn new(level: Option) -> Self { Self { level } } } +/// Implements Log interface given by log crate, so we can use its functionality impl Log for YoukiLogger { + /// Check if level of given log is enabled or not fn enabled(&self, metadata: &Metadata) -> bool { if let Some(level) = self.level { metadata.level() <= level @@ -78,33 +81,41 @@ impl Log for YoukiLogger { } } + /// Function to carry out logging fn log(&self, record: &Record) { if self.enabled(record.metadata()) { let log_msg = match (record.file(), record.line()) { - (Some(file), Some(line)) => json!({ - "log": format!("[{} {}:{}] {}\r\n", record.level(), file, line, record.args()), - "stream": "stdout", - "time": chrono::Local::now().to_rfc3339() - }), - (_, _) => json!({ - "log": format!("[{}] {}\r\n", record.level(), record.args()), - "stream": "stdout", - "time": chrono::Local::now().to_rfc3339() - }), + (Some(file), Some(line)) => format!( + "[{} {}:{}] {} {}\r", + record.level(), + file, + line, + chrono::Local::now().to_rfc3339(), + record.args() + ), + (_, _) => format!( + "[{}] {} {}\r", + record.level(), + chrono::Local::now().to_rfc3339(), + record.args() + ), }; + + // if log file is set, write to it, else write to stderr if let Some(mut log_file) = LOG_FILE.get().unwrap().as_ref() { - let _ = writeln!(log_file, "{}", log_msg.to_string()); + let _ = writeln!(log_file, "{}", log_msg); } else { - let _ = writeln!(stderr(), "{}", log_msg.to_string()); + let _ = writeln!(stderr(), "{}", log_msg); } } } + /// Flush logs to file fn flush(&self) { if let Some(mut log_file) = LOG_FILE.get().unwrap().as_ref() { log_file.flush().expect("Failed to flush"); } else { - stderr().flush().expect("Faild to flush"); + stderr().flush().expect("Failed to flush"); } } } diff --git a/src/main.rs b/src/main.rs index a1ddff8bfc..26f72e6ddd 100644 --- a/src/main.rs +++ b/src/main.rs @@ -5,15 +5,21 @@ use std::fs; use std::path::PathBuf; -use anyhow::{bail, Result}; +use anyhow::Result; use clap::Clap; -use nix::sys::signal as nix_signal; -use youki::container::{Container, ContainerStatus}; use youki::create; -use youki::signal; +use youki::delete; +use youki::exec; +use youki::info; +use youki::kill; +use youki::list; +use youki::pause; +use youki::resume; +use youki::rootless::should_use_rootless; +use youki::spec_json; use youki::start; -use youki::{cgroups::Manager, command::linux::LinuxCommand}; +use youki::state; /// High-level commandline option definition /// This takes global options as well as individual commands as specified in [OCI runtime-spec](https://github.com/opencontainers/runtime-spec/blob/master/runtime.md) @@ -28,27 +34,14 @@ struct Opts { log: Option, #[clap(long)] log_format: Option, + /// Enable systemd cgroup manager, rather then use the cgroupfs directly. 
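+ /// For example (bundle path and container id are illustrative):
+ /// `youki --systemd-cgroup create --bundle /path/to/bundle 74f1a4cb3801`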
+ #[clap(short, long)] + systemd_cgroup: bool, /// command to actually manage container #[clap(subcommand)] subcmd: SubCommand, } -#[derive(Clap, Debug)] -pub struct Kill { - container_id: String, - signal: String, -} - -#[derive(Clap, Debug)] -pub struct Delete { - container_id: String, -} - -#[derive(Clap, Debug)] -pub struct StateArgs { - pub container_id: String, -} - /// Subcommands accepted by Youki, confirming with [OCI runtime-spec](https://github.com/opencontainers/runtime-spec/blob/master/runtime.md) /// Also for a short information, check [runc commandline documentation](https://github.com/opencontainers/runc/blob/master/man/runc.8.md) #[derive(Clap, Debug)] @@ -58,23 +51,23 @@ enum SubCommand { #[clap(version = "0.0.1", author = "utam0k ")] Start(start::Start), #[clap(version = "0.0.1", author = "utam0k ")] - Kill(Kill), + Exec(exec::Exec), #[clap(version = "0.0.1", author = "utam0k ")] - Delete(Delete), + Kill(kill::Kill), #[clap(version = "0.0.1", author = "utam0k ")] - State(StateArgs), -} - -impl SubCommand { - fn get_container_id(&self) -> &String { - match &self { - SubCommand::Create(create) => &create.container_id, - SubCommand::Start(start) => &start.container_id, - SubCommand::Delete(delete) => &delete.container_id, - SubCommand::Kill(kill) => &kill.container_id, - SubCommand::State(state_args) => &state_args.container_id, - } - } + Delete(delete::Delete), + #[clap(version = "0.0.1", author = "utam0k ")] + State(state::State), + #[clap(version = "0.0.1", author = "utam0k ")] + Info(info::Info), + #[clap(version = "0.0.1", author = "utam0k ")] + Spec(spec_json::SpecJson), + #[clap(version = "0.0.1", author = "utam0k ")] + List(list::List), + #[clap(version = "0.0.1", author = "utam0k ")] + Pause(pause::Pause), + #[clap(version = "0.0.1", author = "utam0k ")] + Resume(resume::Resume), } /// This is the entry point in the container runtime. The binary is run by a high-level container runtime, @@ -82,89 +75,30 @@ impl SubCommand { fn main() -> Result<()> { let opts = Opts::parse(); - // debug mode for developer - if matches!(opts.subcmd, SubCommand::Create(_)) { - #[cfg(debug_assertions)] - std::env::set_var("YOUKI_MODE", "/var/lib/docker/containers/"); - #[cfg(debug_assertions)] - std::env::set_var("YOUKI_LOG_LEVEL", "debug"); - } - - if let Err(e) = youki::logger::init(opts.subcmd.get_container_id().as_str(), opts.log) { - log::warn!("log init failed: {:?}", e); + if let Err(e) = youki::logger::init(opts.log) { + eprintln!("log init failed: {:?}", e); } - let root_path = PathBuf::from(&opts.root); + let root_path = if should_use_rootless() && opts.root.eq(&PathBuf::from("/run/youki")) { + PathBuf::from("/tmp/rootless") + } else { + PathBuf::from(&opts.root) + }; fs::create_dir_all(&root_path)?; + let systemd_cgroup = opts.systemd_cgroup; + match opts.subcmd { - SubCommand::Create(create) => create.exec(root_path, LinuxCommand), + SubCommand::Create(create) => create.exec(root_path, systemd_cgroup), SubCommand::Start(start) => start.exec(root_path), - SubCommand::Kill(kill) => { - // resolves relative paths, symbolic links etc. 
and get complete path - let root_path = fs::canonicalize(root_path)?; - // state of container is stored in a directory named as container id inside - // root directory given in commandline options - let container_root = root_path.join(&kill.container_id); - if !container_root.exists() { - bail!("{} doesn't exists.", kill.container_id) - } - - // load container state from json file, and check status of the container - // it might be possible that kill is invoked on a already stopped container etc. - let container = Container::load(container_root)?.refresh_status()?; - if container.can_kill() { - let sig = signal::from_str(kill.signal.as_str())?; - log::debug!("kill signal {} to {}", sig, container.pid().unwrap()); - nix_signal::kill(container.pid().unwrap(), sig)?; - container.update_status(ContainerStatus::Stopped)?.save()?; - std::process::exit(0) - } else { - bail!( - "{} counld not be killed because it was {:?}", - container.id(), - container.status() - ) - } - } - SubCommand::Delete(delete) => { - // state of container is stored in a directory named as container id inside - // root directory given in commandline options - let container_root = root_path.join(&delete.container_id); - if !container_root.exists() { - bail!("{} doesn't exists.", delete.container_id) - } - // load container state from json file, and check status of the container - // it might be possible that delete is invoked on a running container. - let container = Container::load(container_root)?.refresh_status()?; - if container.can_delete() { - if container.root.exists() { - // remove the directory storing container state - fs::remove_dir_all(&container.root)?; - - let spec = oci_spec::Spec::load("config.json")?; - // remove the cgroup created for the container - // check https://man7.org/linux/man-pages/man7/cgroups.7.html - // creating and removing cgroups section for more information on cgroups - - let cmanager = Manager::new(spec.linux.unwrap().cgroups_path)?; - cmanager.remove()?; - } - std::process::exit(0) - } else { - bail!( - "{} counld not be deleted because it was {:?}", - container.id(), - container.status() - ) - } - } - SubCommand::State(state_args) => { - let root_path = fs::canonicalize(root_path)?; - let container_root = root_path.join(state_args.container_id); - let container = Container::load(container_root)?.refresh_status()?; - println!("{}", serde_json::to_string_pretty(&container.state)?); - std::process::exit(0); - } + SubCommand::Exec(exec) => exec.exec(root_path), + SubCommand::Kill(kill) => kill.exec(root_path), + SubCommand::Delete(delete) => delete.exec(root_path, systemd_cgroup), + SubCommand::State(state) => state.exec(root_path), + SubCommand::Info(info) => info.exec(), + SubCommand::List(list) => list.exec(root_path), + SubCommand::Spec(spec) => spec.exec(), + SubCommand::Pause(pause) => pause.exec(root_path, systemd_cgroup), + SubCommand::Resume(resume) => return resume.exec(root_path, systemd_cgroup), } } diff --git a/src/namespaces.rs b/src/namespaces.rs index 6266a52be9..512e28e710 100644 --- a/src/namespaces.rs +++ b/src/namespaces.rs @@ -1,3 +1,12 @@ +//! Namespaces provide isolation of resources for processes at a kernel level. +//! The namespaces are: Mount (filesystem), +//! Process (processes in a namespace have two PIDs, one for the global PID, +//! which is used by the main system and the second one is for the child within the process tree), +//! Interprocess Communication (Control or communication between processes), +//! 
Network (which network devices can be seen by the processes in the namespace), User (User configs), +//! UTS (hostname and domain information, processes will think they're running on servers with different names), +//! Cgroup (Resource limits, execution priority etc.) + use anyhow::Result; use nix::{ fcntl, @@ -6,14 +15,12 @@ use nix::{ unistd::{self, Gid, Uid}, }; -use crate::{ - command::{linux::LinuxCommand, test::TestHelperCommand, Command}, -}; -use oci_spec::{LinuxNamespace, LinuxNamespaceType}; +use crate::command::{syscall::create_syscall, Syscall}; +use oci_spec::LinuxNamespace; pub struct Namespaces { spaces: Vec, - command: Box, + command: Box, pub clone_flags: CloneFlags, } @@ -26,11 +33,7 @@ impl From> for Namespaces { cf }, ); - let command: Box = if cfg!(test) { - Box::new(TestHelperCommand::default()) - } else { - Box::new(LinuxCommand) - }; + let command: Box = create_syscall(); Namespaces { spaces: namespaces, @@ -57,6 +60,7 @@ impl Namespaces { (space, fd) }) .collect(); + for &(space, fd) in &to_enter { self.command.set_ns(fd, space)?; unistd::close(fd)?; @@ -73,10 +77,13 @@ impl Namespaces { } } +#[cfg(test)] mod tests { + use oci_spec::LinuxNamespaceType; + use super::*; + use crate::command::test::TestHelperSyscall; - #[allow(dead_code)] fn gen_sample_linux_namespaces() -> Vec { vec![ LinuxNamespace { @@ -106,7 +113,7 @@ mod tests { fn test_namespaces_set_ns() { let sample_linux_namespaces = gen_sample_linux_namespaces(); let namespaces: Namespaces = sample_linux_namespaces.into(); - let test_command: &TestHelperCommand = namespaces.command.as_any().downcast_ref().unwrap(); + let test_command: &TestHelperSyscall = namespaces.command.as_any().downcast_ref().unwrap(); assert!(namespaces.apply_setns().is_ok()); let mut setns_args: Vec<_> = test_command @@ -126,7 +133,7 @@ mod tests { let namespaces: Namespaces = sample_linux_namespaces.into(); assert!(namespaces.apply_unshare(CloneFlags::CLONE_NEWIPC).is_ok()); - let test_command: &TestHelperCommand = namespaces.command.as_any().downcast_ref().unwrap(); + let test_command: &TestHelperSyscall = namespaces.command.as_any().downcast_ref().unwrap(); let mut unshare_args = test_command.get_unshare_args(); unshare_args.sort(); let mut expect = vec![CloneFlags::CLONE_NEWUSER | CloneFlags::CLONE_NEWPID]; diff --git a/src/notify_socket.rs b/src/notify_socket.rs index 45f9eff398..63738b61b8 100644 --- a/src/notify_socket.rs +++ b/src/notify_socket.rs @@ -1,10 +1,11 @@ +use std::env; use std::io::prelude::*; use std::os::unix::io::AsRawFd; use std::os::unix::net::{UnixListener, UnixStream}; -use std::path::Path; +use std::path::PathBuf; use anyhow::Result; -use nix::unistd::close; +use nix::unistd::{self, close}; pub const NOTIFY_FILE: &str = "notify.sock"; @@ -13,9 +14,8 @@ pub struct NotifyListener { } impl NotifyListener { - pub fn new(root: &Path) -> Result { - let _notify_file_path = root.join(NOTIFY_FILE); - let stream = UnixListener::bind("notify.sock")?; + pub fn new(socket_name: &str) -> Result { + let stream = UnixListener::bind(socket_name)?; Ok(Self { socket: stream }) } @@ -24,7 +24,7 @@ impl NotifyListener { Ok((mut socket, _addr)) => { let mut response = String::new(); socket.read_to_string(&mut response)?; - log::debug!("receive :{}", response); + log::debug!("received: {}", response); } Err(e) => println!("accept function failed: {:?}", e), } @@ -37,18 +37,25 @@ impl NotifyListener { } } -pub struct NotifySocket {} +pub struct NotifySocket { + path: PathBuf, +} impl NotifySocket { - pub fn new(_root: &Path) -> 
Result { - Ok(Self {}) + pub fn new>(socket_path: P) -> Self { + Self { + path: socket_path.into(), + } } pub fn notify_container_start(&mut self) -> Result<()> { - log::debug!("connection start"); - let mut stream = UnixStream::connect("notify.sock")?; + log::debug!("notify container start"); + let cwd = env::current_dir()?; + unistd::chdir(&*self.path.parent().unwrap())?; + let mut stream = UnixStream::connect(&self.path.file_name().unwrap())?; stream.write_all(b"start container")?; - log::debug!("write finish"); + log::debug!("notify finished"); + unistd::chdir(&*cwd)?; Ok(()) } diff --git a/src/pause.rs b/src/pause.rs new file mode 100644 index 0000000000..737d301272 --- /dev/null +++ b/src/pause.rs @@ -0,0 +1,49 @@ +//! Contains functionality of pause container command +use std::fs::canonicalize; +use std::path::PathBuf; + +use anyhow::{bail, Result}; +use clap::Clap; + +use crate::cgroups; +use crate::container::Container; +use crate::container::ContainerStatus; +use crate::utils; +use oci_spec::FreezerState; + +#[derive(Clap, Debug)] +pub struct Pause { + pub container_id: String, +} + +impl Pause { + pub fn exec(&self, root_path: PathBuf, systemd_cgroup: bool) -> Result<()> { + log::debug!("start pausing container {}", self.container_id); + let root_path = canonicalize(root_path)?; + let container_root = root_path.join(&self.container_id); + if !container_root.exists() { + bail!("{} doesn't exist.", self.container_id) + } + + let container = Container::load(container_root)?.refresh_status()?; + if !container.can_pause() { + bail!( + "{} could not be paused because it was {:?}", + self.container_id, + container.status() + ); + } + + let spec = container.spec()?; + let cgroups_path = + utils::get_cgroup_path(&spec.linux.unwrap().cgroups_path, &self.container_id); + let cmanager = cgroups::common::create_cgroup_manager(cgroups_path, systemd_cgroup)?; + cmanager.freeze(FreezerState::Frozen)?; + + log::debug!("saving paused status"); + container.update_status(ContainerStatus::Paused).save()?; + + log::debug!("container {} paused", self.container_id); + Ok(()) + } +} diff --git a/src/cond.rs b/src/pipe.rs similarity index 76% rename from src/cond.rs rename to src/pipe.rs index 946c23077d..53518eee22 100644 --- a/src/cond.rs +++ b/src/pipe.rs @@ -1,18 +1,21 @@ +//! 
Unix pipe wrapper + use std::os::unix::io::RawFd; use anyhow::Result; use nix::fcntl::OFlag; use nix::unistd::{close, pipe2, read}; -pub struct Cond { +pub struct Pipe { rfd: RawFd, wfd: RawFd, } -impl Cond { - pub fn new() -> Result { +impl Pipe { + pub fn new() -> Result { + // Set the close-on-exec flag on both ends of the pipe let (rfd, wfd) = pipe2(OFlag::O_CLOEXEC)?; - Ok(Cond { rfd, wfd }) + Ok(Pipe { rfd, wfd }) } pub fn wait(&self) -> Result<()> { @@ -22,6 +25,7 @@ impl Cond { close(self.rfd)?; Ok(()) } + pub fn notify(&self) -> Result<()> { close(self.rfd)?; close(self.wfd)?; diff --git a/src/process/child.rs b/src/process/child.rs index b5de2ca71e..65db399210 100644 --- a/src/process/child.rs +++ b/src/process/child.rs @@ -1,5 +1,5 @@ -use std::io::Write; -use std::{io::Read, time::Duration}; +use std::io::ErrorKind; +use std::io::Read; use anyhow::{bail, Result}; use mio::unix::pipe; @@ -8,52 +8,70 @@ use mio::unix::pipe::Sender; use mio::{Events, Interest, Poll, Token}; use nix::unistd::Pid; +use super::parent::ParentChannel; +use super::{MAX_EVENTS, WAIT_FOR_INIT}; use crate::process::message::Message; +// Token is used to identify which socket generated an event const CHILD: Token = Token(1); + +/// Contains sending end of pipe for parent process, receiving end of pipe +/// for the init process and poller for that pub struct ChildProcess { - sender_for_parent: Sender, + parent_channel: ParentChannel, receiver: Option, poll: Option, } +// Note: The original youki process first forks into a 'parent' (P) and a 'child' (C1) process, +// of which this struct represents the child (C1). C1 then forks again into the parent side (still C1) +// and a child (C2) process. C2 is called the init process, as it will run the command of the container. But from +// a process point of view, the init process is a child of the child process, which is itself a child of the original youki process.
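The three processes coordinate by writing single-byte `Message` values over `mio` pipes and polling the receiving end with a timeout. A minimal, self-contained sketch of that pattern (the token, timeout, and byte value here are illustrative stand-ins for youki's `CHILD`/`PARENT` tokens, `WAIT_FOR_*` durations, and `Message` discriminants):

```rust
use std::io::{Read, Write};
use std::time::Duration;

use anyhow::{bail, Result};
use mio::unix::pipe;
use mio::{Events, Interest, Poll, Token};

const READY: u8 = 0x01;       // stand-in for a Message discriminant
const PEER: Token = Token(1); // identifies the registered pipe in poll events

fn main() -> Result<()> {
    // create a pipe and register its receiving end with a poller
    let (mut sender, mut receiver) = pipe::new()?;
    let mut poll = Poll::new()?;
    poll.registry()
        .register(&mut receiver, PEER, Interest::READABLE)?;

    // the other side of the fork would write this once its setup is done
    sender.write_all(&READY.to_be_bytes())?;

    // wait (with a timeout) for the pipe to become readable, then read one byte
    let mut events = Events::with_capacity(128);
    poll.poll(&mut events, Some(Duration::from_millis(1000)))?;
    for event in events.iter() {
        if event.token() == PEER {
            let mut buf = [0u8; 1];
            receiver.read_exact(&mut buf)?;
            if u8::from_be_bytes(buf) == READY {
                println!("peer reported ready");
                return Ok(());
            }
        }
    }
    bail!("no ready message received before the timeout")
}
```

The real code additionally tolerates `ErrorKind::WouldBlock` on the non-blocking read, as the hunks in this diff show.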
impl ChildProcess { - pub fn new(sender_for_parent: Sender) -> Result { + /// create a new Child process structure + pub fn new(parent_channel: ParentChannel) -> Result { Ok(Self { - sender_for_parent, + parent_channel, receiver: None, poll: None, }) } - pub fn setup_uds(&mut self) -> Result { + /// sets up sockets for init process + pub fn setup_pipe(&mut self) -> Result { + // create a new pipe let (sender, mut receiver) = pipe::new()?; + // create a new poll, and register the receiving end of pipe to it + // This will poll for the read events, so when data is written to sending end of the pipe, + // the receiving end will be readable and poll wil notify let poll = Poll::new()?; poll.registry() .register(&mut receiver, CHILD, Interest::READABLE)?; + self.receiver = Some(receiver); self.poll = Some(poll); Ok(sender) } - pub fn ready(&mut self, init_pid: Pid) -> Result<()> { - log::debug!( - "child send to parent {:?}", - (Message::ChildReady as u8).to_be_bytes() - ); - self.write_message_for_parent(Message::ChildReady)?; - self.sender_for_parent - .write_all(&(init_pid.as_raw()).to_be_bytes())?; + /// Indicate that child process has forked the init process to parent process + pub fn notify_parent(&mut self, init_pid: Pid) -> Result<()> { + self.parent_channel.send_init_pid(init_pid)?; + Ok(()) + } + + pub fn request_identifier_mapping(&mut self) -> Result<()> { + self.parent_channel.request_identifier_mapping()?; Ok(()) } - fn write_message_for_parent(&mut self, msg: Message) -> Result<()> { - self.sender_for_parent - .write_all(&(msg as u8).to_be_bytes())?; + pub fn wait_for_mapping_ack(&mut self) -> Result<()> { + self.parent_channel.wait_for_mapping_ack()?; Ok(()) } + /// Wait for the init process to be ready pub fn wait_for_init_ready(&mut self) -> Result<()> { + // make sure pipe for init process is set up let receiver = self .receiver .as_mut() @@ -63,12 +81,26 @@ impl ChildProcess { .as_mut() .expect("Complete the setup of uds in advance."); - let mut events = Events::with_capacity(128); - poll.poll(&mut events, Some(Duration::from_millis(1000)))?; + // Create collection with capacity to store up to MAX_EVENTS events + let mut events = Events::with_capacity(MAX_EVENTS); + // poll the receiving end of pipe created for WAIT_FOR_INIT duration an event + poll.poll(&mut events, Some(WAIT_FOR_INIT))?; for event in events.iter() { + // check if the event token in PARENT + // note that this does not assign anything to PARENT, but instead compares PARENT and event.token() + // check http://patshaughnessy.net/2018/1/18/learning-rust-if-let-vs--match for a bit more detailed explanation if let CHILD = event.token() { + // read message from the init process let mut buf = [0; 1]; - receiver.read_exact(&mut buf)?; + match receiver.read_exact(&mut buf) { + // This error simply means that there are no more incoming connections waiting to be accepted at this point. + Err(ref e) if e.kind() == ErrorKind::WouldBlock => (), + Err(e) => bail!( + "Failed to receive a message from the child process. 
{:?}", + e + ), + _ => (), + } match Message::from(u8::from_be_bytes(buf)) { Message::InitReady => return Ok(()), msg => bail!("receive unexpected message {:?} in child process", msg), @@ -77,6 +109,10 @@ impl ChildProcess { unreachable!() } } - bail!("unexpected message.") + // should not reach here, as there should be a ready event from init within WAIT_FOR_INIT duration + unreachable!( + "No message received from init process within {} seconds", + WAIT_FOR_INIT.as_secs() + ); } } diff --git a/src/process/fork.rs b/src/process/fork.rs index dbe0802cbc..19d668984e 100644 --- a/src/process/fork.rs +++ b/src/process/fork.rs @@ -11,87 +11,118 @@ use init::InitProcess; use nix::sched; use nix::sys::wait::{waitpid, WaitStatus}; use nix::unistd; +use nix::unistd::Pid; -use crate::cgroups::Manager; +use crate::cgroups::common::CgroupManager; +use crate::container::Container; use crate::container::ContainerStatus; use crate::process::{child, init, parent, Process}; -use oci_spec; -use crate::utils; -use crate::{cond::Cond, container::Container}; +use crate::rootless::Rootless; +/// Function to perform the first fork for in order to run the container process pub fn fork_first>( - pid_file: Option

, - is_userns: bool, + init: bool, + pid_file: &Option

, + rootless: &Option, linux: &oci_spec::Linux, - container: &Container, - cmanager: &Manager, + container: Option<&Container>, + cmanager: Box, ) -> Result { - let ccond = Cond::new()?; + // create new parent process structure + let (mut parent, parent_channel) = parent::ParentProcess::new(rootless.clone())?; + // create a new child process structure with sending end of parent process + let mut child = child::ChildProcess::new(parent_channel)?; - let (mut parent, sender_for_parent) = parent::ParentProcess::new()?; - let child = child::ChildProcess::new(sender_for_parent)?; - - unsafe { - match unistd::fork()? { - unistd::ForkResult::Child => { - utils::set_name("rc-user")?; - - if let Some(ref r) = linux.resources { - if let Some(adj) = r.oom_score_adj { - let mut f = fs::File::create("/proc/self/oom_score_adj")?; - f.write_all(adj.to_string().as_bytes())?; - } - } - - if is_userns { - sched::unshare(sched::CloneFlags::CLONE_NEWUSER)?; + // fork the process + match unsafe { unistd::fork()? } { + // in the child process + unistd::ForkResult::Child => { + // if Out-of-memory score adjustment is set in specification. + // set the score value for the current process + // check https://dev.to/rrampage/surviving-the-linux-oom-killer-2ki9 for some more information + if let Some(ref r) = linux.resources { + if let Some(adj) = r.oom_score_adj { + let mut f = fs::File::create("/proc/self/oom_score_adj")?; + f.write_all(adj.to_string().as_bytes())?; } + } - ccond.notify()?; + // if new user is specified in specification, this will be true + // and new namespace will be created, check https://man7.org/linux/man-pages/man7/user_namespaces.7.html + // for more information + if rootless.is_some() { + log::debug!("creating new user namespace"); + sched::unshare(sched::CloneFlags::CLONE_NEWUSER)?; - Ok(Process::Child(child)) + // child needs to be dumpable, otherwise the non root parent is not + // allowed to write the uid/gid maps + prctl::set_dumpable(true).unwrap(); + child.request_identifier_mapping()?; + child.wait_for_mapping_ack()?; + prctl::set_dumpable(false).unwrap(); } - unistd::ForkResult::Parent { child } => { - ccond.wait()?; - cmanager.apply(&linux.resources.as_ref().unwrap(), child)?; + Ok(Process::Child(child)) + } + // in the parent process + unistd::ForkResult::Parent { child } => { + // wait for child to fork init process and report back its pid + let init_pid = parent.wait_for_child_ready(child)?; + log::debug!("init pid is {:?}", init_pid); + + cmanager.add_task(Pid::from_raw(init_pid))?; + if rootless.is_none() && linux.resources.is_some() && init { + cmanager.apply(&linux.resources.as_ref().unwrap())?; + } - let init_pid = parent.wait_for_child_ready()?; + if let Some(container) = container { + // update status and pid of the container process container - .update_status(ContainerStatus::Created)? + .update_status(ContainerStatus::Created) + .set_creator(nix::unistd::geteuid().as_raw()) .set_pid(init_pid) .save()?; + } - if let Some(pid_file) = pid_file { - fs::write(&pid_file, format!("{}", child))?; - } - Ok(Process::Parent(parent)) + // if file to write the pid to is specified, write pid of the child + if let Some(pid_file) = pid_file { + fs::write(&pid_file, format!("{}", child))?; } + + Ok(Process::Parent(parent)) } } } +/// Function to perform the second fork, which will spawn the actual container process pub fn fork_init(mut child_process: ChildProcess) -> Result { - let sender_for_child = child_process.setup_uds()?; - unsafe { - match unistd::fork()? 
{ - unistd::ForkResult::Child => Ok(Process::Init(InitProcess::new(sender_for_child))), - unistd::ForkResult::Parent { child } => { - child_process.wait_for_init_ready()?; - child_process.ready(child)?; + // setup sockets for init process + let sender_for_child = child_process.setup_pipe()?; + // for the process into current process (C1) (which is child of first_fork) and init process + match unsafe { unistd::fork()? } { + // if it is child process, create new InitProcess structure and return + unistd::ForkResult::Child => Ok(Process::Init(InitProcess::new(sender_for_child))), + // in the forking process C1 + unistd::ForkResult::Parent { child } => { + // wait for init process to be ready + child_process.wait_for_init_ready()?; + // notify the parent process (original youki process) that init process is forked and ready + child_process.notify_parent(child)?; - match waitpid(child, None)? { - WaitStatus::Exited(pid, status) => { - // cmanager.remove()?; - log::debug!("exited pid: {:?}, status: {:?}", pid, status); - exit(status); - } - WaitStatus::Signaled(pid, status, _) => { - log::debug!("signaled pid: {:?}, status: {:?}", pid, status); - exit(0); - } - _ => bail!("abnormal exited!"), + // wait for the init process, which is container process, to change state + // check https://man7.org/linux/man-pages/man3/wait.3p.html for more information + match waitpid(child, None)? { + // if normally exited + WaitStatus::Exited(pid, status) => { + log::debug!("exited pid: {:?}, status: {:?}", pid, status); + exit(status); + } + // if terminated by a signal + WaitStatus::Signaled(pid, status, _) => { + log::debug!("signaled pid: {:?}, status: {:?}", pid, status); + exit(0); } + _ => bail!("abnormal exited!"), } } } diff --git a/src/process/init.rs b/src/process/init.rs index d8698952a8..4615cc4c16 100644 --- a/src/process/init.rs +++ b/src/process/init.rs @@ -1,18 +1,31 @@ -use std::io::Write; +use std::{io::Write, path::PathBuf}; use anyhow::Result; use mio::unix::pipe::Sender; +use nix::{ + sched, + unistd::{Gid, Uid}, +}; -use crate::process::message::Message; +use crate::{ + capabilities, command::Syscall, namespaces::Namespaces, process::message::Message, rootfs, +}; + +/// Contains sending end for pipe for the child process pub struct InitProcess { sender_for_child: Sender, } impl InitProcess { + /// create a new Init process structure pub fn new(sender_for_child: Sender) -> Self { Self { sender_for_child } } + /// Notify that this process is ready + // The child here is in perspective of overall hierarchy + // main youki process -> child process -> init process + // the child here does not mean child of the init process pub fn ready(&mut self) -> Result<()> { log::debug!( "init send to child {:?}", @@ -22,9 +35,43 @@ impl InitProcess { Ok(()) } + #[inline] fn write_message_for_child(&mut self, msg: Message) -> Result<()> { self.sender_for_child .write_all(&(msg as u8).to_be_bytes())?; Ok(()) } } + +/// setup hostname, rootfs for the container process +pub fn setup_init_process( + spec: &oci_spec::Spec, + command: &impl Syscall, + rootfs: PathBuf, + namespaces: &Namespaces, +) -> Result<()> { + let proc = &spec.process; + + command.set_hostname(spec.hostname.as_str())?; + if proc.no_new_privileges { + let _ = prctl::set_no_new_privileges(true); + } + + rootfs::prepare_rootfs( + &spec, + &rootfs, + namespaces + .clone_flags + .contains(sched::CloneFlags::CLONE_NEWUSER), + )?; + + // change the root of filesystem of the process to the rootfs + command.pivot_rootfs(&rootfs)?; + + 
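`pivot_rootfs` is routed through the `Syscall` trait, so the diff does not show the underlying sequence. As a rough standalone sketch, the conventional pivot_root dance with `nix` looks roughly like the following; this illustrates the technique rather than the exact body of youki's `pivot_rootfs`, and it assumes the rootfs has already been bind-mounted by `prepare_rootfs` so that it is a mount point:

```rust
use std::path::Path;

use anyhow::{Context, Result};
use nix::fcntl::{open, OFlag};
use nix::mount::{umount2, MntFlags};
use nix::sys::stat::Mode;
use nix::unistd::{chdir, close, fchdir, pivot_root};

fn enter_rootfs(rootfs: &Path) -> Result<()> {
    // keep a handle on the new root so we can return to it after pivoting
    let newroot = open(rootfs, OFlag::O_DIRECTORY | OFlag::O_RDONLY, Mode::empty())
        .context("failed to open rootfs")?;

    // giving the same path for new_root and put_old stacks the old root
    // mount on top of the new one at the same location
    pivot_root(rootfs, rootfs).context("pivot_root failed")?;

    // lazily detach the old root that now shadows the mount point
    umount2("/", MntFlags::MNT_DETACH).context("failed to detach old root")?;

    // step back into the new root and make it the working directory
    fchdir(newroot)?;
    chdir("/")?;
    close(newroot)?;
    Ok(())
}
```

Using the new root for `put_old` as well avoids having to create and later remove a temporary directory for the old root.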
command.set_id(Uid::from_raw(proc.user.uid), Gid::from_raw(proc.user.gid))?; + capabilities::reset_effective(command)?; + if let Some(caps) = &proc.capabilities { + capabilities::drop_privileges(&caps, command)?; + } + Ok(()) +} diff --git a/src/process/message.rs b/src/process/message.rs index 8e9ffd5be4..386b4fb775 100644 --- a/src/process/message.rs +++ b/src/process/message.rs @@ -1,7 +1,10 @@ +/// Used as a wrapper for messages to be sent between child and parent processes #[derive(Debug)] pub enum Message { ChildReady = 0x00, InitReady = 0x01, + WriteMapping = 0x02, + MappingWritten = 0x03, } impl From for Message { @@ -9,6 +12,8 @@ impl From for Message { match from { 0x00 => Message::ChildReady, 0x01 => Message::InitReady, + 0x02 => Message::WriteMapping, + 0x03 => Message::MappingWritten, _ => panic!("unknown message."), } } diff --git a/src/process/mod.rs b/src/process/mod.rs index 01359c0a60..65b1b907ac 100644 --- a/src/process/mod.rs +++ b/src/process/mod.rs @@ -1,3 +1,8 @@ +//! Provides a thin wrapper around fork syscall, +//! with enums and functions specific to youki implemented + +use std::time::Duration; + pub mod fork; pub mod message; @@ -5,10 +10,21 @@ mod child; mod init; mod parent; -pub use init::InitProcess; +pub use init::{setup_init_process, InitProcess}; +/// Used to describe type of process after fork. +/// Parent and child processes mean the same thing as in a normal fork call +/// InitProcess is specifically used to indicate the process which will run the command of container pub enum Process { Parent(parent::ParentProcess), Child(child::ChildProcess), Init(init::InitProcess), } +/// Maximum event capacity of polling +const MAX_EVENTS: usize = 128; +/// Time to wait when polling for message from child process +const WAIT_FOR_CHILD: Duration = Duration::from_secs(5); +/// Time to wait when polling for message from init process +const WAIT_FOR_INIT: Duration = Duration::from_millis(1000); +/// Time to wait when polling for mapping ack from parent +const WAIT_FOR_MAPPING: Duration = Duration::from_secs(3); diff --git a/src/process/parent.rs b/src/process/parent.rs index fa48d5bdb1..bd1fe6d2f0 100644 --- a/src/process/parent.rs +++ b/src/process/parent.rs @@ -1,46 +1,252 @@ -use std::{io::Read, time::Duration}; +use std::io::ErrorKind; +use std::io::Read; +use std::io::Write; +use std::path::Path; +use std::process::Command; +use super::{MAX_EVENTS, WAIT_FOR_CHILD}; +use crate::process::message::Message; +use crate::process::WAIT_FOR_MAPPING; +use crate::rootless::Rootless; +use crate::utils; +use anyhow::Context; use anyhow::{bail, Result}; use mio::unix::pipe; use mio::unix::pipe::{Receiver, Sender}; use mio::{Events, Interest, Poll, Token}; +use nix::unistd::Pid; +use oci_spec::LinuxIdMapping; -use crate::process::message::Message; - +// Token is used to identify which socket generated an event const PARENT: Token = Token(0); + +/// Contains receiving end of pipe to child process and a poller for that. 
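The `Message` values crossing these pipes are the C-like enum added in `src/process/message.rs` above, so one byte is enough on the wire. A small round-trip check of that encoding, reusing the same discriminant values from the diff, might look like:

```rust
#[derive(Debug, Clone, Copy, PartialEq)]
enum Message {
    ChildReady = 0x00,
    InitReady = 0x01,
    WriteMapping = 0x02,
    MappingWritten = 0x03,
}

impl From<u8> for Message {
    fn from(from: u8) -> Self {
        match from {
            0x00 => Message::ChildReady,
            0x01 => Message::InitReady,
            0x02 => Message::WriteMapping,
            0x03 => Message::MappingWritten,
            _ => panic!("unknown message."),
        }
    }
}

fn main() {
    // encode: cast the enum to its discriminant and turn it into bytes
    let wire = (Message::WriteMapping as u8).to_be_bytes();
    // decode: read the byte back and convert it into the enum
    let decoded = Message::from(u8::from_be_bytes(wire));
    assert_eq!(decoded, Message::WriteMapping);
}
```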
pub struct ParentProcess { + child_channel: ChildChannel, +} + +// Poll is used to register and listen for various events +// by registering it with an event source such as receiving end of a pipe +impl ParentProcess { + /// Create new Parent process structure + pub fn new(rootless: Option) -> Result<(Self, ParentChannel)> { + let (parent_channel, child_channel) = Self::setup_pipes(rootless)?; + let parent = Self { child_channel }; + + Ok((parent, parent_channel)) + } + + fn setup_pipes(rootless: Option) -> Result<(ParentChannel, ChildChannel)> { + let (send_to_parent, receive_from_child) = pipe::new()?; + let (send_to_child, receive_from_parent) = pipe::new()?; + + let parent_channel = ParentChannel::new(send_to_parent, receive_from_parent)?; + let child_channel = ChildChannel::new(send_to_child, receive_from_child, rootless)?; + + Ok((parent_channel, child_channel)) + } + + /// Waits for associated child process to send ready message + /// and return the pid of init process which is forked by child process + pub fn wait_for_child_ready(&mut self, child_pid: Pid) -> Result { + let init_pid = self.child_channel.wait_for_child_ready(child_pid)?; + Ok(init_pid) + } +} + +// Channel for communicating with the parent +pub struct ParentChannel { + sender: Sender, receiver: Receiver, poll: Poll, } -impl ParentProcess { - pub fn new() -> Result<(Self, Sender)> { - let (sender, mut receiver) = pipe::new()?; +impl ParentChannel { + fn new(sender: Sender, mut receiver: Receiver) -> Result { let poll = Poll::new()?; poll.registry() .register(&mut receiver, PARENT, Interest::READABLE)?; - Ok((Self { receiver, poll }, sender)) + Ok(Self { + sender, + receiver, + poll, + }) + } + + pub fn send_init_pid(&mut self, pid: Pid) -> Result<()> { + // write ChildReady message to the pipe to parent + log::debug!("[child to parent] sending init pid ({:?})", pid); + self.write_message(Message::ChildReady)?; + // write pid of init process which is forked by child process to the pipe, + // Pid in nix::unistd is type alias of SessionId which itself is alias of i32 + self.sender.write_all(&(pid.as_raw()).to_be_bytes())?; + Ok(()) } - pub fn wait_for_child_ready(&mut self) -> Result { - let mut events = Events::with_capacity(128); - self.poll.poll(&mut events, Some(Duration::from_secs(5)))?; + // requests the parent to write the id mappings for the child process + // this needs to be done from the parent see https://man7.org/linux/man-pages/man7/user_namespaces.7.html + pub fn request_identifier_mapping(&mut self) -> Result<()> { + log::debug!("[child to parent] request identifier mapping"); + self.write_message(Message::WriteMapping)?; + Ok(()) + } + + // wait until the parent process has finished writing the id mappings + pub fn wait_for_mapping_ack(&mut self) -> Result<()> { + let mut events = Events::with_capacity(MAX_EVENTS); + log::debug!("waiting for ack from parent"); + + self.poll.poll(&mut events, Some(WAIT_FOR_MAPPING))?; for event in events.iter() { - if let PARENT = event.token() { + if event.token() == PARENT { let mut buf = [0; 1]; - self.receiver.read_exact(&mut buf)?; + match self.receiver.read_exact(&mut buf) { + Err(ref e) if e.kind() == ErrorKind::WouldBlock => (), + Err(e) => bail!( + "Failed to receive a message from the child process. 
{:?}", + e + ), + _ => (), + } + match Message::from(u8::from_be_bytes(buf)) { - Message::ChildReady => { - let mut buf = [0; 4]; - self.receiver.read_exact(&mut buf)?; - return Ok(i32::from_be_bytes(buf)); + Message::MappingWritten => return Ok(()), + msg => bail!("receive unexpected message {:?} in child process", msg), + } + } + } + unreachable!("timed out waiting for mapping ack from parent") + } + + #[inline] + fn write_message(&mut self, msg: Message) -> Result<()> { + self.sender.write_all(&(msg as u8).to_be_bytes())?; + Ok(()) + } +} + +struct ChildChannel { + sender: Sender, + receiver: Receiver, + poll: Poll, + rootless: Option, +} + +impl ChildChannel { + fn new(sender: Sender, mut receiver: Receiver, rootless: Option) -> Result { + let poll = Poll::new()?; + poll.registry() + .register(&mut receiver, PARENT, Interest::READABLE)?; + Ok(Self { + sender, + receiver, + poll, + rootless, + }) + } + + /// Waits for associated child process to send ready message + /// and return the pid of init process which is forked by child process + pub fn wait_for_child_ready(&mut self, child_pid: Pid) -> Result { + // Create collection with capacity to store up to MAX_EVENTS events + let mut events = Events::with_capacity(MAX_EVENTS); + loop { + // poll the receiving end of pipe created for WAIT_FOR_CHILD duration for an event + self.poll.poll(&mut events, Some(WAIT_FOR_CHILD))?; + for event in events.iter() { + // check if the event token in PARENT + // note that this does not assign anything to PARENT, but instead compares PARENT and event.token() + // check http://patshaughnessy.net/2018/1/18/learning-rust-if-let-vs--match for a bit more detailed explanation + if let PARENT = event.token() { + // read data from pipe + let mut buf = [0; 1]; + match self.receiver.read_exact(&mut buf) { + // This error simply means that there are no more incoming connections waiting to be accepted at this point. + Err(ref e) if e.kind() == ErrorKind::WouldBlock => { + break; + } + Err(e) => bail!( + "Failed to receive a message from the child process. {:?}", + e + ), + _ => (), + }; + // convert to Message wrapper + match Message::from(u8::from_be_bytes(buf)) { + Message::ChildReady => { + log::debug!("received child ready message"); + // read pid of init process forked by child, 4 bytes as the type is i32 + let mut buf = [0; 4]; + match self.receiver.read_exact(&mut buf) { + // This error simply means that there are no more incoming connections waiting to be accepted at this point. + Err(ref e) if e.kind() == ErrorKind::WouldBlock => (), + Err(e) => bail!( + "Failed to receive a message from the child process. 
{:?}", + e + ), + _ => (), + } + return Ok(i32::from_be_bytes(buf)); + } + Message::WriteMapping => { + log::debug!("write mapping for pid {:?}", child_pid); + utils::write_file(format!("/proc/{}/setgroups", child_pid), "deny")?; + self.write_uid_mapping(child_pid)?; + self.write_gid_mapping(child_pid)?; + self.notify_mapping_written()?; + } + msg => bail!("receive unexpected message {:?} in parent process", msg), } - msg => bail!("receive unexpected message {:?} in parent process", msg), + } else { + // as the poll is registered with only parent token + unreachable!() } - } else { - unreachable!() } } - bail!("unexpected message.") } + + fn notify_mapping_written(&mut self) -> Result<()> { + self.sender + .write_all(&(Message::MappingWritten as u8).to_be_bytes())?; + Ok(()) + } + + fn write_uid_mapping(&self, target_pid: Pid) -> Result<()> { + let rootless = self.rootless.as_ref().unwrap(); + write_id_mapping( + &format!("/proc/{}/uid_map", target_pid), + &rootless.uid_mappings, + rootless.newuidmap.as_deref(), + ) + } + + fn write_gid_mapping(&self, target_pid: Pid) -> Result<()> { + let rootless = self.rootless.as_ref().unwrap(); + write_id_mapping( + &format!("/proc/{}/gid_map", target_pid), + &rootless.gid_mappings, + rootless.newgidmap.as_deref(), + ) + } +} + +fn write_id_mapping( + map_file: &str, + mappings: &[LinuxIdMapping], + map_binary: Option<&Path>, +) -> Result<()> { + let mappings: Vec = mappings + .iter() + .map(|m| format!("{} {} {}", m.container_id, m.host_id, m.size)) + .collect(); + if mappings.len() == 1 { + utils::write_file(map_file, mappings.first().unwrap())?; + } else { + Command::new(map_binary.unwrap()) + .args(mappings) + .output() + .with_context(|| format!("failed to execute {:?}", map_binary))?; + } + + Ok(()) } diff --git a/src/resume.rs b/src/resume.rs new file mode 100644 index 0000000000..9735f81e56 --- /dev/null +++ b/src/resume.rs @@ -0,0 +1,49 @@ +//! Contains functionality of resume container command +use std::fs::canonicalize; +use std::path::PathBuf; + +use anyhow::{bail, Result}; +use clap::Clap; + +use crate::cgroups; +use crate::container::Container; +use crate::container::ContainerStatus; +use crate::utils; +use oci_spec::FreezerState; + +#[derive(Clap, Debug)] +pub struct Resume { + pub container_id: String, +} + +impl Resume { + pub fn exec(&self, root_path: PathBuf, systemd_cgroup: bool) -> Result<()> { + log::debug!("start resuming container {}", self.container_id); + let root_path = canonicalize(root_path)?; + let container_root = root_path.join(&self.container_id); + if !container_root.exists() { + bail!("{} doesn't exist.", self.container_id) + } + + let container = Container::load(container_root)?.refresh_status()?; + if !container.can_resume() { + bail!( + "{} could not be resumed because it was {:?}", + self.container_id, + container.status() + ); + } + + let spec = container.spec()?; + let cgroups_path = + utils::get_cgroup_path(&spec.linux.unwrap().cgroups_path, &self.container_id); + let cmanager = cgroups::common::create_cgroup_manager(cgroups_path, systemd_cgroup)?; + cmanager.freeze(FreezerState::Thawed)?; + + log::debug!("saving running status"); + container.update_status(ContainerStatus::Running).save()?; + + log::debug!("container {} resumed", self.container_id); + Ok(()) + } +} diff --git a/src/rootfs.rs b/src/rootfs.rs index 83705f9105..81274b1ae8 100644 --- a/src/rootfs.rs +++ b/src/rootfs.rs @@ -1,3 +1,6 @@ +//! During kernel initialization, a minimal replica of the ramfs filesystem is loaded, called rootfs. +//! 
Most systems mount another filesystem over it + use std::fs::OpenOptions; use std::fs::{canonicalize, create_dir_all, remove_file}; use std::os::unix::fs::symlink; @@ -13,8 +16,8 @@ use nix::sys::stat::{mknod, umask}; use nix::unistd::{chdir, chown, close, getcwd}; use nix::unistd::{Gid, Uid}; -use oci_spec::{LinuxDevice, LinuxDeviceType, Mount, Spec}; use crate::utils::PathBufExt; +use oci_spec::{LinuxDevice, LinuxDeviceType, Mount, Spec}; pub fn prepare_rootfs(spec: &Spec, rootfs: &Path, bind_devices: bool) -> Result<()> { let mut flags = MsFlags::MS_REC; @@ -43,7 +46,7 @@ pub fn prepare_rootfs(spec: &Spec, rootfs: &Path, bind_devices: bool) -> Result< let ml = &spec.linux.as_ref().unwrap().mount_label; if m.typ == "cgroup" { // skip - log::warn!("A feature of cgoup is unimplemented."); + log::warn!("A feature of cgroup is unimplemented."); } else if m.destination == PathBuf::from("/dev") { mount_to_container(&m, rootfs, flags & !MsFlags::MS_RDONLY, &data, &ml)?; } else { diff --git a/src/rootless.rs b/src/rootless.rs new file mode 100644 index 0000000000..3841260dad --- /dev/null +++ b/src/rootless.rs @@ -0,0 +1,139 @@ +use std::{env, path::PathBuf}; + +use anyhow::{bail, Result}; +use nix::sched::CloneFlags; +use oci_spec::{Linux, LinuxIdMapping, Mount, Spec}; + +use crate::namespaces::Namespaces; + +#[derive(Debug, Clone)] +pub struct Rootless { + /// Location of the newuidmap binary + pub newuidmap: Option, + /// Location of the newgidmap binary + pub newgidmap: Option, + /// Mappings for user ids + pub uid_mappings: Vec, + /// Mappings for group ids + pub gid_mappings: Vec, +} + +impl From<&Linux> for Rootless { + fn from(linux: &Linux) -> Self { + Self { + newuidmap: None, + newgidmap: None, + uid_mappings: linux.uid_mappings.clone(), + gid_mappings: linux.gid_mappings.clone(), + } + } +} + +pub fn detect_rootless(spec: &Spec) -> Result> { + let linux = spec.linux.as_ref().unwrap(); + + let rootless = if should_use_rootless() { + log::debug!("rootless container should be created"); + log::warn!( + "resource constraints and multi id mapping is unimplemented for rootless containers" + ); + validate(spec)?; + let mut rootless = Rootless::from(linux); + if let Some((uid_binary, gid_binary)) = lookup_map_binaries(linux)? 
{ + rootless.newuidmap = Some(uid_binary); + rootless.newgidmap = Some(gid_binary); + } + Some(rootless) + } else { + None + }; + + Ok(rootless) +} + +/// Checks if rootless mode should be used +pub fn should_use_rootless() -> bool { + if !nix::unistd::geteuid().is_root() { + return true; + } + + if let Ok("true") = std::env::var("YOUKI_USE_ROOTLESS").as_deref() { + return true; + } + + false +} + +/// Validates that the spec contains the required information for +/// running in rootless mode +pub fn validate(spec: &Spec) -> Result<()> { + let linux = spec.linux.as_ref().unwrap(); + + if linux.uid_mappings.is_empty() { + bail!("rootless containers require at least one uid mapping"); + } + + if linux.gid_mappings.is_empty() { + bail!("rootless containers require at least one gid mapping") + } + + let namespaces: Namespaces = linux.namespaces.clone().into(); + if !namespaces.clone_flags.contains(CloneFlags::CLONE_NEWUSER) { + bail!("rootless containers require the specification of a user namespace"); + } + + validate_mounts(&spec.mounts, &linux.uid_mappings, &linux.gid_mappings)?; + + Ok(()) +} + +fn validate_mounts( + mounts: &[Mount], + uid_mappings: &[LinuxIdMapping], + gid_mappings: &[LinuxIdMapping], +) -> Result<()> { + for mount in mounts { + for opt in &mount.options { + if opt.starts_with("uid=") && !is_id_mapped(&opt[4..], uid_mappings)? { + bail!("Mount {:?} specifies option {} which is not mapped inside the rootless container", mount, opt); + } + + if opt.starts_with("gid=") && !is_id_mapped(&opt[4..], gid_mappings)? { + bail!("Mount {:?} specifies option {} which is not mapped inside the rootless container", mount, opt); + } + } + } + + Ok(()) +} + +fn is_id_mapped(id: &str, mappings: &[LinuxIdMapping]) -> Result { + let id = id.parse::()?; + Ok(mappings + .iter() + .any(|m| id >= m.container_id && id <= m.container_id + m.size)) +} + +/// Looks up the location of the newuidmap and newgidmap binaries which +/// are required to write multiple user/group mappings +pub fn lookup_map_binaries(spec: &Linux) -> Result> { + if spec.uid_mappings.len() == 1 && spec.uid_mappings.len() == 1 { + return Ok(None); + } + + let uidmap = lookup_map_binary("newuidmap")?; + let gidmap = lookup_map_binary("newgidmap")?; + + match (uidmap, gidmap) { + (Some(newuidmap), Some(newgidmap)) => Ok(Some((newuidmap, newgidmap))), + _ => bail!("newuidmap/newgidmap binaries could not be found in path. This is required if multiple id mappings are specified"), + } +} + +fn lookup_map_binary(binary: &str) -> Result> { + let paths = env::var("PATH")?; + Ok(paths + .split_terminator(':') + .find(|p| PathBuf::from(p).join(binary).exists()) + .map(PathBuf::from)) +} diff --git a/src/signal.rs b/src/signal.rs index 9b2c677007..30b151f137 100644 --- a/src/signal.rs +++ b/src/signal.rs @@ -1,40 +1,101 @@ +//! 
Returns *nix signal enum value from passed string + use anyhow::{bail, Result}; use nix::sys::signal::Signal; -pub fn from_str(signal: &str) -> Result { - use Signal::*; - Ok(match signal.to_ascii_uppercase().as_str() { - "1" | "HUP" | "SIGHUP" => Signal::SIGHUP, - "2" | "INT" | "SIGINT" => Signal::SIGINT, - "3" | "QUIT" | "SIGQUIT" => Signal::SIGQUIT, - "4" | "ILL" | "SIGILL" => Signal::SIGILL, - "5" | "BUS" | "SIGBUS" => Signal::SIGBUS, - "6" | "ABRT" | "IOT" | "SIGABRT" | "SIGIOT" => Signal::SIGABRT, - "7" | "TRAP" | "SIGTRAP" => Signal::SIGTRAP, - "8" | "FPE" | "SIGFPE" => Signal::SIGFPE, - "9" | "KILL" | "SIGKILL" => Signal::SIGKILL, - "10" | "USR1" | "SIGUSR1" => Signal::SIGUSR1, - "11" | "SEGV" | "SIGSEGV" => SIGSEGV, - "12" | "USR2" | "SIGUSR2" => SIGUSR2, - "13" | "PIPE" | "SIGPIPE" => SIGPIPE, - "14" | "ALRM" | "SIGALRM" => SIGALRM, - "15" | "TERM" | "SIGTERM" => SIGTERM, - "16" | "STKFLT" | "SIGSTKFLT" => SIGSTKFLT, - "17" | "CHLD" | "SIGCHLD" => SIGCHLD, - "18" | "CONT" | "SIGCONT" => SIGCONT, - "19" | "STOP" | "SIGSTOP" => SIGSTOP, - "20" | "TSTP" | "SIGTSTP" => SIGTSTP, - "21" | "TTIN" | "SIGTTIN" => SIGTTIN, - "22" | "TTOU" | "SIGTTOU" => SIGTTOU, - "23" | "URG" | "SIGURG" => SIGURG, - "24" | "XCPU" | "SIGXCPU" => SIGXCPU, - "25" | "XFSZ" | "SIGXFSZ" => SIGXFSZ, - "26" | "VTALRM" | "SIGVTALRM" => SIGVTALRM, - "27" | "PROF" | "SIGPROF" => SIGPROF, - "28" | "WINCH" | "SIGWINCH" => SIGWINCH, - "29" | "IO" | "SIGIO" => SIGIO, - "30" | "PWR" | "SIGPWR" => SIGPWR, - "31" | "SYS" | "SIGSYS" => SIGSYS, - _ => bail! {"{} is not a valid signal", signal}, - }) +pub trait ToSignal { + fn to_signal(&self) -> Result; +} + +impl ToSignal for String { + fn to_signal(&self) -> Result { + use Signal::*; + Ok(match self.to_ascii_uppercase().as_str() { + "1" | "HUP" | "SIGHUP" => SIGHUP, + "2" | "INT" | "SIGINT" => SIGINT, + "3" | "QUIT" | "SIGQUIT" => SIGQUIT, + "4" | "ILL" | "SIGILL" => SIGILL, + "5" | "BUS" | "SIGBUS" => SIGBUS, + "6" | "ABRT" | "IOT" | "SIGABRT" | "SIGIOT" => SIGABRT, + "7" | "TRAP" | "SIGTRAP" => SIGTRAP, + "8" | "FPE" | "SIGFPE" => SIGFPE, + "9" | "KILL" | "SIGKILL" => SIGKILL, + "10" | "USR1" | "SIGUSR1" => SIGUSR1, + "11" | "SEGV" | "SIGSEGV" => SIGSEGV, + "12" | "USR2" | "SIGUSR2" => SIGUSR2, + "13" | "PIPE" | "SIGPIPE" => SIGPIPE, + "14" | "ALRM" | "SIGALRM" => SIGALRM, + "15" | "TERM" | "SIGTERM" => SIGTERM, + "16" | "STKFLT" | "SIGSTKFLT" => SIGSTKFLT, + "17" | "CHLD" | "SIGCHLD" => SIGCHLD, + "18" | "CONT" | "SIGCONT" => SIGCONT, + "19" | "STOP" | "SIGSTOP" => SIGSTOP, + "20" | "TSTP" | "SIGTSTP" => SIGTSTP, + "21" | "TTIN" | "SIGTTIN" => SIGTTIN, + "22" | "TTOU" | "SIGTTOU" => SIGTTOU, + "23" | "URG" | "SIGURG" => SIGURG, + "24" | "XCPU" | "SIGXCPU" => SIGXCPU, + "25" | "XFSZ" | "SIGXFSZ" => SIGXFSZ, + "26" | "VTALRM" | "SIGVTALRM" => SIGVTALRM, + "27" | "PROF" | "SIGPROF" => SIGPROF, + "28" | "WINCH" | "SIGWINCH" => SIGWINCH, + "29" | "IO" | "SIGIO" => SIGIO, + "30" | "PWR" | "SIGPWR" => SIGPWR, + "31" | "SYS" | "SIGSYS" => SIGSYS, + _ => bail! 
{"{} is not a valid signal", self}, + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use nix::sys::signal::Signal::*; + use std::collections::HashMap; + + #[test] + fn test_conversion_from_string() { + let mut test_sets = HashMap::new(); + test_sets.insert(SIGHUP, vec!["1", "HUP", "SIGHUP"]); + test_sets.insert(SIGINT, vec!["2", "INT", "SIGINT"]); + test_sets.insert(SIGQUIT, vec!["3", "QUIT", "SIGQUIT"]); + test_sets.insert(SIGILL, vec!["4", "ILL", "SIGILL"]); + test_sets.insert(SIGBUS, vec!["5", "BUS", "SIGBUS"]); + test_sets.insert(SIGABRT, vec!["6", "ABRT", "IOT", "SIGABRT", "SIGIOT"]); + test_sets.insert(SIGTRAP, vec!["7", "TRAP", "SIGTRAP"]); + test_sets.insert(SIGFPE, vec!["8", "FPE", "SIGFPE"]); + test_sets.insert(SIGKILL, vec!["9", "KILL", "SIGKILL"]); + test_sets.insert(SIGUSR1, vec!["10", "USR1", "SIGUSR1"]); + test_sets.insert(SIGSEGV, vec!["11", "SEGV", "SIGSEGV"]); + test_sets.insert(SIGUSR2, vec!["12", "USR2", "SIGUSR2"]); + test_sets.insert(SIGPIPE, vec!["13", "PIPE", "SIGPIPE"]); + test_sets.insert(SIGALRM, vec!["14", "ALRM", "SIGALRM"]); + test_sets.insert(SIGTERM, vec!["15", "TERM", "SIGTERM"]); + test_sets.insert(SIGSTKFLT, vec!["16", "STKFLT", "SIGSTKFLT"]); + test_sets.insert(SIGCHLD, vec!["17", "CHLD", "SIGCHLD"]); + test_sets.insert(SIGCONT, vec!["18", "CONT", "SIGCONT"]); + test_sets.insert(SIGSTOP, vec!["19", "STOP", "SIGSTOP"]); + test_sets.insert(SIGTSTP, vec!["20", "TSTP", "SIGTSTP"]); + test_sets.insert(SIGTTIN, vec!["21", "TTIN", "SIGTTIN"]); + test_sets.insert(SIGTTOU, vec!["22", "TTOU", "SIGTTOU"]); + test_sets.insert(SIGURG, vec!["23", "URG", "SIGURG"]); + test_sets.insert(SIGXCPU, vec!["24", "XCPU", "SIGXCPU"]); + test_sets.insert(SIGXFSZ, vec!["25", "XFSZ", "SIGXFSZ"]); + test_sets.insert(SIGVTALRM, vec!["26", "VTALRM", "SIGVTALRM"]); + test_sets.insert(SIGPROF, vec!["27", "PROF", "SIGPROF"]); + test_sets.insert(SIGWINCH, vec!["28", "WINCH", "SIGWINCH"]); + test_sets.insert(SIGIO, vec!["29", "IO", "SIGIO"]); + test_sets.insert(SIGPWR, vec!["30", "PWR", "SIGPWR"]); + test_sets.insert(SIGSYS, vec!["31", "SYS", "SIGSYS"]); + for (signal, strings) in test_sets { + for s in strings { + assert_eq!(signal, s.to_string().to_signal().unwrap()); + } + } + } + + #[test] + fn test_conversion_from_string_should_be_failed() { + assert!("invalid".to_string().to_signal().is_err()) + } } diff --git a/src/spec_json.rs b/src/spec_json.rs new file mode 100644 index 0000000000..38acc129b8 --- /dev/null +++ b/src/spec_json.rs @@ -0,0 +1,20 @@ +use anyhow::Result; +use clap::Clap; +use oci_spec::Spec; +use serde_json::to_writer_pretty; +use std::fs::File; + +/// Command generates a config.json +#[derive(Clap, Debug)] +pub struct SpecJson; + +/// spec Cli command +impl SpecJson { + pub fn exec(&self) -> Result<()> { + // get default values for Spec + let default_json: Spec = Default::default(); + // write data to config.json + to_writer_pretty(&File::create("config.json")?, &default_json)?; + Ok(()) + } +} diff --git a/src/start.rs b/src/start.rs index 1dc71d1072..8788221e45 100644 --- a/src/start.rs +++ b/src/start.rs @@ -1,3 +1,5 @@ +//! 
Starts execution of the container + use std::path::PathBuf; use anyhow::{bail, Result}; @@ -5,7 +7,7 @@ use clap::Clap; use nix::unistd; use crate::container::{Container, ContainerStatus}; -use crate::notify_socket::NotifySocket; +use crate::notify_socket::{NotifySocket, NOTIFY_FILE}; #[derive(Clap, Debug)] pub struct Start { @@ -16,12 +18,12 @@ impl Start { pub fn exec(&self, root_path: PathBuf) -> Result<()> { let container_root = root_path.join(&self.container_id); if !container_root.exists() { - bail!("{} doesn't exists.", self.container_id) + bail!("{} doesn't exist.", self.container_id) } let container = Container::load(container_root)?.refresh_status()?; if !container.can_start() { let err_msg = format!( - "{} counld not be started because it was {:?}", + "{} could not be started because it was {:?}", container.id(), container.status() ); @@ -31,10 +33,10 @@ impl Start { unistd::chdir(container.root.as_os_str())?; - let mut notify_socket = NotifySocket::new(&container.root)?; + let mut notify_socket = NotifySocket::new(&container.root.join(NOTIFY_FILE)); notify_socket.notify_container_start()?; - container.update_status(ContainerStatus::Running)?.save()?; + container.update_status(ContainerStatus::Running).save()?; Ok(()) } } diff --git a/src/state.rs b/src/state.rs new file mode 100644 index 0000000000..7be62193c8 --- /dev/null +++ b/src/state.rs @@ -0,0 +1,22 @@ +use std::fs; +use std::path::PathBuf; + +use anyhow::Result; +use clap::Clap; + +use crate::container::Container; + +#[derive(Clap, Debug)] +pub struct State { + pub container_id: String, +} + +impl State { + pub fn exec(&self, root_path: PathBuf) -> Result<()> { + let root_path = fs::canonicalize(root_path)?; + let container_root = root_path.join(&self.container_id); + let container = Container::load(container_root)?.refresh_status()?; + println!("{}", serde_json::to_string_pretty(&container.state)?); + std::process::exit(0); + } +} diff --git a/src/tty.rs b/src/tty.rs index e759ae888b..023853a3ac 100644 --- a/src/tty.rs +++ b/src/tty.rs @@ -1,47 +1,28 @@ +//! tty (teletype) for user-system interaction + use std::os::unix::fs::symlink; use std::os::unix::io::AsRawFd; use std::path::Path; +use anyhow::Context; use anyhow::{bail, Result}; use nix::errno::Errno; -use nix::fcntl; use nix::sys::socket; -use nix::sys::stat; +use nix::sys::uio; use nix::unistd::{close, setsid}; use crate::stdio; use crate::stdio::FileDescriptor; -pub fn ready(console_fd: FileDescriptor) -> Result<()> { - let openpty_result = nix::pty::openpty(None, None)?; - let data: &[u8] = b"/dev/ptmx"; - let iov = [nix::sys::uio::IoVec::from_slice(data)]; - let fds = [openpty_result.master]; - let cmsg = socket::ControlMessage::ScmRights(&fds); - socket::sendmsg( - console_fd.as_raw_fd(), - &iov, - &[cmsg], - socket::MsgFlags::empty(), - None, - )?; +// TODO: Handling when there isn't console-socket. 
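`setup_console` below only shows the sending side of the console socket: the pty master is passed as an `SCM_RIGHTS` control message. Whoever created the `--console-socket` receives it with `recvmsg`. A hedged sketch of that receiving side, written against the same nix API generation the diff itself uses for `sendmsg` and `IoVec` (newer nix releases changed these signatures), could be:

```rust
use std::os::unix::io::RawFd;

use anyhow::{bail, Result};
use nix::sys::socket::{recvmsg, ControlMessageOwned, MsgFlags};
use nix::sys::uio::IoVec;

/// Receive a file descriptor (e.g. the pty master) sent with SCM_RIGHTS.
fn receive_pty_master(socket_fd: RawFd) -> Result<RawFd> {
    let mut buf = [0u8; 4096];
    let iov = [IoVec::from_mut_slice(&mut buf)];
    // reserve space for the control message that carries one fd
    let mut cmsg_space = nix::cmsg_space!([RawFd; 1]);

    let msg = recvmsg(socket_fd, &iov, Some(&mut cmsg_space), MsgFlags::empty())?;
    for cmsg in msg.cmsgs() {
        if let ControlMessageOwned::ScmRights(fds) = cmsg {
            if let Some(&fd) = fds.first() {
                return Ok(fd);
            }
        }
    }
    bail!("no file descriptor received over the console socket")
}
```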
- setsid()?; - if unsafe { libc::ioctl(openpty_result.slave, libc::TIOCSCTTY) } < 0 { - log::warn!("could not TIOCSCTTY"); - }; - let slave = FileDescriptor::from(openpty_result.slave); - stdio::connect_stdio(&slave, &slave, &slave).expect("could not dup tty to stderr"); - close(console_fd.as_raw_fd())?; - Ok(()) -} - -pub fn load_console_sockets( +pub fn setup_console_socket( container_dir: &Path, - console_socket: &str, -) -> Result<(FileDescriptor, FileDescriptor)> { - let csocket = "console-stdout"; - symlink(console_socket, container_dir.join(csocket))?; + console_socket_path: &Path, + socket_name: &str, +) -> Result { + let linked = container_dir.join(socket_name); + symlink(console_socket_path, &linked)?; let mut csocketfd = socket::socket( socket::AddressFamily::Unix, @@ -51,29 +32,118 @@ pub fn load_console_sockets( )?; csocketfd = match socket::connect( csocketfd, - &socket::SockAddr::Unix(socket::UnixAddr::new(&*csocket)?), + &socket::SockAddr::Unix(socket::UnixAddr::new(&*socket_name)?), ) { Err(e) => { if e != ::nix::Error::Sys(Errno::ENOENT) { - bail!("failed to open {}", csocket); + bail!("failed to open {}", socket_name); } -1 } Ok(()) => csocketfd, }; - let console = "console"; - let consolefd = match fcntl::open( - &*console, - fcntl::OFlag::O_NOCTTY | fcntl::OFlag::O_RDWR, - stat::Mode::empty(), - ) { - Err(e) => { - if e != ::nix::Error::Sys(Errno::ENOENT) { - bail!("failed to open {}", console); - } - -1 - } - Ok(fd) => fd, + Ok(csocketfd.into()) +} + +pub fn setup_console(console_fd: &FileDescriptor) -> Result<()> { + // You can also access pty master, but it is better to use the API. + // ref. https://github.com/containerd/containerd/blob/261c107ffc4ff681bc73988f64e3f60c32233b37/vendor/github.com/containerd/go-runc/console.go#L139-L154 + let openpty_result = + nix::pty::openpty(None, None).context("could not create pseudo terminal")?; + let pty_name: &[u8] = b"/dev/ptmx"; + let iov = [uio::IoVec::from_slice(pty_name)]; + let fds = [openpty_result.master]; + let cmsg = socket::ControlMessage::ScmRights(&fds); + socket::sendmsg( + console_fd.as_raw_fd(), + &iov, + &[cmsg], + socket::MsgFlags::empty(), + None, + ) + .context("failed to send pty master")?; + + setsid()?; + if unsafe { libc::ioctl(openpty_result.slave, libc::TIOCSCTTY) } < 0 { + log::warn!("could not TIOCSCTTY"); }; - Ok((csocketfd.into(), consolefd.into())) + let slave = FileDescriptor::from(openpty_result.slave); + stdio::connect_stdio(&slave, &slave, &slave).expect("could not dup tty to stderr"); + close(console_fd.as_raw_fd()).context("could not close console socket")?; + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + use std::env; + use std::fs::{self, File}; + use std::os::unix::net::UnixListener; + use std::path::PathBuf; + + use serial_test::serial; + + use crate::utils::{create_temp_dir, TempDir}; + + const CONSOLE_SOCKET: &str = "console-socket"; + + fn setup(testname: &str) -> Result<(TempDir, PathBuf, PathBuf)> { + let testdir = create_temp_dir(testname)?; + let rundir_path = Path::join(&testdir, "run"); + let _ = fs::create_dir(&rundir_path)?; + let socket_path = Path::new(&rundir_path).join("socket"); + let _ = File::create(&socket_path); + env::set_current_dir(&testdir)?; + Ok((testdir, rundir_path, socket_path)) + } + + #[test] + #[serial] + fn test_setup_console_socket() { + let init = setup("test_setup_console_socket"); + assert!(init.is_ok()); + let (testdir, rundir_path, socket_path) = init.unwrap(); + let lis = UnixListener::bind(Path::join(&testdir, "console-socket")); + 
assert!(lis.is_ok()); + let fd = setup_console_socket(&&rundir_path, &socket_path, CONSOLE_SOCKET); + assert!(fd.is_ok()); + assert_ne!(fd.unwrap().as_raw_fd(), -1); + } + + #[test] + #[serial] + fn test_setup_console_socket_empty() { + let init = setup("test_setup_console_socket_empty"); + assert!(init.is_ok()); + let (_testdir, rundir_path, socket_path) = init.unwrap(); + let fd = setup_console_socket(&rundir_path, &socket_path, CONSOLE_SOCKET); + assert!(fd.is_ok()); + assert_eq!(fd.unwrap().as_raw_fd(), -1); + } + + #[test] + #[serial] + fn test_setup_console_socket_invalid() { + let init = setup("test_setup_console_socket_invalid"); + assert!(init.is_ok()); + let (testdir, rundir_path, socket_path) = init.unwrap(); + let _socket = File::create(Path::join(&testdir, "console-socket")); + assert!(_socket.is_ok()); + let fd = setup_console_socket(&rundir_path, &socket_path, CONSOLE_SOCKET); + assert!(fd.is_err()); + } + + #[test] + #[serial] + fn test_setup_console() { + let init = setup("test_setup_console"); + assert!(init.is_ok()); + let (testdir, rundir_path, socket_path) = init.unwrap(); + let lis = UnixListener::bind(Path::join(&testdir, "console-socket")); + assert!(lis.is_ok()); + let fd = setup_console_socket(&&rundir_path, &socket_path, CONSOLE_SOCKET); + let status = setup_console(&fd.unwrap()); + assert!(status.is_ok()); + } } diff --git a/src/utils.rs b/src/utils.rs index 39a7f6b66b..916e50d9a7 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,8 +1,15 @@ +//! Utility functionality + +use std::env; use std::ffi::CString; +use std::fs::{self, File}; +use std::ops::Deref; use std::path::{Path, PathBuf}; +use std::time::Duration; +use anyhow::Context; use anyhow::{bail, Result}; -use nix::{env::clearenv, errno::Errno, unistd}; +use nix::unistd; pub trait PathBufExt { fn as_in_container(&self) -> Result; @@ -12,7 +19,7 @@ pub trait PathBufExt { impl PathBufExt for PathBuf { fn as_in_container(&self) -> Result { if self.is_relative() { - bail!("Relative path cannnot be converted to the path in the container.") + bail!("Relative path cannot be converted to the path in the container.") } else { let path_string = self.to_string_lossy().into_owned(); Ok(PathBuf::from(path_string[1..].to_string())) @@ -22,7 +29,7 @@ impl PathBufExt for PathBuf { fn join_absolute_path(&self, p: &Path) -> Result { if !p.is_absolute() && !p.as_os_str().is_empty() { bail!( - "connnot join {:?} because it is not the absolute path.", + "cannot join {:?} because it is not the absolute path.", p.display() ) } @@ -36,34 +43,119 @@ pub fn do_exec(path: impl AsRef, args: &[String], envs: &[String]) -> Resu .iter() .map(|s| CString::new(s.to_string()).unwrap_or_default()) .collect(); - let envs: Vec = envs - .iter() - .map(|s| CString::new(s.to_string()).unwrap_or_default()) - .collect(); - unsafe { - clearenv()?; - } - for e in envs { - putenv(&e)? - } + // clear env vars + env::vars().for_each(|(key, _value)| std::env::remove_var(key)); + // set env vars + envs.iter().for_each(|e| { + let mut split = e.split('='); + if let Some(key) = split.next() { + let value: String = split.collect::>().join("="); + env::set_var(key, value) + }; + }); unistd::execvp(&p, &a)?; Ok(()) } -#[inline] -fn putenv(string: &CString) -> nix::Result<()> { - let ptr = string.clone().into_raw(); - let res = unsafe { libc::putenv(ptr as *mut libc::c_char) }; - Errno::result(res).map(drop) -} - // TODO implement pub fn set_name(_name: &str) -> Result<()> { Ok(()) } +/// If None, it will generate a default path for cgroups. 
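Looping back to the rewritten `do_exec` just above: instead of `putenv`, it now clears the inherited environment and re-applies each `KEY=VALUE` string itself, splitting on '=' and re-joining the tail so that values containing '=' survive. A tiny standalone check of that parsing (the `parse_envs` helper is illustrative, not part of the diff):

```rust
use std::collections::HashMap;

// Parse "KEY=VALUE" pairs the way the new do_exec applies them: split on '=',
// take the first piece as the key, and re-join the rest as the value.
fn parse_envs(envs: &[String]) -> HashMap<String, String> {
    let mut out = HashMap::new();
    for e in envs {
        let mut split = e.split('=');
        if let Some(key) = split.next() {
            let value = split.collect::<Vec<&str>>().join("=");
            out.insert(key.to_string(), value);
        }
    }
    out
}

fn main() {
    let envs = vec![
        "TERM=xterm".to_string(),
        "OPTS=--log=debug".to_string(), // value itself contains '='
    ];
    let parsed = parse_envs(&envs);
    assert_eq!(parsed["TERM"], "xterm");
    assert_eq!(parsed["OPTS"], "--log=debug");
}
```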
+pub fn get_cgroup_path(cgroups_path: &Option, container_id: &str) -> PathBuf { + match cgroups_path { + Some(cpath) => cpath.clone(), + None => PathBuf::from(format!("/youki/{}", container_id)), + } +} + +pub fn delete_with_retry>(path: P) -> Result<()> { + let mut attempts = 0; + let mut delay = Duration::from_millis(10); + let path = path.as_ref(); + + while attempts < 5 { + if fs::remove_dir(path).is_ok() { + return Ok(()); + } + + std::thread::sleep(delay); + attempts += attempts; + delay *= attempts; + } + + bail!("could not delete {:?}", path) +} + +pub fn write_file, C: AsRef<[u8]>>(path: P, contents: C) -> Result<()> { + let path = path.as_ref(); + fs::write(path, contents).with_context(|| format!("failed to write to {:?}", path))?; + Ok(()) +} + +pub fn create_dir_all>(path: P) -> Result<()> { + let path = path.as_ref(); + fs::create_dir_all(path).with_context(|| format!("failed to create directory {:?}", path)) +} + +pub fn open>(path: P) -> Result { + let path = path.as_ref(); + File::open(path).with_context(|| format!("failed to open {:?}", path)) +} + +pub struct TempDir { + path: Option, +} + +impl TempDir { + pub fn new>(path: P) -> Result { + let p = path.into(); + std::fs::create_dir_all(&p)?; + Ok(Self { path: Some(p) }) + } + + pub fn path(&self) -> &Path { + self.path + .as_ref() + .expect("temp dir has already been removed") + } + + pub fn remove(&mut self) { + if let Some(p) = &self.path { + let _ = fs::remove_dir_all(p); + self.path = None; + } + } +} + +impl Drop for TempDir { + fn drop(&mut self) { + self.remove(); + } +} + +impl AsRef for TempDir { + fn as_ref(&self) -> &Path { + self.path() + } +} + +impl Deref for TempDir { + type Target = Path; + + fn deref(&self) -> &Self::Target { + self.path() + } +} + +pub fn create_temp_dir(test_name: &str) -> Result { + let dir = TempDir::new(std::env::temp_dir().join(test_name))?; + Ok(dir) +} + #[cfg(test)] mod tests { use super::*; @@ -80,11 +172,21 @@ mod tests { #[test] fn test_join_absolute_path_error() { + assert!(PathBuf::from("sample/a/") + .join_absolute_path(&PathBuf::from("b/c")) + .is_err(),); + } + + #[test] + fn test_get_cgroup_path() { + let cid = "sample_container_id"; assert_eq!( - PathBuf::from("sample/a/") - .join_absolute_path(&PathBuf::from("b/c")) - .is_err(), - true + get_cgroup_path(&None, cid), + PathBuf::from("/youki/sample_container_id") + ); + assert_eq!( + get_cgroup_path(&Some(PathBuf::from("/youki")), cid), + PathBuf::from("/youki") ); } } diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 0000000000..bacb5bc9ed --- /dev/null +++ b/tests/README.md @@ -0,0 +1,8 @@ +# Integration test + +## Usage +Here is a preview implementation of the integration test. + +``` +$ cargo test --test integration +``` diff --git a/tests/integration.rs b/tests/integration.rs new file mode 100644 index 0000000000..497ffdac69 --- /dev/null +++ b/tests/integration.rs @@ -0,0 +1,19 @@ +use std::env; +use std::path::PathBuf; +use std::process::{Command, Stdio}; + +#[test] +fn main() { + let current_dir_path_result = env::current_dir(); + let current_dir_path = match current_dir_path_result { + Ok(path_buf) => path_buf, + Err(_) => panic!("directory is not found"), + }; + let youki_path = current_dir_path.join(PathBuf::from("youki")); + let status = Command::new(youki_path) + .stdout(Stdio::null()) + .arg("-h") + .status() + .expect("failed to execute process"); + assert!(status.success()); +}
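One last note on the rootless path added in `parent.rs` and `rootless.rs`: with a single `LinuxIdMapping`, the parent writes the mapping directly into `/proc/<pid>/uid_map` and `/proc/<pid>/gid_map` after denying `setgroups`; only multiple mappings go through `newuidmap`/`newgidmap`. A minimal sketch of that single-mapping file format (the `IdMapping` struct is a hypothetical stand-in for the OCI `LinuxIdMapping` fields):

```rust
use std::fs;

use anyhow::Result;
use nix::unistd::Pid;

/// Hypothetical stand-in for oci_spec::LinuxIdMapping.
struct IdMapping {
    container_id: u32,
    host_id: u32,
    size: u32,
}

fn write_single_mapping(pid: Pid, uid: &IdMapping, gid: &IdMapping) -> Result<()> {
    // an unprivileged writer must disable setgroups before it may write gid_map
    fs::write(format!("/proc/{}/setgroups", pid), "deny")?;

    // the kernel expects lines of the form "<container id> <host id> <size>"
    fs::write(
        format!("/proc/{}/uid_map", pid),
        format!("{} {} {}", uid.container_id, uid.host_id, uid.size),
    )?;
    fs::write(
        format!("/proc/{}/gid_map", pid),
        format!("{} {} {}", gid.container_id, gid.host_id, gid.size),
    )?;
    Ok(())
}
```

The diff's `write_id_mapping` helper keeps both cases in one place by falling back to invoking the map binary whenever more than one mapping is given.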