diff --git a/Cargo.lock b/Cargo.lock index fcb3df086cd..6865762136f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -64,9 +64,9 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anstream" -version = "0.6.18" +version = "0.6.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" +checksum = "301af1932e46185686725e0fad2f8f2aa7da69dd70bf6ecc44d6b703844a3933" dependencies = [ "anstyle", "anstyle-parse", @@ -79,39 +79,45 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.10" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" +checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd" [[package]] name = "anstyle-parse" -version = "0.2.6" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" +checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" dependencies = [ "utf8parse", ] [[package]] name = "anstyle-query" -version = "1.1.2" +version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" +checksum = "6c8bdeb6047d8983be085bab0ba1472e6dc604e7041dbf6fcd5e71523014fae9" dependencies = [ "windows-sys", ] [[package]] name = "anstyle-wincon" -version = "3.0.8" +version = "3.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6680de5231bd6ee4c6191b8a1325daa282b415391ec9d3a37bd34f2060dc73fa" +checksum = "403f75924867bb1033c59fbf0797484329750cfbe3c4325cd33127941fabc882" dependencies = [ "anstyle", "once_cell_polyfill", "windows-sys", ] +[[package]] +name = "anyhow" +version = "1.0.98" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" + [[package]] name = "arrayvec" version = "0.7.6" @@ -246,9 +252,9 @@ checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" [[package]] name = "bumpalo" -version = "3.17.0" +version = "3.18.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" +checksum = "793db76d6187cd04dff33004d8e6c9cc4e05cd330500379d2394209271b4aeee" [[package]] name = "byteorder" @@ -274,9 +280,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.2.25" +version = "1.2.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0fc897dc1e865cc67c0e05a836d9d3f1df3cbe442aa4a9473b18e12624a4951" +checksum = "956a5e21988b87f372569b66183b78babf23ebc2e744b733e4350a752c4dafac" dependencies = [ "jobserver", "libc", @@ -294,9 +300,9 @@ dependencies = [ [[package]] name = "cfg-if" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +checksum = "9555578bc9e57714c812a1f84e4fc5b4d21fcb063490c624de019f7464c91268" [[package]] name = "ciborium" @@ -348,9 +354,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.39" +version = "4.5.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd60e63e9be68e5fb56422e397cf9baddded06dae1d2e523401542383bc72a9f" +checksum = "40b6887a1d8685cebccf115538db5c0efe625ccac9696ad45c409d96566e910f" dependencies = [ "clap_builder", "clap_derive", @@ -367,9 +373,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.39" +version = "4.5.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"89cc6392a1f72bbeb820d71f32108f61fdaf18bc526e1d23954168a67759ef51" +checksum = "e0c66c08ce9f0c698cbce5c0279d0bb6ac936d8674174fe48f736533b964f59e" dependencies = [ "anstream", "anstyle", @@ -379,9 +385,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.32" +version = "4.5.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09176aae279615badda0765c0c0b3f6ed53f4709118af73cf4655d85d1530cd7" +checksum = "d2c7947ae4cc3d851207c1adb5b5e260ff0cca11446b1d6d1423788e442257ce" dependencies = [ "heck", "proc-macro2", @@ -391,9 +397,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.7.4" +version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" +checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" [[package]] name = "clippy-tracing" @@ -419,9 +425,9 @@ dependencies = [ [[package]] name = "colorchoice" -version = "1.0.3" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" +checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" [[package]] name = "cpu-template-helper" @@ -673,25 +679,14 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.16" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" -dependencies = [ - "cfg-if", - "libc", - "wasi 0.11.0+wasi-snapshot-preview1", -] - -[[package]] -name = "getrandom" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73fea8450eea4bac3940448fb7ae50d91f034f941199fcd9d909a5a07aa455f0" +checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" dependencies = [ "cfg-if", "libc", "r-efi", - "wasi 0.14.2+wasi-0.2.4", + "wasi", ] 
[[package]] @@ -722,9 +717,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.15.3" +version = "0.15.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84b26c544d002229e640969970a2e74021aadf6e2f96372b9c58eff97de08eb3" +checksum = "5971ac85611da7067dbfcabef3c70ebb5606018acd9e2a3903a0da507521e0d5" [[package]] name = "heck" @@ -850,7 +845,7 @@ version = "0.1.33" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "38f262f097c174adebe41eb73d66ae9c06b2844fb0da69969647bbddd9b0538a" dependencies = [ - "getrandom 0.3.2", + "getrandom", "libc", ] @@ -912,7 +907,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667" dependencies = [ "cfg-if", - "windows-targets", + "windows-targets 0.53.0", ] [[package]] @@ -1058,6 +1053,7 @@ name = "pci" version = "0.1.0" dependencies = [ "byteorder", + "displaydoc", "libc", "log", "serde", @@ -1087,9 +1083,9 @@ dependencies = [ [[package]] name = "portable-atomic" -version = "1.11.0" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "350e9b48cbc6b0e028b0473b114454c6316e57336ee184ceab6e53f72c178b3e" +checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483" [[package]] name = "portable-atomic-util" @@ -1130,15 +1126,15 @@ dependencies = [ [[package]] name = "proptest" -version = "1.6.0" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14cae93065090804185d3b75f0bf93b8eeda30c7a9b4a33d3bdb3988d6229e50" +checksum = "6fcdab19deb5195a31cf7726a210015ff1496ba1464fd42cb4f537b8b01b471f" dependencies = [ "bitflags 2.9.1", "lazy_static", "num-traits", - "rand 0.8.5", - "rand_chacha 0.3.1", + "rand", + "rand_chacha", "rand_xorshift", "regex-syntax", "unarray", @@ -1159,35 +1155,14 @@ version = "5.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" -[[package]] -name = "rand" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" -dependencies = [ - "libc", - "rand_chacha 0.3.1", - "rand_core 0.6.4", -] - [[package]] name = "rand" version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97" dependencies = [ - "rand_chacha 0.9.0", - "rand_core 0.9.3", -] - -[[package]] -name = "rand_chacha" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" -dependencies = [ - "ppv-lite86", - "rand_core 0.6.4", + "rand_chacha", + "rand_core", ] [[package]] @@ -1197,16 +1172,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" dependencies = [ "ppv-lite86", - "rand_core 0.9.3", -] - -[[package]] -name = "rand_core" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" -dependencies = [ - "getrandom 0.2.16", + "rand_core", ] [[package]] @@ -1215,16 +1181,16 @@ version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" dependencies = [ - "getrandom 0.3.2", + "getrandom", ] [[package]] name = "rand_xorshift" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d25bf25ec5ae4a3f1b92f929810509a2f53d7dca2f50b794ff57e3face536c8f" +checksum = "513962919efc330f829edb2535844d1b912b0fbe2ca165d613e4e8788bb05a5a" dependencies = [ - "rand_core 0.6.4", + "rand_core", ] [[package]] @@ -1365,9 +1331,9 
@@ dependencies = [ [[package]] name = "serde_spanned" -version = "0.6.8" +version = "0.6.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87607cb1398ed59d48732e575a4c28a7a8ebf2454b964fe3f224f2afc07909e1" +checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3" dependencies = [ "serde", ] @@ -1417,9 +1383,9 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" -version = "2.0.101" +version = "2.0.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf" +checksum = "f6397daf94fa90f058bd0fd88429dd9e5738999cca8d701813c80723add80462" dependencies = [ "proc-macro2", "quote", @@ -1487,9 +1453,9 @@ dependencies = [ [[package]] name = "toml" -version = "0.8.22" +version = "0.8.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05ae329d1f08c4d17a59bed7ff5b5a769d062e64a62d34a3261b219e62cd5aae" +checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" dependencies = [ "serde", "serde_spanned", @@ -1499,18 +1465,18 @@ dependencies = [ [[package]] name = "toml_datetime" -version = "0.6.9" +version = "0.6.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3da5db5a963e24bc68be8b17b6fa82814bb22ee8660f192bb182771d498f09a3" +checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" dependencies = [ "serde", ] [[package]] name = "toml_edit" -version = "0.22.26" +version = "0.22.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "310068873db2c5b3e7659d2cc35d21855dbafa50d1ce336397c666e3cb08137e" +checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" dependencies = [ "indexmap", "serde", @@ -1522,9 +1488,9 @@ dependencies = [ [[package]] name = "toml_write" -version = "0.1.1" +version = "0.1.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfb942dfe1d8e29a7ee7fcbde5bd2b9a25fb89aa70caea2eba3bee836ff41076" +checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801" [[package]] name = "typenum" @@ -1619,9 +1585,9 @@ version = "1.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3cf4199d1e5d15ddd86a694e4d0dffa9c323ce759fea589f00fef9d81cc1931d" dependencies = [ - "getrandom 0.3.2", + "getrandom", "js-sys", - "rand 0.9.1", + "rand", "wasm-bindgen", ] @@ -1657,6 +1623,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c2fce39487bd03b5b0ab176f584682e9eaab7875254bafd3d188c69c85fce6e" dependencies = [ "libc", + "serde", "thiserror 2.0.12", ] @@ -1697,11 +1664,13 @@ version = "0.1.0" dependencies = [ "acpi_tables", "aes-gcm", + "anyhow", "arrayvec", "aws-lc-rs", "base64", "bincode", "bitflags 2.9.1", + "byteorder", "crc64", "criterion", "derive_more", @@ -1762,12 +1731,6 @@ dependencies = [ "winapi-util", ] -[[package]] -name = "wasi" -version = "0.11.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" - [[package]] name = "wasi" version = "0.14.2+wasi-0.2.4" @@ -1884,7 +1847,7 @@ version = "0.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" dependencies = [ - "windows-targets", + "windows-targets 0.52.6", ] [[package]] @@ -1893,14 +1856,30 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_gnullvm", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 
0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1e4c7e8ceaaf9cb7d7507c974735728ab453b67ef8f18febdd7c11fe59dca8b" +dependencies = [ + "windows_aarch64_gnullvm 0.53.0", + "windows_aarch64_msvc 0.53.0", + "windows_i686_gnu 0.53.0", + "windows_i686_gnullvm 0.53.0", + "windows_i686_msvc 0.53.0", + "windows_x86_64_gnu 0.53.0", + "windows_x86_64_gnullvm 0.53.0", + "windows_x86_64_msvc 0.53.0", ] [[package]] @@ -1909,48 +1888,96 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" + [[package]] name = "windows_aarch64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" + [[package]] name = "windows_i686_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" +[[package]] +name = "windows_i686_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" + [[package]] name = "windows_i686_gnullvm" 
version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" + [[package]] name = "windows_i686_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" +[[package]] +name = "windows_i686_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" + [[package]] name = "windows_x86_64_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" + [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" + [[package]] name = "windows_x86_64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" + [[package]] name = "winnow" version = "0.7.10" diff --git a/resources/seccomp/aarch64-unknown-linux-musl.json b/resources/seccomp/aarch64-unknown-linux-musl.json index db3abe1eced..e3aaeaf911b 100644 --- a/resources/seccomp/aarch64-unknown-linux-musl.json +++ b/resources/seccomp/aarch64-unknown-linux-musl.json @@ -1017,6 +1017,49 @@ { "syscall": "restart_syscall", "comment": "automatically issued by the kernel when specific timing-related syscalls (e.g. nanosleep) get interrupted by SIGSTOP" + }, + { + "syscall": "ioctl", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 44547, + "comment": "KVM_CHECK_EXTENSION" + }, + { + "index": 2, + "type": "dword", + "op": "eq", + "val": 131, + "comment": "KVM_CAP_MSI_DEVID" + } + ] + }, + { + "syscall": "ioctl", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1074310762, + "comment": "KVM_SET_GSI_ROUTING" + } + ] + }, + { + "syscall": "ioctl", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1075883638, + "comment": "KVM_IRQFD" + } + ] } ] } diff --git a/resources/seccomp/x86_64-unknown-linux-musl.json b/resources/seccomp/x86_64-unknown-linux-musl.json index 95ceca1b7ef..3dcdbf659d1 100644 --- a/resources/seccomp/x86_64-unknown-linux-musl.json +++ b/resources/seccomp/x86_64-unknown-linux-musl.json @@ -1149,6 +1149,49 @@ { "syscall": "restart_syscall", "comment": "automatically issued by the kernel when specific timing-related syscalls (e.g. 
nanosleep) get interrupted by SIGSTOP" + }, + { + "syscall": "ioctl", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 44547, + "comment": "KVM_CHECK_EXTENSION" + }, + { + "index": 2, + "type": "dword", + "op": "eq", + "val": 131, + "comment": "KVM_CAP_MSI_DEVID" + } + ] + }, + { + "syscall": "ioctl", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1074310762, + "comment": "KVM_SET_GSI_ROUTING" + } + ] + }, + { + "syscall": "ioctl", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1075883638, + "comment": "KVM_IRQFD" + } + ] } ] } diff --git a/src/firecracker/Cargo.toml b/src/firecracker/Cargo.toml index 57aeabc1648..9f659f16c8c 100644 --- a/src/firecracker/Cargo.toml +++ b/src/firecracker/Cargo.toml @@ -42,7 +42,10 @@ serde_json = "1.0.140" [dev-dependencies] cargo_toml = "0.22.1" libc = "0.2.172" -regex = { version = "1.11.1", default-features = false, features = ["std", "unicode-perl"] } +regex = { version = "1.11.1", default-features = false, features = [ + "std", + "unicode-perl", +] } # Dev-Dependencies for uffd examples serde = { version = "1.0.219", features = ["derive"] } diff --git a/src/pci/Cargo.toml b/src/pci/Cargo.toml index c88cd270b23..3549d5010fe 100644 --- a/src/pci/Cargo.toml +++ b/src/pci/Cargo.toml @@ -13,6 +13,7 @@ default = [] [dependencies] byteorder = "1.5.0" +displaydoc = "0.2.5" libc = "0.2.172" log = "0.4.27" serde = { version = "1.0.219", features = ["derive"] } diff --git a/src/pci/src/bus.rs b/src/pci/src/bus.rs index cb42b4ee9c5..775238edff9 100644 --- a/src/pci/src/bus.rs +++ b/src/pci/src/bus.rs @@ -24,7 +24,7 @@ const DEVICE_ID_INTEL_VIRT_PCIE_HOST: u16 = 0x0d57; const NUM_DEVICE_IDS: usize = 32; /// Errors for device manager. -#[derive(Debug)] +#[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum PciRootError { /// Could not allocate device address space for the device. 
AllocateDeviceAddrs(PciDeviceError), @@ -103,7 +103,7 @@ impl PciDevice for PciRoot { pub struct PciBus { /// Devices attached to this bus. /// Device 0 is host bridge. - devices: HashMap>>, + pub devices: HashMap>>, device_reloc: Arc, device_ids: Vec, } diff --git a/src/pci/src/configuration.rs b/src/pci/src/configuration.rs index 3a53167148c..c37f8026fbe 100644 --- a/src/pci/src/configuration.rs +++ b/src/pci/src/configuration.rs @@ -409,7 +409,7 @@ struct PciBar { r#type: Option, } -#[derive(Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct PciConfigurationState { registers: Vec, writable_bits: Vec, @@ -466,7 +466,7 @@ impl From for PciBarType { } } -#[derive(Copy, Clone)] +#[derive(Debug, Copy, Clone, Serialize, Deserialize)] pub enum PciBarPrefetchable { NotPrefetchable = 0, Prefetchable = 0x08, @@ -481,7 +481,7 @@ impl From for bool { } } -#[derive(Copy, Clone)] +#[derive(Debug, Copy, Clone, Serialize, Deserialize)] pub struct PciBarConfiguration { addr: u64, size: u64, diff --git a/src/pci/src/device.rs b/src/pci/src/device.rs index d3bd3056a36..bf89331faa9 100644 --- a/src/pci/src/device.rs +++ b/src/pci/src/device.rs @@ -6,7 +6,6 @@ // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause use std::any::Any; -use std::fmt::{self, Display}; use std::sync::{Arc, Barrier}; use std::{io, result}; @@ -16,39 +15,21 @@ use vm_device::Resource; use crate::configuration::{self, PciBarRegionType}; use crate::PciBarConfiguration; -#[derive(Debug)] +#[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum Error { - /// Setup of the device capabilities failed. + /// Setup of the device capabilities failed: {0}. CapabilitiesSetup(configuration::Error), - /// Allocating space for an IO BAR failed. + /// Allocating space for an IO BAR failed, size={0}. IoAllocationFailed(u64), - /// Registering an IO BAR failed. 
+ /// Registering an IO BAR at address {0} failed: {1} IoRegistrationFailed(u64, configuration::Error), /// Expected resource not found. MissingResource, - /// Invalid resource. + /// Invalid resource InvalidResource(Resource), } pub type Result = std::result::Result; -impl Display for Error { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - use self::Error::*; - - match self { - CapabilitiesSetup(e) => write!(f, "failed to add capability {e}"), - IoAllocationFailed(size) => { - write!(f, "failed to allocate space for an IO BAR, size={size}") - } - IoRegistrationFailed(addr, e) => { - write!(f, "failed to register an IO BAR, addr={addr} err={e}") - } - MissingResource => write!(f, "failed to find expected resource"), - InvalidResource(r) => write!(f, "invalid resource {r:?}"), - } - } -} - #[derive(Clone, Copy)] pub struct BarReprogrammingParams { pub old_base: u64, diff --git a/src/pci/src/lib.rs b/src/pci/src/lib.rs index 2672159e474..3162da292de 100644 --- a/src/pci/src/lib.rs +++ b/src/pci/src/lib.rs @@ -24,15 +24,18 @@ use serde::de::Visitor; pub use self::bus::{PciBus, PciConfigIo, PciConfigMmio, PciRoot, PciRootError}; pub use self::configuration::{ PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciCapability, PciCapabilityId, - PciClassCode, PciConfiguration, PciExpressCapabilityId, PciHeaderType, PciMassStorageSubclass, - PciNetworkControllerSubclass, PciProgrammingInterface, PciSerialBusSubClass, PciSubclass, - PCI_CONFIGURATION_ID, + PciClassCode, PciConfiguration, PciConfigurationState, PciExpressCapabilityId, PciHeaderType, + PciMassStorageSubclass, PciNetworkControllerSubclass, PciProgrammingInterface, + PciSerialBusSubClass, PciSubclass, PCI_CONFIGURATION_ID, }; pub use self::device::{ BarReprogrammingParams, DeviceRelocation, Error as PciDeviceError, PciDevice, }; pub use self::msi::{msi_num_enabled_vectors, MsiCap, MsiConfig}; -pub use self::msix::{MsixCap, MsixConfig, MsixTableEntry, MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE}; 
+pub use self::msix::{ + Error as MsixError, MsixCap, MsixConfig, MsixConfigState, MsixTableEntry, MSIX_CONFIG_ID, + MSIX_TABLE_ENTRY_SIZE, +}; /// PCI has four interrupt pins A->D. #[derive(Copy, Clone)] diff --git a/src/pci/src/msix.rs b/src/pci/src/msix.rs index 4b3cf688980..be5aa3b8cf1 100644 --- a/src/pci/src/msix.rs +++ b/src/pci/src/msix.rs @@ -26,7 +26,7 @@ const MSIX_ENABLE_MASK: u16 = (1 << MSIX_ENABLE_BIT) as u16; pub const MSIX_TABLE_ENTRY_SIZE: usize = 16; pub const MSIX_CONFIG_ID: &str = "msix_config"; -#[derive(Debug)] +#[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum Error { /// Failed enabling the interrupt route. EnableInterruptRoute(io::Error), @@ -59,7 +59,7 @@ impl Default for MsixTableEntry { } } -#[derive(Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct MsixConfigState { table_entries: Vec, pba_entries: Vec, @@ -71,11 +71,23 @@ pub struct MsixConfig { pub table_entries: Vec, pub pba_entries: Vec, pub devid: u32, - interrupt_source_group: Arc, + pub interrupt_source_group: Arc, masked: bool, enabled: bool, } +impl std::fmt::Debug for MsixConfig { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("MsixConfig") + .field("table_entries", &self.table_entries) + .field("pba_entries", &self.pba_entries) + .field("devid", &self.devid) + .field("masked", &self.masked) + .field("enabled", &self.enabled) + .finish() + } +} + impl MsixConfig { pub fn new( msix_vectors: u16, diff --git a/src/vm-device/src/interrupt/mod.rs b/src/vm-device/src/interrupt/mod.rs index f4aec52a2e0..da5d87a4e1a 100644 --- a/src/vm-device/src/interrupt/mod.rs +++ b/src/vm-device/src/interrupt/mod.rs @@ -172,7 +172,7 @@ pub trait InterruptSourceGroup: Send + Sync { /// to inject interrupts into a guest, by writing to the file returned /// by this method. 
#[allow(unused_variables)] - fn notifier(&self, index: InterruptIndex) -> Option; + fn notifier(&self, index: InterruptIndex) -> Option<&EventFd>; /// Update the interrupt source group configuration. /// diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index f71f74db7dd..8dd8192e42d 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -17,11 +17,13 @@ gdb = ["arrayvec", "gdbstub", "gdbstub_arch"] acpi_tables = { path = "../acpi-tables" } aes-gcm = { version = "0.10.1", default-features = false, features = ["aes"] } +anyhow = "1.0.98" arrayvec = { version = "0.7.6", optional = true } aws-lc-rs = { version = "1.13.1", features = ["bindgen"] } base64 = "0.22.1" bincode = { version = "2.0.1", features = ["serde"] } bitflags = "2.9.1" +byteorder = "1.5.0" crc64 = "2.0.0" derive_more = { version = "2.0.1", default-features = false, features = [ "from", @@ -50,7 +52,7 @@ userfaultfd = "0.8.1" utils = { path = "../utils" } uuid = "1.16.0" vhost = { version = "0.14.0", features = ["vhost-user-frontend"] } -vm-allocator = "0.1.2" +vm-allocator = { version = "0.1.2", features = ["serde"] } vm-device = { path = "../vm-device" } vm-memory = { version = "0.16.2", features = [ "backend-mmap", diff --git a/src/vmm/src/acpi/mod.rs b/src/vmm/src/acpi/mod.rs index a3e471aed9e..51711d9eb92 100644 --- a/src/vmm/src/acpi/mod.rs +++ b/src/vmm/src/acpi/mod.rs @@ -12,8 +12,8 @@ use crate::acpi::x86_64::{ }; use crate::arch::x86_64::layout; use crate::device_manager::DeviceManager; -use crate::device_manager::resources::ResourceAllocator; use crate::vstate::memory::{GuestAddress, GuestMemoryMmap}; +use crate::vstate::resources::ResourceAllocator; mod x86_64; @@ -80,7 +80,11 @@ impl AcpiTableWriter<'_> { } /// Build the DSDT table for the guest - fn build_dsdt(&mut self, device_manager: &mut DeviceManager) -> Result { + fn build_dsdt( + &mut self, + device_manager: &mut DeviceManager, + resource_allocator: &ResourceAllocator, + ) -> Result { let mut dsdt_data = Vec::new(); // 
Virtio-devices DSDT data @@ -99,7 +103,7 @@ impl AcpiTableWriter<'_> { setup_arch_dsdt(&mut dsdt_data)?; let mut dsdt = Dsdt::new(OEM_ID, *b"FCVMDSDT", OEM_REVISION, dsdt_data); - self.write_acpi_table(&device_manager.resource_allocator, &mut dsdt) + self.write_acpi_table(resource_allocator, &mut dsdt) } /// Build the FADT table for the guest @@ -193,26 +197,16 @@ impl AcpiTableWriter<'_> { pub(crate) fn create_acpi_tables( mem: &GuestMemoryMmap, device_manager: &mut DeviceManager, + resource_allocator: &ResourceAllocator, vcpus: &[Vcpu], ) -> Result<(), AcpiError> { let mut writer = AcpiTableWriter { mem }; - let dsdt_addr = writer.build_dsdt(device_manager)?; - - let fadt_addr = writer.build_fadt(&device_manager.resource_allocator, dsdt_addr)?; - let madt_addr = writer.build_madt( - &device_manager.resource_allocator, - vcpus.len().try_into().unwrap(), - )?; - let mcfg_addr = writer.build_mcfg( - &device_manager.resource_allocator, - layout::PCI_MMCONFIG_START, - )?; - let xsdt_addr = writer.build_xsdt( - &device_manager.resource_allocator, - fadt_addr, - madt_addr, - mcfg_addr, - )?; + let dsdt_addr = writer.build_dsdt(device_manager, resource_allocator)?; + + let fadt_addr = writer.build_fadt(resource_allocator, dsdt_addr)?; + let madt_addr = writer.build_madt(resource_allocator, vcpus.len().try_into().unwrap())?; + let mcfg_addr = writer.build_mcfg(resource_allocator, layout::PCI_MMCONFIG_START)?; + let xsdt_addr = writer.build_xsdt(resource_allocator, fadt_addr, madt_addr, mcfg_addr)?; writer.build_rsdp(xsdt_addr) } @@ -224,8 +218,8 @@ mod tests { use crate::acpi::{AcpiError, AcpiTableWriter}; use crate::arch::x86_64::layout::{SYSTEM_MEM_SIZE, SYSTEM_MEM_START}; use crate::builder::tests::default_vmm; - use crate::device_manager::resources::ResourceAllocator; use crate::utils::u64_to_usize; + use crate::vstate::resources::ResourceAllocator; use crate::vstate::vm::tests::setup_vm_with_memory; struct MockSdt(Vec); @@ -259,14 +253,14 @@ mod tests { // This 
should succeed let mut sdt = MockSdt(vec![0; 4096]); let addr = writer - .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START); // Let's try to write two 4K pages plus one byte let mut sdt = MockSdt(vec![0; usize::try_from(SYSTEM_MEM_SIZE + 1).unwrap()]); let err = writer - .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) .unwrap_err(); assert!( matches!( @@ -281,27 +275,27 @@ mod tests { // succeed. let mut sdt = MockSdt(vec![0; 5]); let addr = writer - .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4096); let mut sdt = MockSdt(vec![0; 2]); let addr = writer - .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4101); let mut sdt = MockSdt(vec![0; 4]); let addr = writer - .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4103); let mut sdt = MockSdt(vec![0; 8]); let addr = writer - .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4107); let mut sdt = MockSdt(vec![0; 16]); let addr = writer - .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4115); } diff --git a/src/vmm/src/acpi/x86_64.rs b/src/vmm/src/acpi/x86_64.rs index de850a9989f..53eeac7b5e2 100644 --- a/src/vmm/src/acpi/x86_64.rs +++ b/src/vmm/src/acpi/x86_64.rs 
@@ -3,10 +3,7 @@ use std::mem::size_of; -use acpi_tables::fadt::{ - IAPC_BOOT_ARG_FLAGS_MSI_NOT_PRESENT, IAPC_BOOT_ARG_FLAGS_PCI_ASPM, - IAPC_BOOT_ARG_FLAGS_VGA_NOT_PRESENT, -}; +use acpi_tables::fadt::IAPC_BOOT_ARG_FLAGS_VGA_NOT_PRESENT; use acpi_tables::madt::{IoAPIC, LocalAPIC}; use acpi_tables::{Fadt, aml}; use vm_memory::GuestAddress; @@ -33,11 +30,7 @@ pub(crate) fn setup_arch_fadt(fadt: &mut Fadt) { // neither do we support ASPM, or MSI type of interrupts. // More info here: // https://uefi.org/specs/ACPI/6.5/05_ACPI_Software_Programming_Model.html?highlight=0a06#ia-pc-boot-architecture-flags - fadt.setup_iapc_flags( - (1 << IAPC_BOOT_ARG_FLAGS_VGA_NOT_PRESENT) - | (1 << IAPC_BOOT_ARG_FLAGS_PCI_ASPM) - | (1 << IAPC_BOOT_ARG_FLAGS_MSI_NOT_PRESENT), - ); + fadt.setup_iapc_flags(1 << IAPC_BOOT_ARG_FLAGS_VGA_NOT_PRESENT); } #[inline(always)] diff --git a/src/vmm/src/arch/aarch64/fdt.rs b/src/vmm/src/arch/aarch64/fdt.rs index 8e67a50bd64..6239d8196c1 100644 --- a/src/vmm/src/arch/aarch64/fdt.rs +++ b/src/vmm/src/arch/aarch64/fdt.rs @@ -22,12 +22,15 @@ use crate::device_manager::mmio::MMIODeviceInfo; use crate::device_manager::pci_mngr::PciDevices; use crate::devices::acpi::vmgenid::{VMGENID_MEM_SIZE, VmGenId}; use crate::initrd::InitrdConfig; +use crate::logger::info; use crate::vstate::memory::{Address, GuestMemory, GuestMemoryMmap}; // This is a value for uniquely identifying the FDT node declaring the interrupt controller. const GIC_PHANDLE: u32 = 1; // This is a value for uniquely identifying the FDT node containing the clock definition. const CLOCK_PHANDLE: u32 = 2; +// This is a value for uniquely identifying the FDT node declaring the MSI controller. +const MSI_PHANDLE: u32 = 3; // You may be wondering why this big value? // This phandle is used to uniquely identify the FDT nodes containing cache information. Each cpu // can have a variable number of caches, some of these caches may be shared with other cpus. 
@@ -302,6 +305,17 @@ fn create_gic_node(fdt: &mut FdtWriter, gic_device: &GICDevice) -> Result<(), Fd ]; fdt.property_array_u32("interrupts", &gic_intr)?; + + if let Some(msi_properties) = gic_device.msi_properties() { + info!("msi_properties: {msi_properties:#?}"); + let msic_node = fdt.begin_node("msic")?; + fdt.property_string("compatible", "arm,gic-v3-its")?; + fdt.property_null("msi-controller")?; + fdt.property_u32("phandle", MSI_PHANDLE)?; + fdt.property_array_u64("reg", msi_properties)?; + fdt.end_node(msic_node)?; + } + fdt.end_node(interrupt)?; Ok(()) @@ -471,6 +485,21 @@ fn create_pci_nodes(fdt: &mut FdtWriter, pci_devices: &PciDevices) -> Result<(), (MEM_64BIT_DEVICES_SIZE >> 32) as u32, // Range size ((MEM_64BIT_DEVICES_SIZE & 0xffff_ffff) >> 32) as u32, ]; + + // See kernel document Documentation/devicetree/bindings/pci/pci-msi.txt + let msi_map = [ + // rid-base: A single cell describing the first RID matched by the entry. + 0x0, + // msi-controller: A single phandle to an MSI controller. + MSI_PHANDLE, + // msi-base: An msi-specifier describing the msi-specifier produced for the + // first RID matched by the entry. + segment.id as u32, + // length: A single cell describing how many consecutive RIDs are matched + // following the rid-base. + 0x100, + ]; + let pci_node = fdt.begin_node(&pci_node_name)?; fdt.property_string("compatible", "pci-host-ecam-generic")?; @@ -491,6 +520,9 @@ fn create_pci_nodes(fdt: &mut FdtWriter, pci_devices: &PciDevices) -> Result<(), fdt.property_null("interrupt-map")?; fdt.property_null("interrupt-map-mask")?; fdt.property_null("dma-coherent")?; + fdt.property_array_u32("msi-map", &msi_map)?; + fdt.property_u32("msi-parent", MSI_PHANDLE)?; + Ok(fdt.end_node(pci_node)?) 
} @@ -499,17 +531,16 @@ mod tests { use std::ffi::CString; use std::sync::{Arc, Mutex}; - use kvm_ioctls::Kvm; use linux_loader::cmdline as kernel_cmdline; use super::*; - use crate::EventManager; use crate::arch::aarch64::gic::create_gic; use crate::arch::aarch64::layout; use crate::device_manager::mmio::tests::DummyDevice; use crate::device_manager::tests::default_device_manager; use crate::test_utils::arch_mem; use crate::vstate::memory::GuestAddress; + use crate::{EventManager, Kvm, Vm}; // The `load` function from the `device_tree` will mistakenly check the actual size // of the buffer with the allocated size. This works around that. @@ -525,9 +556,9 @@ mod tests { let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); let mut event_manager = EventManager::new().unwrap(); let mut device_manager = default_device_manager(); - let kvm = Kvm::new().unwrap(); - let vm = kvm.create_vm().unwrap(); - let gic = create_gic(&vm, 1, None).unwrap(); + let kvm = Kvm::new(vec![]).unwrap(); + let vm = Vm::new(&kvm).unwrap(); + let gic = create_gic(vm.fd(), 1, None).unwrap(); let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); cmdline.insert("console", "/dev/tty0").unwrap(); @@ -537,14 +568,7 @@ mod tests { let dummy = Arc::new(Mutex::new(DummyDevice::new())); device_manager .mmio_devices - .register_virtio_test_device( - &vm, - mem.clone(), - &device_manager.resource_allocator, - dummy, - &mut cmdline, - "dummy", - ) + .register_virtio_test_device(&vm, mem.clone(), dummy, &mut cmdline, "dummy") .unwrap(); create_fdt( @@ -562,9 +586,9 @@ mod tests { fn test_create_fdt_with_vmgenid() { let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); let mut device_manager = default_device_manager(); - let kvm = Kvm::new().unwrap(); - let vm = kvm.create_vm().unwrap(); - let gic = create_gic(&vm, 1, None).unwrap(); + let kvm = Kvm::new(vec![]).unwrap(); + let vm = Vm::new(&kvm).unwrap(); + let gic = create_gic(vm.fd(), 1, None).unwrap(); let mut cmdline = 
kernel_cmdline::Cmdline::new(4096).unwrap(); cmdline.insert("console", "/dev/tty0").unwrap(); @@ -585,9 +609,9 @@ mod tests { fn test_create_fdt() { let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); let device_manager = default_device_manager(); - let kvm = Kvm::new().unwrap(); - let vm = kvm.create_vm().unwrap(); - let gic = create_gic(&vm, 1, None).unwrap(); + let kvm = Kvm::new(vec![]).unwrap(); + let vm = Vm::new(&kvm).unwrap(); + let gic = create_gic(vm.fd(), 1, None).unwrap(); let saved_dtb_bytes = match gic.fdt_compatibility() { "arm,gic-v3" => include_bytes!("output_GICv3.dtb"), @@ -642,9 +666,9 @@ mod tests { fn test_create_fdt_with_initrd() { let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); let device_manager = default_device_manager(); - let kvm = Kvm::new().unwrap(); - let vm = kvm.create_vm().unwrap(); - let gic = create_gic(&vm, 1, None).unwrap(); + let kvm = Kvm::new(vec![]).unwrap(); + let vm = Vm::new(&kvm).unwrap(); + let gic = create_gic(vm.fd(), 1, None).unwrap(); let saved_dtb_bytes = match gic.fdt_compatibility() { "arm,gic-v3" => include_bytes!("output_initrd_GICv3.dtb"), diff --git a/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs b/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs index c4b9208a0a6..dfa2302d6be 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs @@ -68,7 +68,9 @@ impl GICv2 { GICv2::get_cpu_addr(), GICv2::get_cpu_size(), ], + msi_properties: None, vcpu_count, + its_device: None, }) } @@ -82,7 +84,7 @@ impl GICv2 { pub fn init_device_attributes(gic_device: &Self) -> Result<(), GicError> { // Setting up the distributor attribute. - // We are placing the GIC below 1GB so we need to substract the size of the distributor. + // We are placing the GIC below 1GB so we need to subtract the size of the distributor. 
Self::set_device_attribute( gic_device.device_fd(), kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ADDR, diff --git a/src/vmm/src/arch/aarch64/gic/gicv2/regs/mod.rs b/src/vmm/src/arch/aarch64/gic/gicv2/regs/mod.rs index 8bb26ce2bcd..2b617716fe2 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv2/regs/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv2/regs/mod.rs @@ -22,6 +22,7 @@ pub fn save_state(fd: &DeviceFd, mpidrs: &[u64]) -> Result { Ok(GicState { dist: dist_regs::get_dist_regs(fd)?, gic_vcpu_states: vcpu_states, + ..Default::default() }) } diff --git a/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs b/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs index 39c4e5ce148..62b0419b3d2 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs @@ -18,12 +18,19 @@ impl std::ops::Deref for GICv3 { } } +impl std::ops::DerefMut for GICv3 { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + impl GICv3 { // Unfortunately bindgen omits defines that are based on other defines. // See arch/arm64/include/uapi/asm/kvm.h file from the linux kernel. 
const SZ_64K: u64 = 0x0001_0000; const KVM_VGIC_V3_DIST_SIZE: u64 = GICv3::SZ_64K; const KVM_VGIC_V3_REDIST_SIZE: u64 = (2 * GICv3::SZ_64K); + const GIC_V3_ITS_SIZE: u64 = 0x2_0000; // Device trees specific constants const ARCH_GIC_V3_MAINT_IRQ: u32 = 9; @@ -48,6 +55,16 @@ impl GICv3 { vcpu_count * GICv3::KVM_VGIC_V3_REDIST_SIZE } + /// Get the MSI address + fn get_msi_address(vcpu_count: u64) -> u64 { + Self::get_redists_addr(vcpu_count) - GICv3::GIC_V3_ITS_SIZE + } + + /// Get the MSI size + const fn get_msi_size() -> u64 { + GICv3::GIC_V3_ITS_SIZE + } + pub const VERSION: u32 = kvm_bindings::kvm_device_type_KVM_DEV_TYPE_ARM_VGIC_V3; pub fn fdt_compatibility(&self) -> &str { @@ -59,30 +76,43 @@ impl GICv3 { } /// Create the GIC device object - pub fn create_device(fd: DeviceFd, vcpu_count: u64) -> Self { - GICv3(super::GIC { - fd, + pub fn create_device(vm: &VmFd, vcpu_count: u64) -> Result { + // Create the GIC device + let mut gic_device = kvm_bindings::kvm_create_device { + type_: Self::VERSION, + fd: 0, + flags: 0, + }; + + let gic_fd = vm + .create_device(&mut gic_device) + .map_err(GicError::CreateGIC)?; + + Ok(GICv3(super::GIC { + fd: gic_fd, properties: [ GICv3::get_dist_addr(), GICv3::get_dist_size(), GICv3::get_redists_addr(vcpu_count), GICv3::get_redists_size(vcpu_count), ], + msi_properties: Some([GICv3::get_msi_address(vcpu_count), GICv3::get_msi_size()]), vcpu_count, - }) + its_device: None, + })) } pub fn save_device(&self, mpidrs: &[u64]) -> Result { - regs::save_state(&self.fd, mpidrs) + regs::save_state(&self.fd, self.its_device.as_ref().unwrap(), mpidrs) } pub fn restore_device(&self, mpidrs: &[u64], state: &GicState) -> Result<(), GicError> { - regs::restore_state(&self.fd, mpidrs, state) + regs::restore_state(&self.fd, self.its_device.as_ref().unwrap(), mpidrs, state) } pub fn init_device_attributes(gic_device: &Self) -> Result<(), GicError> { // Setting up the distributor attribute. 
- // We are placing the GIC below 1GB so we need to substract the size of the distributor. + // We are placing the GIC below 1GB so we need to subtract the size of the distributor. Self::set_device_attribute( gic_device.device_fd(), kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ADDR, @@ -104,25 +134,45 @@ impl GICv3 { Ok(()) } - /// Initialize a GIC device - pub fn init_device(vm: &VmFd) -> Result { - let mut gic_device = kvm_bindings::kvm_create_device { - type_: Self::VERSION, + fn init_its(vm: &VmFd, gic_device: &mut Self) -> Result<(), GicError> { + // ITS part attributes + let mut its_device = kvm_bindings::kvm_create_device { + type_: kvm_bindings::kvm_device_type_KVM_DEV_TYPE_ARM_VGIC_ITS, fd: 0, flags: 0, }; - vm.create_device(&mut gic_device) - .map_err(GicError::CreateGIC) + let its_fd = vm + .create_device(&mut its_device) + .map_err(GicError::CreateGIC)?; + + // Setting up the ITS attributes + Self::set_device_attribute( + &its_fd, + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ADDR, + u64::from(kvm_bindings::KVM_VGIC_ITS_ADDR_TYPE), + &Self::get_msi_address(gic_device.vcpu_count()) as *const u64 as u64, + 0, + )?; + + Self::set_device_attribute( + &its_fd, + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_CTRL, + u64::from(kvm_bindings::KVM_DEV_ARM_VGIC_CTRL_INIT), + 0, + 0, + )?; + + gic_device.its_device = Some(its_fd); + Ok(()) } /// Method to initialize the GIC device pub fn create(vm: &VmFd, vcpu_count: u64) -> Result { - let vgic_fd = Self::init_device(vm)?; - - let device = Self::create_device(vgic_fd, vcpu_count); + let mut device = Self::create_device(vm, vcpu_count)?; Self::init_device_attributes(&device)?; + Self::init_its(vm, &mut device)?; Self::finalize_device(&device)?; @@ -184,16 +234,27 @@ impl GICv3 { /// RDIST pending tables into guest RAM. /// /// The tables get flushed to guest RAM whenever the VM gets stopped. 
-fn save_pending_tables(fd: &DeviceFd) -> Result<(), GicError> { +fn save_pending_tables(gic_device: &DeviceFd, its_device: &DeviceFd) -> Result<(), GicError> { let init_gic_attr = kvm_bindings::kvm_device_attr { group: kvm_bindings::KVM_DEV_ARM_VGIC_GRP_CTRL, attr: u64::from(kvm_bindings::KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES), addr: 0, flags: 0, }; - fd.set_device_attr(&init_gic_attr).map_err(|err| { + gic_device.set_device_attr(&init_gic_attr).map_err(|err| { GicError::DeviceAttribute(err, true, kvm_bindings::KVM_DEV_ARM_VGIC_GRP_CTRL) - }) + })?; + let save_its_tables_attr = kvm_bindings::kvm_device_attr { + group: kvm_bindings::KVM_DEV_ARM_VGIC_GRP_CTRL, + attr: u64::from(kvm_bindings::KVM_DEV_ARM_ITS_SAVE_TABLES), + addr: 0, + flags: 0, + }; + its_device + .set_device_attr(&save_its_tables_attr) + .map_err(|err| { + GicError::DeviceAttribute(err, true, kvm_bindings::KVM_DEV_ARM_VGIC_GRP_CTRL) + }) } #[cfg(test)] @@ -211,11 +272,11 @@ mod tests { let kvm = Kvm::new().unwrap(); let vm = kvm.create_vm().unwrap(); let gic = create_gic(&vm, 1, Some(GICVersion::GICV3)).expect("Cannot create gic"); - save_pending_tables(gic.device_fd()).unwrap(); + save_pending_tables(gic.device_fd(), gic.its_fd().unwrap()).unwrap(); unsafe { libc::close(gic.device_fd().as_raw_fd()) }; - let res = save_pending_tables(gic.device_fd()); + let res = save_pending_tables(gic.device_fd(), gic.its_fd().unwrap()); assert_eq!( format!("{:?}", res.unwrap_err()), "DeviceAttribute(Error(9), true, 4)" diff --git a/src/vmm/src/arch/aarch64/gic/gicv3/regs/mod.rs b/src/vmm/src/arch/aarch64/gic/gicv3/regs/mod.rs index 0531766dc54..97d8b0bb055 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv3/regs/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv3/regs/mod.rs @@ -9,38 +9,188 @@ use kvm_ioctls::DeviceFd; use crate::arch::aarch64::gic::GicError; use crate::arch::aarch64::gic::regs::{GicState, GicVcpuState}; +use log::error; + +const GITS_CTLR: u32 = 0x0000; +const GITS_IIDR: u32 = 0x0004; +const GITS_CBASER: 
u32 = 0x0080; +const GITS_CWRITER: u32 = 0x0088; +const GITS_CREADR: u32 = 0x0090; +const GITS_BASER: u32 = 0x0100; + +pub fn gicv3_its_attr_set( + its_device: &DeviceFd, + group: u32, + attr: u32, + val: u64, +) -> Result<(), GicError> { + let gicv3_its_attr = kvm_bindings::kvm_device_attr { + group, + attr: attr as u64, + addr: &val as *const u64 as u64, + flags: 0, + }; + + its_device + .set_device_attr(&gicv3_its_attr) + .map_err(|err| GicError::DeviceAttribute(err, true, group)) +} + +pub fn gicv3_its_attr_get(its_device: &DeviceFd, group: u32, attr: u32) -> Result { + let mut val = 0; + + let mut gicv3_its_attr = kvm_bindings::kvm_device_attr { + group, + attr: attr as u64, + addr: &mut val as *mut u64 as u64, + flags: 0, + }; + + // SAFETY: gicv3_its_attr.addr is safe to write to. + unsafe { its_device.get_device_attr(&mut gicv3_its_attr) } + .map_err(|err| GicError::DeviceAttribute(err, false, group))?; + + Ok(val) +} /// Save the state of the GIC device. -pub fn save_state(fd: &DeviceFd, mpidrs: &[u64]) -> Result { +pub fn save_state( + gic_device: &DeviceFd, + its_device: &DeviceFd, + mpidrs: &[u64], +) -> Result { // Flush redistributors pending tables to guest RAM. 
- super::save_pending_tables(fd)?; + super::save_pending_tables(gic_device, its_device)?; let mut vcpu_states = Vec::with_capacity(mpidrs.len()); for mpidr in mpidrs { vcpu_states.push(GicVcpuState { - rdist: redist_regs::get_redist_regs(fd, *mpidr)?, - icc: icc_regs::get_icc_regs(fd, *mpidr)?, + rdist: redist_regs::get_redist_regs(gic_device, *mpidr)?, + icc: icc_regs::get_icc_regs(gic_device, *mpidr)?, }) } + let mut its_baser: [u64; 8] = [0; 8]; + for i in 0..8 { + its_baser[i as usize] = gicv3_its_attr_get( + its_device, + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ITS_REGS, + GITS_BASER + i * 8, + )?; + } + + let its_ctlr = gicv3_its_attr_get( + its_device, + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ITS_REGS, + GITS_CTLR, + )?; + + let its_cbaser = gicv3_its_attr_get( + its_device, + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ITS_REGS, + GITS_CBASER, + )?; + + let its_creadr = gicv3_its_attr_get( + its_device, + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ITS_REGS, + GITS_CREADR, + )?; + + let its_cwriter = gicv3_its_attr_get( + its_device, + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ITS_REGS, + GITS_CWRITER, + )?; + + let its_iidr = gicv3_its_attr_get( + its_device, + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ITS_REGS, + GITS_IIDR, + )?; + Ok(GicState { - dist: dist_regs::get_dist_regs(fd)?, + dist: dist_regs::get_dist_regs(gic_device)?, gic_vcpu_states: vcpu_states, + its_ctlr, + its_cbaser, + its_creadr, + its_cwriter, + its_iidr, + its_baser, }) } /// Restore the state of the GIC device. 
-pub fn restore_state(fd: &DeviceFd, mpidrs: &[u64], state: &GicState) -> Result<(), GicError> { - dist_regs::set_dist_regs(fd, &state.dist)?; +pub fn restore_state( + gic_device: &DeviceFd, + its_device: &DeviceFd, + mpidrs: &[u64], + state: &GicState, +) -> Result<(), GicError> { + dist_regs::set_dist_regs(gic_device, &state.dist)?; if mpidrs.len() != state.gic_vcpu_states.len() { return Err(GicError::InconsistentVcpuCount); } for (mpidr, vcpu_state) in mpidrs.iter().zip(&state.gic_vcpu_states) { - redist_regs::set_redist_regs(fd, *mpidr, &vcpu_state.rdist)?; - icc_regs::set_icc_regs(fd, *mpidr, &vcpu_state.icc)?; + redist_regs::set_redist_regs(gic_device, *mpidr, &vcpu_state.rdist)?; + icc_regs::set_icc_regs(gic_device, *mpidr, &vcpu_state.icc)?; } + // Restore ITS registers + gicv3_its_attr_set( + its_device, + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ITS_REGS, + GITS_IIDR, + state.its_iidr, + )?; + + gicv3_its_attr_set( + its_device, + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ITS_REGS, + GITS_CBASER, + state.its_cbaser, + )?; + + gicv3_its_attr_set( + its_device, + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ITS_REGS, + GITS_CREADR, + state.its_creadr, + )?; + + gicv3_its_attr_set( + its_device, + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ITS_REGS, + GITS_CWRITER, + state.its_cwriter, + )?; + + for i in 0..8 { + gicv3_its_attr_set( + its_device, + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ITS_REGS, + GITS_BASER + i * 8, + state.its_baser[i as usize], + )?; + } + + gicv3_its_attr_set( + its_device, + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_CTRL, + kvm_bindings::KVM_DEV_ARM_ITS_RESTORE_TABLES, + 0, + ) + .inspect_err(|err| error!("its: could not restore tables: {err:#?}"))?; + + gicv3_its_attr_set( + its_device, + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ITS_REGS, + GITS_CTLR, + state.its_ctlr, + )?; + Ok(()) } @@ -59,9 +209,10 @@ mod tests { let vm = kvm.create_vm().unwrap(); let gic = create_gic(&vm, 1, Some(GICVersion::GICV3)).expect("Cannot create gic"); let gic_fd = gic.device_fd(); + let its_fd = 
gic.its_fd().unwrap(); let mpidr = vec![1]; - let res = save_state(gic_fd, &mpidr); + let res = save_state(gic_fd, its_fd, &mpidr); // We will receive an error if trying to call before creating vcpu. assert_eq!( format!("{:?}", res.unwrap_err()), @@ -73,8 +224,9 @@ mod tests { let _vcpu = vm.create_vcpu(0).unwrap(); let gic = create_gic(&vm, 1, Some(GICVersion::GICV3)).expect("Cannot create gic"); let gic_fd = gic.device_fd(); + let its_fd = gic.its_fd().unwrap(); - let vm_state = save_state(gic_fd, &mpidr).unwrap(); + let vm_state = save_state(gic_fd, its_fd, &mpidr).unwrap(); let val: u32 = 0; let gicd_statusr_off = 0x0010u64; let mut gic_dist_attr = kvm_bindings::kvm_device_attr { @@ -94,7 +246,7 @@ mod tests { assert_eq!(gicd_statusr.chunks[0], val); assert_eq!(vm_state.dist.len(), 12); - restore_state(gic_fd, &mpidr, &vm_state).unwrap(); - restore_state(gic_fd, &[1, 2], &vm_state).unwrap_err(); + restore_state(gic_fd, its_fd, &mpidr, &vm_state).unwrap(); + restore_state(gic_fd, its_fd, &[1, 2], &vm_state).unwrap_err(); } } diff --git a/src/vmm/src/arch/aarch64/gic/mod.rs b/src/vmm/src/arch/aarch64/gic/mod.rs index cda423f478c..9bfabee1fea 100644 --- a/src/vmm/src/arch/aarch64/gic/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/mod.rs @@ -21,8 +21,14 @@ pub struct GIC { /// GIC device properties, to be used for setting up the fdt entry properties: [u64; 4], + /// MSI properties of the GIC device + msi_properties: Option<[u64; 2]>, + /// Number of CPUs handled by the device vcpu_count: u64, + + /// ITS device + its_device: Option, } impl GIC { /// Returns the file descriptor of the GIC device @@ -80,6 +86,14 @@ impl GICDevice { } } + /// Returns the file descriptor of the ITS device, if any + pub fn its_fd(&self) -> Option<&DeviceFd> { + match self { + Self::V2(_) => None, + Self::V3(x) => x.its_device.as_ref(), + } + } + /// Returns an array with GIC device properties pub fn device_properties(&self) -> &[u64] { match self { @@ -88,6 +102,14 @@ impl GICDevice { } } + 
/// Returns an array with MSI properties if GIC supports it + pub fn msi_properties(&self) -> Option<&[u64; 2]> { + match self { + Self::V2(x) => x.msi_properties.as_ref(), + Self::V3(x) => x.msi_properties.as_ref(), + } + } + /// Returns the number of vCPUs this GIC handles pub fn vcpu_count(&self) -> u64 { match self { diff --git a/src/vmm/src/arch/aarch64/gic/regs.rs b/src/vmm/src/arch/aarch64/gic/regs.rs index 60987cc973d..d437da586b3 100644 --- a/src/vmm/src/arch/aarch64/gic/regs.rs +++ b/src/vmm/src/arch/aarch64/gic/regs.rs @@ -30,6 +30,18 @@ pub struct GicState { pub dist: Vec>, /// The state of the vcpu interfaces. pub gic_vcpu_states: Vec, + /// ITS control register + pub its_ctlr: u64, + /// ITS IID register + pub its_iidr: u64, + /// ITS CBASE register + pub its_cbaser: u64, + /// ITS CWRITE register + pub its_cwriter: u64, + /// ITS CREAD register + pub its_creadr: u64, + /// ITS BASE registers + pub its_baser: [u64; 8], } /// Structure used for serializing the state of the GIC registers for a specific vCPU. diff --git a/src/vmm/src/arch/aarch64/mod.rs b/src/vmm/src/arch/aarch64/mod.rs index df6e712dcf5..a599db5dea7 100644 --- a/src/vmm/src/arch/aarch64/mod.rs +++ b/src/vmm/src/arch/aarch64/mod.rs @@ -32,7 +32,7 @@ use crate::utils::{align_up, u64_to_usize, usize_to_u64}; use crate::vmm_config::machine_config::MachineConfig; use crate::vstate::memory::{Address, Bytes, GuestAddress, GuestMemory, GuestMemoryMmap}; use crate::vstate::vcpu::KvmVcpuError; -use crate::{Vcpu, VcpuConfig, Vmm, logger}; +use crate::{DeviceManager, Kvm, Vcpu, VcpuConfig, Vm, logger}; /// Errors thrown while configuring aarch64 system. #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -82,8 +82,11 @@ pub fn arch_memory_regions(size: usize) -> Vec<(GuestAddress, usize)> { } /// Configures the system for booting Linux. 
+#[allow(clippy::too_many_arguments)] pub fn configure_system_for_boot( - vmm: &mut Vmm, + kvm: &Kvm, + vm: &Vm, + device_manager: &mut DeviceManager, vcpus: &mut [Vcpu], machine_config: &MachineConfig, cpu_template: &CustomCpuTemplate, @@ -103,11 +106,11 @@ pub fn configure_system_for_boot( cpu_config, }; - let optional_capabilities = vmm.kvm.optional_capabilities(); + let optional_capabilities = kvm.optional_capabilities(); // Configure vCPUs with normalizing and setting the generated CPU configuration. for vcpu in vcpus.iter_mut() { vcpu.kvm_vcpu.configure( - vmm.vm.guest_memory(), + vm.guest_memory(), entry_point, &vcpu_config, &optional_capabilities, @@ -123,18 +126,16 @@ pub fn configure_system_for_boot( .expect("Cannot create cstring from cmdline string"); let fdt = fdt::create_fdt( - vmm.vm.guest_memory(), + vm.guest_memory(), vcpu_mpidr, cmdline, - &vmm.device_manager, - vmm.vm.get_irqchip(), + device_manager, + vm.get_irqchip(), initrd, )?; - let fdt_address = GuestAddress(get_fdt_addr(vmm.vm.guest_memory())); - vmm.vm - .guest_memory() - .write_slice(fdt.as_slice(), fdt_address)?; + let fdt_address = GuestAddress(get_fdt_addr(vm.guest_memory())); + vm.guest_memory().write_slice(fdt.as_slice(), fdt_address)?; Ok(()) } diff --git a/src/vmm/src/arch/aarch64/output_GICv3.dtb b/src/vmm/src/arch/aarch64/output_GICv3.dtb index 03fba87f4fe..35f4e9b63a3 100644 Binary files a/src/vmm/src/arch/aarch64/output_GICv3.dtb and b/src/vmm/src/arch/aarch64/output_GICv3.dtb differ diff --git a/src/vmm/src/arch/aarch64/output_initrd_GICv3.dtb b/src/vmm/src/arch/aarch64/output_initrd_GICv3.dtb index 90e4a6cc0e2..fb6147ade9c 100644 Binary files a/src/vmm/src/arch/aarch64/output_initrd_GICv3.dtb and b/src/vmm/src/arch/aarch64/output_initrd_GICv3.dtb differ diff --git a/src/vmm/src/arch/x86_64/mod.rs b/src/vmm/src/arch/x86_64/mod.rs index fe1296e5d1c..5307dbdf710 100644 --- a/src/vmm/src/arch/x86_64/mod.rs +++ b/src/vmm/src/arch/x86_64/mod.rs @@ -33,6 +33,7 @@ pub mod generated; 
use std::fs::File; +use kvm::Kvm; use layout::{ CMDLINE_START, FIRST_ADDR_PAST_32BITS, FIRST_ADDR_PAST_64BITS_MMIO, MMIO32_MEM_SIZE, MMIO32_MEM_START, MMIO64_MEM_SIZE, MMIO64_MEM_START, PCI_MMCONFIG_SIZE, PCI_MMCONFIG_START, @@ -53,6 +54,7 @@ use crate::acpi::create_acpi_tables; use crate::arch::{BootProtocol, SYSTEM_MEM_SIZE, SYSTEM_MEM_START, arch_memory_regions_with_gap}; use crate::cpu_config::templates::{CustomCpuTemplate, GuestConfigError}; use crate::cpu_config::x86_64::CpuConfiguration; +use crate::device_manager::DeviceManager; use crate::initrd::InitrdConfig; use crate::utils::{align_down, u64_to_usize, usize_to_u64}; use crate::vmm_config::machine_config::MachineConfig; @@ -60,7 +62,7 @@ use crate::vstate::memory::{ Address, GuestAddress, GuestMemory, GuestMemoryMmap, GuestMemoryRegion, }; use crate::vstate::vcpu::KvmVcpuConfigureError; -use crate::{Vcpu, VcpuConfig, Vmm, logger}; +use crate::{Vcpu, VcpuConfig, Vm, logger}; // Value taken from https://elixir.bootlin.com/linux/v5.10.68/source/arch/x86/include/uapi/asm/e820.h#L31 // Usable normal RAM @@ -169,8 +171,11 @@ pub fn initrd_load_addr(guest_mem: &GuestMemoryMmap, initrd_size: usize) -> Opti } /// Configures the system for booting Linux. +#[allow(clippy::too_many_arguments)] pub fn configure_system_for_boot( - vmm: &mut Vmm, + kvm: &Kvm, + vm: &Vm, + device_manager: &mut DeviceManager, vcpus: &mut [Vcpu], machine_config: &MachineConfig, cpu_template: &CustomCpuTemplate, @@ -179,8 +184,7 @@ pub fn configure_system_for_boot( boot_cmdline: Cmdline, ) -> Result<(), ConfigurationError> { // Construct the base CpuConfiguration to apply CPU template onto. - let cpu_config = - CpuConfiguration::new(vmm.kvm.supported_cpuid.clone(), cpu_template, &vcpus[0])?; + let cpu_config = CpuConfiguration::new(kvm.supported_cpuid.clone(), cpu_template, &vcpus[0])?; // Apply CPU template to the base CpuConfiguration. 
let cpu_config = CpuConfiguration::apply_template(cpu_config, cpu_template)?; @@ -193,7 +197,7 @@ pub fn configure_system_for_boot( // Configure vCPUs with normalizing and setting the generated CPU configuration. for vcpu in vcpus.iter_mut() { vcpu.kvm_vcpu - .configure(vmm.vm.guest_memory(), entry_point, &vcpu_config)?; + .configure(vm.guest_memory(), entry_point, &vcpu_config)?; } // Write the kernel command line to guest memory. This is x86_64 specific, since on @@ -204,7 +208,7 @@ pub fn configure_system_for_boot( .expect("Cannot create cstring from cmdline string"); load_cmdline( - vmm.vm.guest_memory(), + vm.guest_memory(), GuestAddress(crate::arch::x86_64::layout::CMDLINE_START), &boot_cmdline, ) @@ -212,19 +216,19 @@ pub fn configure_system_for_boot( // Note that this puts the mptable at the last 1k of Linux's 640k base RAM mptable::setup_mptable( - vmm.vm.guest_memory(), - &vmm.device_manager.resource_allocator, + vm.guest_memory(), + &vm.common.resource_allocator, vcpu_config.vcpu_count, ) .map_err(ConfigurationError::MpTableSetup)?; match entry_point.protocol { BootProtocol::PvhBoot => { - configure_pvh(vmm.vm.guest_memory(), GuestAddress(CMDLINE_START), initrd)?; + configure_pvh(vm.guest_memory(), GuestAddress(CMDLINE_START), initrd)?; } BootProtocol::LinuxBoot => { configure_64bit_boot( - vmm.vm.guest_memory(), + vm.guest_memory(), GuestAddress(CMDLINE_START), cmdline_size, initrd, @@ -234,7 +238,12 @@ pub fn configure_system_for_boot( // Create ACPI tables and write them in guest memory // For the time being we only support ACPI in x86_64 - create_acpi_tables(vmm.vm.guest_memory(), &mut vmm.device_manager, vcpus)?; + create_acpi_tables( + vm.guest_memory(), + device_manager, + &vm.common.resource_allocator, + vcpus, + )?; Ok(()) } @@ -564,9 +573,9 @@ mod tests { use linux_loader::loader::bootparam::boot_e820_entry; use super::*; - use crate::device_manager::resources::ResourceAllocator; use crate::test_utils::{arch_mem, single_region_mem}; use 
crate::utils::mib_to_bytes; + use crate::vstate::resources::ResourceAllocator; #[test] fn regions_lt_4gb() { diff --git a/src/vmm/src/arch/x86_64/mptable.rs b/src/vmm/src/arch/x86_64/mptable.rs index c397290c23e..17b2900aeb2 100644 --- a/src/vmm/src/arch/x86_64/mptable.rs +++ b/src/vmm/src/arch/x86_64/mptable.rs @@ -15,10 +15,10 @@ use vm_allocator::AllocPolicy; use crate::arch::IRQ_MAX; use crate::arch::x86_64::generated::mpspec; -use crate::device_manager::resources::ResourceAllocator; use crate::vstate::memory::{ Address, ByteValued, Bytes, GuestAddress, GuestMemory, GuestMemoryMmap, }; +use crate::vstate::resources::ResourceAllocator; // These `mpspec` wrapper types are only data, reading them from data is a safe initialization. // SAFETY: POD diff --git a/src/vmm/src/arch/x86_64/vm.rs b/src/vmm/src/arch/x86_64/vm.rs index e84b4338e35..fbc27c82a60 100644 --- a/src/vmm/src/arch/x86_64/vm.rs +++ b/src/vmm/src/arch/x86_64/vm.rs @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use std::fmt; +use std::sync::Arc; use kvm_bindings::{ KVM_CLOCK_TSC_STABLE, KVM_IRQCHIP_IOAPIC, KVM_IRQCHIP_PIC_MASTER, KVM_IRQCHIP_PIC_SLAVE, @@ -11,8 +12,10 @@ use kvm_ioctls::Cap; use serde::{Deserialize, Serialize}; use crate::arch::x86_64::msr::MsrError; +use crate::snapshot::Persist; use crate::utils::u64_to_usize; use crate::vstate::memory::{GuestMemoryExtension, GuestMemoryState}; +use crate::vstate::resources::ResourceAllocatorState; use crate::vstate::vm::{VmCommon, VmError}; /// Error type for [`Vm::restore_state`] @@ -56,6 +59,8 @@ pub struct ArchVm { /// /// `None` if `KVM_CAP_XSAVE2` not supported. 
xsave2_size: Option, + /// Port IO bus + pub pio_bus: Arc, } impl ArchVm { @@ -90,10 +95,13 @@ impl ArchVm { .set_tss_address(u64_to_usize(crate::arch::x86_64::layout::KVM_TSS_ADDRESS)) .map_err(ArchVmError::SetTssAddress)?; + let pio_bus = Arc::new(vm_device::Bus::new()); + Ok(ArchVm { common, msrs_to_save, xsave2_size, + pio_bus, }) } @@ -187,6 +195,7 @@ impl ArchVm { Ok(VmState { memory: self.common.guest_memory.describe(), + resource_allocator: self.common.resource_allocator.save(), pitstate, clock, pic_master, @@ -211,6 +220,8 @@ impl ArchVm { pub struct VmState { /// guest memory state pub memory: GuestMemoryState, + /// resource allocator + pub resource_allocator: ResourceAllocatorState, pitstate: kvm_pit_state2, clock: kvm_clock_data, // TODO: rename this field to adopt inclusive language once Linux updates it, too. diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 2c037fc529f..e196ef505c2 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -16,17 +16,18 @@ use utils::time::TimestampUs; #[cfg(target_arch = "aarch64")] use vm_memory::GuestAddress; +#[cfg(target_arch = "aarch64")] +use crate::Vcpu; use crate::arch::{ConfigurationError, configure_system_for_boot, load_kernel}; #[cfg(target_arch = "aarch64")] use crate::construct_kvm_mpidrs; -use crate::cpu_config::templates::{ - GetCpuTemplate, GetCpuTemplateError, GuestConfigError, KvmCapability, -}; -#[cfg(target_arch = "aarch64")] -use crate::device_manager::AttachLegacyMmioDeviceError; +use crate::cpu_config::templates::{GetCpuTemplate, GetCpuTemplateError, GuestConfigError}; +#[cfg(target_arch = "x86_64")] +use crate::device_manager; use crate::device_manager::pci_mngr::PciManagerError; use crate::device_manager::{ - AttachMmioDeviceError, AttachVmgenidError, DeviceManager, DevicePersistError, DeviceRestoreArgs, + AttachDeviceError, DeviceManager, DeviceManagerCreateError, DevicePersistError, + DeviceRestoreArgs, }; use crate::devices::acpi::vmgenid::VmGenIdError; use 
crate::devices::virtio::balloon::Balloon; @@ -41,26 +42,28 @@ use crate::logger::debug; use crate::persist::{MicrovmState, MicrovmStateError}; use crate::resources::VmResources; use crate::seccomp::BpfThreadMap; +use crate::snapshot::Persist; use crate::vmm_config::instance_info::InstanceInfo; use crate::vmm_config::machine_config::MachineConfigError; -use crate::vstate::kvm::Kvm; +use crate::vstate::kvm::{Kvm, KvmError}; use crate::vstate::memory::GuestRegionMmap; -use crate::vstate::vcpu::{Vcpu, VcpuError}; -use crate::vstate::vm::Vm; -use crate::{EventManager, Vmm, VmmError, device_manager}; +#[cfg(target_arch = "aarch64")] +use crate::vstate::resources::ResourceAllocator; +use crate::vstate::vcpu::VcpuError; +use crate::vstate::vm::{Vm, VmError}; +use crate::{EventManager, Vmm, VmmError}; /// Errors associated with starting the instance. #[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum StartMicrovmError { /// Unable to attach block device to Vmm: {0} AttachBlockDevice(io::Error), - /// Unable to attach the VMGenID device: {0} - AttachVmgenidDevice(#[from] AttachVmgenidError), - #[cfg(target_arch = "aarch64")] - /// Unable to attach legacy MMIO devices: {0} - AttachLegacyDevices(#[from] AttachLegacyMmioDeviceError), + /// Could not attach device: {0} + AttachDevice(#[from] AttachDeviceError), /// System configuration error: {0} ConfigureSystem(#[from] ConfigurationError), + /// Failed to create device manager: {0} + CreateDeviceManager(#[from] DeviceManagerCreateError), /// Failed to create guest config: {0} CreateGuestConfig(#[from] GuestConfigError), /// Cannot create network device: {0} @@ -87,6 +90,8 @@ pub enum StartMicrovmError { GetCpuTemplate(#[from] GetCpuTemplateError), /// Invalid kernel command line: {0} KernelCmdline(String), + /// Kvm error: {0} + Kvm(#[from] KvmError), /// Cannot load command line string: {0} LoadCommandline(linux_loader::loader::Error), /// Cannot start microvm without kernel configuration. 
@@ -99,8 +104,6 @@ pub enum StartMicrovmError { NetDeviceNotConfigured, /// Cannot open the block device backing file: {0} OpenBlockDevice(io::Error), - /// Cannot initialize a MMIO Device or add a device to the MMIO Bus or cmdline: {0} - RegisterMmioDevice(#[from] device_manager::AttachMmioDeviceError), /// Cannot restore microvm state: {0} RestoreMicrovmState(MicrovmStateError), /// Cannot set vm resources: {0} @@ -115,6 +118,8 @@ pub enum StartMicrovmError { /// Error cloning Vcpu fds #[cfg(feature = "gdb")] VcpuFdCloneError(#[from] crate::vstate::vcpu::CopyKvmFdError), + /// Error with the Vm object: {0} + Vm(#[from] VmError), } /// It's convenient to automatically convert `linux_loader::cmdline::Error`s @@ -125,37 +130,6 @@ impl std::convert::From for StartMicrovmError { } } -#[cfg_attr(target_arch = "aarch64", allow(unused))] -fn create_vmm_and_vcpus( - instance_info: &InstanceInfo, - event_manager: &mut EventManager, - vcpu_count: u8, - kvm_capabilities: Vec, -) -> Result<(Vmm, Vec), VmmError> { - let kvm = Kvm::new(kvm_capabilities)?; - // Set up Kvm Vm and register memory regions. - // Build custom CPU config if a custom template is provided. - let mut vm = Vm::new(&kvm)?; - - let (vcpus, vcpus_exit_evt) = vm.create_vcpus(vcpu_count)?; - - let device_manager = DeviceManager::new(event_manager, &vcpus_exit_evt, vm.fd())?; - - let vmm = Vmm { - events_observer: Some(std::io::stdin()), - instance_info: instance_info.clone(), - shutdown_exit_code: None, - kvm, - vm, - uffd: None, - vcpus_handles: Vec::new(), - vcpus_exit_evt, - device_manager, - }; - - Ok((vmm, vcpus)) -} - /// Builds and starts a microVM based on the current Firecracker VmResources configuration. /// /// The built microVM and all the created vCPUs start off in the paused state. 
@@ -167,8 +141,6 @@ pub fn build_microvm_for_boot( event_manager: &mut EventManager, seccomp_filters: &BpfThreadMap, ) -> Result>, StartMicrovmError> { - use self::StartMicrovmError::*; - // Timestamp for measuring microVM boot duration. let request_ts = TimestampUs::default(); @@ -176,7 +148,7 @@ pub fn build_microvm_for_boot( .boot_source .builder .as_ref() - .ok_or(MissingKernelConfig)?; + .ok_or(StartMicrovmError::MissingKernelConfig)?; let guest_memory = vm_resources .allocate_guest_memory() @@ -191,19 +163,19 @@ pub fn build_microvm_for_boot( .cpu_template .get_cpu_template()?; - let (mut vmm, mut vcpus) = create_vmm_and_vcpus( - instance_info, - event_manager, - vm_resources.machine_config.vcpu_count, - cpu_template.kvm_capabilities.clone(), - )?; + let kvm = Kvm::new(cpu_template.kvm_capabilities.clone())?; + // Set up Kvm Vm and register memory regions. + // Build custom CPU config if a custom template is provided. + let mut vm = Vm::new(&kvm)?; + let (mut vcpus, vcpus_exit_evt) = vm.create_vcpus(vm_resources.machine_config.vcpu_count)?; + vm.register_memory_regions(guest_memory)?; + + let mut device_manager = DeviceManager::new(event_manager, &vcpus_exit_evt, &vm)?; - vmm.vm - .register_memory_regions(guest_memory) - .map_err(VmmError::Vm)?; + let vm = Arc::new(vm); - let entry_point = load_kernel(&boot_config.kernel_file, vmm.vm.guest_memory())?; - let initrd = InitrdConfig::from_config(boot_config, vmm.vm.guest_memory())?; + let entry_point = load_kernel(&boot_config.kernel_file, vm.guest_memory())?; + let initrd = InitrdConfig::from_config(boot_config, vm.guest_memory())?; #[cfg(feature = "gdb")] let (gdb_tx, gdb_rx) = mpsc::channel(); @@ -214,11 +186,11 @@ pub fn build_microvm_for_boot( #[cfg(feature = "gdb")] let vcpu_fds = vcpus .iter() - .map(|vcpu| vcpu.copy_kvm_vcpu_fd(vmm.vm())) + .map(|vcpu| vcpu.copy_kvm_vcpu_fd(&vm)) .collect::, _>>()?; if vm_resources.pci_enabled { - vmm.device_manager.enable_pci()?; + device_manager.enable_pci(&vm)?; } else 
{ boot_cmdline.insert("pci", "off")?; } @@ -227,53 +199,70 @@ pub fn build_microvm_for_boot( // to maintain the same MMIO address referenced in the documentation // and tests. if vm_resources.boot_timer { - vmm.device_manager.attach_boot_timer_device(request_ts)?; + device_manager.attach_boot_timer_device(&vm, request_ts)?; } if let Some(balloon) = vm_resources.balloon.get() { - attach_balloon_device(&mut vmm, &mut boot_cmdline, balloon, event_manager)?; + attach_balloon_device( + &mut device_manager, + &vm, + &mut boot_cmdline, + balloon, + event_manager, + )?; } attach_block_devices( - &mut vmm, + &mut device_manager, + &vm, &mut boot_cmdline, vm_resources.block.devices.iter(), event_manager, )?; attach_net_devices( - &mut vmm, + &mut device_manager, + &vm, &mut boot_cmdline, vm_resources.net_builder.iter(), event_manager, )?; if let Some(unix_vsock) = vm_resources.vsock.get() { - attach_unixsock_vsock_device(&mut vmm, &mut boot_cmdline, unix_vsock, event_manager)?; + attach_unixsock_vsock_device( + &mut device_manager, + &vm, + &mut boot_cmdline, + unix_vsock, + event_manager, + )?; } if let Some(entropy) = vm_resources.entropy.get() { - attach_entropy_device(&mut vmm, &mut boot_cmdline, entropy, event_manager)?; + attach_entropy_device( + &mut device_manager, + &vm, + &mut boot_cmdline, + entropy, + event_manager, + )?; } #[cfg(target_arch = "aarch64")] - vmm.device_manager.attach_legacy_devices_aarch64( - vmm.vm.fd(), - event_manager, - &mut boot_cmdline, - )?; + device_manager.attach_legacy_devices_aarch64(&vm, event_manager, &mut boot_cmdline)?; - vmm.device_manager - .attach_vmgenid_device(vmm.vm.guest_memory(), vmm.vm.fd())?; + device_manager.attach_vmgenid_device(vm.guest_memory(), &vm)?; #[cfg(target_arch = "aarch64")] if vcpus[0].kvm_vcpu.supports_pvtime() { - setup_pvtime(&mut vmm, &mut vcpus)?; + setup_pvtime(&vm.common.resource_allocator, &mut vcpus)?; } else { log::warn!("Vcpus do not support pvtime, steal time will not be reported to guest"); } 
configure_system_for_boot( - &mut vmm, + &kvm, + &vm, + &mut device_manager, vcpus.as_mut(), &vm_resources.machine_config, &cpu_template, @@ -282,6 +271,18 @@ pub fn build_microvm_for_boot( boot_cmdline, )?; + let vmm = Vmm { + events_observer: Some(std::io::stdin()), + instance_info: instance_info.clone(), + shutdown_exit_code: None, + kvm, + vm, + uffd: None, + vcpus_handles: Vec::new(), + vcpus_exit_evt, + device_manager, + }; + let vmm = Arc::new(Mutex::new(vmm)); #[cfg(feature = "gdb")] @@ -293,7 +294,7 @@ pub fn build_microvm_for_boot( entry_point.entry_addr, gdb_socket_path, ) - .map_err(GdbServer)?; + .map_err(StartMicrovmError::GdbServer)?; } else { debug!("No GDB socket provided not starting gdb server."); } @@ -305,7 +306,7 @@ pub fn build_microvm_for_boot( vcpus, seccomp_filters .get("vcpu") - .ok_or_else(|| MissingSeccompFilters("vcpu".to_string()))? + .ok_or_else(|| StartMicrovmError::MissingSeccompFilters("vcpu".to_string()))? .clone(), ) .map_err(VmmError::VcpuStart)?; @@ -317,7 +318,7 @@ pub fn build_microvm_for_boot( crate::seccomp::apply_filter( seccomp_filters .get("vmm") - .ok_or_else(|| MissingSeccompFilters("vmm".to_string()))?, + .ok_or_else(|| StartMicrovmError::MissingSeccompFilters("vmm".to_string()))?, ) .map_err(VmmError::SeccompFilters)?; @@ -402,19 +403,19 @@ pub fn build_microvm_from_snapshot( ) -> Result>, BuildMicrovmFromSnapshotError> { // Build Vmm. debug!("event_start: build microvm from snapshot"); - let (mut vmm, mut vcpus) = create_vmm_and_vcpus( - instance_info, - event_manager, - vm_resources.machine_config.vcpu_count, - microvm_state.kvm_state.kvm_cap_modifiers.clone(), - ) - .map_err(StartMicrovmError::Internal)?; - vmm.vm - .register_memory_regions(guest_memory) - .map_err(VmmError::Vm) - .map_err(StartMicrovmError::Internal)?; - vmm.uffd = uffd; + let kvm = Kvm::new(microvm_state.kvm_state.kvm_cap_modifiers.clone()) + .map_err(StartMicrovmError::Kvm)?; + // Set up Kvm Vm and register memory regions. 
+ // Build custom CPU config if a custom template is provided. + let mut vm = Vm::new(&kvm).map_err(StartMicrovmError::Vm)?; + + let (mut vcpus, vcpus_exit_evt) = vm + .create_vcpus(vm_resources.machine_config.vcpu_count) + .map_err(StartMicrovmError::Vm)?; + + vm.register_memory_regions(guest_memory) + .map_err(StartMicrovmError::Vm)?; #[cfg(target_arch = "x86_64")] { @@ -430,16 +431,6 @@ pub fn build_microvm_from_snapshot( } } - // Restore allocator state - #[cfg(target_arch = "aarch64")] - if let Some(pvtime_ipa) = vcpus[0].kvm_vcpu.pvtime_ipa { - allocate_pvtime_region( - &mut vmm, - vcpus.len(), - vm_allocator::AllocPolicy::ExactMatch(pvtime_ipa.0), - )?; - } - // Restore vcpus kvm state. for (vcpu, state) in vcpus.iter_mut().zip(microvm_state.vcpu_states.iter()) { vcpu.kvm_vcpu @@ -452,28 +443,46 @@ pub fn build_microvm_from_snapshot( { let mpidrs = construct_kvm_mpidrs(µvm_state.vcpu_states); // Restore kvm vm state. - vmm.vm.restore_state(&mpidrs, µvm_state.vm_state)?; + vm.restore_state(&mpidrs, µvm_state.vm_state)?; } // Restore kvm vm state. #[cfg(target_arch = "x86_64")] - vmm.vm.restore_state(µvm_state.vm_state)?; + vm.restore_state(µvm_state.vm_state)?; // Restore the boot source config paths. vm_resources.boot_source.config = microvm_state.vm_info.boot_source; + let vm = Arc::new(vm); + // Restore devices states. + // Restoring VMGenID injects an interrupt in the guest to notify it about the new generation + // ID. As a result, we need to restore DeviceManager after restoring the KVM state, otherwise + // the injected interrupt will be overwritten. 
let device_ctor_args = DeviceRestoreArgs { - mem: vmm.vm.guest_memory(), - vm: vmm.vm.fd(), + mem: vm.guest_memory(), + vm: &vm, event_manager, vm_resources, instance_id: &instance_info.id, - restored_from_file: vmm.uffd.is_none(), + restored_from_file: uffd.is_none(), + vcpus_exit_evt: &vcpus_exit_evt, }; + #[allow(unused_mut)] + let mut device_manager = + DeviceManager::restore(device_ctor_args, µvm_state.device_states)?; - vmm.device_manager - .restore(µvm_state.device_states, device_ctor_args)?; + let mut vmm = Vmm { + events_observer: Some(std::io::stdin()), + instance_info: instance_info.clone(), + shutdown_exit_code: None, + kvm, + vm, + uffd, + vcpus_handles: Vec::new(), + vcpus_exit_evt, + device_manager, + }; // Move vcpus to their own threads and start their state machine in the 'Paused' state. vmm.start_vcpus( @@ -506,14 +515,12 @@ const STEALTIME_STRUCT_MEM_SIZE: u64 = 64; /// Helper method to allocate steal time region #[cfg(target_arch = "aarch64")] fn allocate_pvtime_region( - vmm: &mut Vmm, + resource_allocator: &ResourceAllocator, vcpu_count: usize, policy: vm_allocator::AllocPolicy, ) -> Result { let size = STEALTIME_STRUCT_MEM_SIZE * vcpu_count as u64; - let addr = vmm - .device_manager - .resource_allocator + let addr = resource_allocator .allocate_system_memory(size, STEALTIME_STRUCT_MEM_SIZE, policy) .map_err(StartMicrovmError::AllocateResources)?; Ok(GuestAddress(addr)) @@ -521,10 +528,16 @@ fn allocate_pvtime_region( /// Sets up pvtime for all vcpus #[cfg(target_arch = "aarch64")] -fn setup_pvtime(vmm: &mut Vmm, vcpus: &mut [Vcpu]) -> Result<(), StartMicrovmError> { +fn setup_pvtime( + resource_allocator: &ResourceAllocator, + vcpus: &mut [Vcpu], +) -> Result<(), StartMicrovmError> { // Alloc sys mem for steal time region - let pvtime_mem: GuestAddress = - allocate_pvtime_region(vmm, vcpus.len(), vm_allocator::AllocPolicy::LastMatch)?; + let pvtime_mem: GuestAddress = allocate_pvtime_region( + resource_allocator, + vcpus.len(), + 
vm_allocator::AllocPolicy::LastMatch, + )?; // Register all vcpus with pvtime device for (i, vcpu) in vcpus.iter_mut().enumerate() { @@ -539,11 +552,12 @@ fn setup_pvtime(vmm: &mut Vmm, vcpus: &mut [Vcpu]) -> Result<(), StartMicrovmErr } fn attach_entropy_device( - vmm: &mut Vmm, + device_manager: &mut DeviceManager, + vm: &Arc, cmdline: &mut LoaderKernelCmdline, entropy_device: &Arc>, event_manager: &mut EventManager, -) -> Result<(), AttachMmioDeviceError> { +) -> Result<(), AttachDeviceError> { let id = entropy_device .lock() .expect("Poisoned lock") @@ -551,18 +565,12 @@ fn attach_entropy_device( .to_string(); event_manager.add_subscriber(entropy_device.clone()); - vmm.device_manager.attach_virtio_device( - vmm.vm.guest_memory(), - vmm.vm.fd(), - id, - entropy_device.clone(), - cmdline, - false, - ) + device_manager.attach_virtio_device(vm, id, entropy_device.clone(), cmdline, false) } fn attach_block_devices<'a, I: Iterator>> + Debug>( - vmm: &mut Vmm, + device_manager: &mut DeviceManager, + vm: &Arc, cmdline: &mut LoaderKernelCmdline, blocks: I, event_manager: &mut EventManager, @@ -584,20 +592,14 @@ fn attach_block_devices<'a, I: Iterator>> + Debug>( }; // The device mutex mustn't be locked here otherwise it will deadlock. event_manager.add_subscriber(block.clone()); - vmm.device_manager.attach_virtio_device( - vmm.vm.guest_memory(), - vmm.vm.fd(), - id, - block.clone(), - cmdline, - is_vhost_user, - )?; + device_manager.attach_virtio_device(vm, id, block.clone(), cmdline, is_vhost_user)?; } Ok(()) } fn attach_net_devices<'a, I: Iterator>> + Debug>( - vmm: &mut Vmm, + device_manager: &mut DeviceManager, + vm: &Arc, cmdline: &mut LoaderKernelCmdline, net_devices: I, event_manager: &mut EventManager, @@ -606,54 +608,35 @@ fn attach_net_devices<'a, I: Iterator>> + Debug>( let id = net_device.lock().expect("Poisoned lock").id().clone(); event_manager.add_subscriber(net_device.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. 
- vmm.device_manager.attach_virtio_device( - vmm.vm.guest_memory(), - vmm.vm.fd(), - id, - net_device.clone(), - cmdline, - false, - )?; + device_manager.attach_virtio_device(vm, id, net_device.clone(), cmdline, false)?; } Ok(()) } fn attach_unixsock_vsock_device( - vmm: &mut Vmm, + device_manager: &mut DeviceManager, + vm: &Arc, cmdline: &mut LoaderKernelCmdline, unix_vsock: &Arc>>, event_manager: &mut EventManager, -) -> Result<(), AttachMmioDeviceError> { +) -> Result<(), AttachDeviceError> { let id = String::from(unix_vsock.lock().expect("Poisoned lock").id()); event_manager.add_subscriber(unix_vsock.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. - vmm.device_manager.attach_virtio_device( - vmm.vm.guest_memory(), - vmm.vm.fd(), - id, - unix_vsock.clone(), - cmdline, - false, - ) + device_manager.attach_virtio_device(vm, id, unix_vsock.clone(), cmdline, false) } fn attach_balloon_device( - vmm: &mut Vmm, + device_manager: &mut DeviceManager, + vm: &Arc, cmdline: &mut LoaderKernelCmdline, balloon: &Arc>, event_manager: &mut EventManager, -) -> Result<(), AttachMmioDeviceError> { +) -> Result<(), AttachDeviceError> { let id = String::from(balloon.lock().expect("Poisoned lock").id()); event_manager.add_subscriber(balloon.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. 
- vmm.device_manager.attach_virtio_device( - vmm.vm.guest_memory(), - vmm.vm.fd(), - id, - balloon.clone(), - cmdline, - false, - ) + device_manager.attach_virtio_device(vm, id, balloon.clone(), cmdline, false) } #[cfg(test)] @@ -743,7 +726,7 @@ pub(crate) mod tests { instance_info: InstanceInfo::default(), shutdown_exit_code: None, kvm, - vm, + vm: Arc::new(vm), uffd: None, vcpus_handles: Vec::new(), vcpus_exit_evt, @@ -788,7 +771,8 @@ pub(crate) mod tests { } attach_block_devices( - vmm, + &mut vmm.device_manager, + &vmm.vm, cmdline, block_dev_configs.devices.iter(), event_manager, @@ -806,7 +790,13 @@ pub(crate) mod tests { let mut net_builder = NetBuilder::new(); net_builder.build(net_config).unwrap(); - let res = attach_net_devices(vmm, cmdline, net_builder.iter(), event_manager); + let res = attach_net_devices( + &mut vmm.device_manager, + &vmm.vm, + cmdline, + net_builder.iter(), + event_manager, + ); res.unwrap(); } @@ -827,7 +817,14 @@ pub(crate) mod tests { Arc::new(Mutex::new(mmds)), ); - attach_net_devices(vmm, cmdline, net_builder.iter(), event_manager).unwrap(); + attach_net_devices( + &mut vmm.device_manager, + &vmm.vm, + cmdline, + net_builder.iter(), + event_manager, + ) + .unwrap(); } pub(crate) fn insert_vsock_device( @@ -840,11 +837,17 @@ pub(crate) mod tests { let vsock = VsockBuilder::create_unixsock_vsock(vsock_config).unwrap(); let vsock = Arc::new(Mutex::new(vsock)); - attach_unixsock_vsock_device(vmm, cmdline, &vsock, event_manager).unwrap(); + attach_unixsock_vsock_device( + &mut vmm.device_manager, + &vmm.vm, + cmdline, + &vsock, + event_manager, + ) + .unwrap(); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_VSOCK, &vsock_dev_id) .is_some() ); @@ -859,11 +862,17 @@ pub(crate) mod tests { let mut builder = EntropyDeviceBuilder::new(); let entropy = builder.build(entropy_config).unwrap(); - attach_entropy_device(vmm, cmdline, &entropy, event_manager).unwrap(); + attach_entropy_device( + &mut vmm.device_manager, + 
&vmm.vm, + cmdline, + &entropy, + event_manager, + ) + .unwrap(); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_RNG, ENTROPY_DEV_ID) .is_some() ); @@ -872,7 +881,7 @@ pub(crate) mod tests { #[cfg(target_arch = "x86_64")] pub(crate) fn insert_vmgenid_device(vmm: &mut Vmm) { vmm.device_manager - .attach_vmgenid_device(vmm.vm.guest_memory(), vmm.vm.fd()) + .attach_vmgenid_device(vmm.vm.guest_memory(), &vmm.vm) .unwrap(); assert!(vmm.device_manager.acpi_devices.vmgenid.is_some()); } @@ -887,11 +896,17 @@ pub(crate) mod tests { builder.set(balloon_config).unwrap(); let balloon = builder.get().unwrap(); - attach_balloon_device(vmm, cmdline, balloon, event_manager).unwrap(); + attach_balloon_device( + &mut vmm.device_manager, + &vmm.vm, + cmdline, + balloon, + event_manager, + ) + .unwrap(); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BALLOON, BALLOON_DEV_ID) .is_some() ); @@ -943,7 +958,6 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=/dev/vda ro")); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); @@ -965,7 +979,6 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=PARTUUID=0eaa91a0-01 rw")); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); @@ -988,7 +1001,6 @@ pub(crate) mod tests { assert!(!cmdline_contains(&cmdline, "root=/dev/vda")); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); @@ -1026,19 +1038,16 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=PARTUUID=0eaa91a0-01 rw")); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, "root") .is_some() ); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, "secondary") .is_some() ); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, "third") .is_some() ); @@ -1068,7 +1077,6 @@ 
pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=/dev/vda rw")); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); @@ -1090,7 +1098,6 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=PARTUUID=0eaa91a0-01 ro")); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); @@ -1112,7 +1119,6 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=/dev/vda rw")); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); @@ -1124,7 +1130,9 @@ pub(crate) mod tests { let mut vmm = default_vmm(); let request_ts = TimestampUs::default(); - let res = vmm.device_manager.attach_boot_timer_device(request_ts); + let res = vmm + .device_manager + .attach_boot_timer_device(&vmm.vm, request_ts); res.unwrap(); assert!(vmm.device_manager.mmio_devices.boot_timer.is_some()); } diff --git a/src/vmm/src/device_manager/acpi.rs b/src/vmm/src/device_manager/acpi.rs index 78f1254d2fa..3f0af80c7aa 100644 --- a/src/vmm/src/device_manager/acpi.rs +++ b/src/vmm/src/device_manager/acpi.rs @@ -2,11 +2,11 @@ // SPDX-License-Identifier: Apache-2.0 use acpi_tables::{Aml, aml}; -use kvm_ioctls::VmFd; +use crate::Vm; use crate::devices::acpi::vmgenid::VmGenId; -#[derive(Debug)] +#[derive(Debug, Default)] pub struct ACPIDeviceManager { /// VMGenID device pub vmgenid: Option, @@ -15,18 +15,14 @@ pub struct ACPIDeviceManager { impl ACPIDeviceManager { /// Create a new ACPIDeviceManager object pub fn new() -> Self { - Self { vmgenid: None } + Default::default() } /// Attach a new VMGenID device to the microVM /// /// This will register the device's interrupt with KVM - pub fn attach_vmgenid( - &mut self, - vmgenid: VmGenId, - vm_fd: &VmFd, - ) -> Result<(), kvm_ioctls::Error> { - vm_fd.register_irqfd(&vmgenid.interrupt_evt, vmgenid.gsi)?; + pub fn attach_vmgenid(&mut self, vmgenid: VmGenId, vm: &Vm) -> 
Result<(), kvm_ioctls::Error> { + vm.register_irq(&vmgenid.interrupt_evt, vmgenid.gsi)?; self.vmgenid = Some(vmgenid); Ok(()) } diff --git a/src/vmm/src/device_manager/legacy.rs b/src/vmm/src/device_manager/legacy.rs index cedb7abc32c..d0194e24e62 100644 --- a/src/vmm/src/device_manager/legacy.rs +++ b/src/vmm/src/device_manager/legacy.rs @@ -11,11 +11,11 @@ use std::sync::{Arc, Mutex}; use acpi_tables::aml::AmlError; use acpi_tables::{Aml, aml}; -use kvm_ioctls::VmFd; use libc::EFD_NONBLOCK; use vm_superio::Serial; use vmm_sys_util::eventfd::EventFd; +use crate::Vm; use crate::devices::legacy::serial::SerialOut; use crate::devices::legacy::{EventFdTrigger, I8042Device, SerialDevice, SerialEventsWrapper}; @@ -97,11 +97,7 @@ impl PortIODeviceManager { } /// Register supported legacy devices. - pub fn register_devices( - &mut self, - io_bus: &vm_device::Bus, - vm_fd: &VmFd, - ) -> Result<(), LegacyDeviceError> { + pub fn register_devices(&mut self, vm: &Vm) -> Result<(), LegacyDeviceError> { let serial_2_4 = Arc::new(Mutex::new(SerialDevice { serial: Serial::with_events( self.com_evt_2_4.try_clone()?.try_clone()?, @@ -122,6 +118,8 @@ impl PortIODeviceManager { ), input: None, })); + + let io_bus = &vm.pio_bus; io_bus.insert( self.stdio_serial.clone(), Self::SERIAL_PORT_ADDRESSES[0], @@ -148,18 +146,15 @@ impl PortIODeviceManager { Self::I8042_KDB_DATA_REGISTER_SIZE, )?; - vm_fd - .register_irqfd(&self.com_evt_1_3, Self::COM_EVT_1_3_GSI) + vm.register_irq(&self.com_evt_1_3, Self::COM_EVT_1_3_GSI) .map_err(|e| { LegacyDeviceError::EventFd(std::io::Error::from_raw_os_error(e.errno())) })?; - vm_fd - .register_irqfd(&self.com_evt_2_4, Self::COM_EVT_2_4_GSI) + vm.register_irq(&self.com_evt_2_4, Self::COM_EVT_2_4_GSI) .map_err(|e| { LegacyDeviceError::EventFd(std::io::Error::from_raw_os_error(e.errno())) })?; - vm_fd - .register_irqfd(&self.kbd_evt, Self::KBD_EVT_GSI) + vm.register_irq(&self.kbd_evt, Self::KBD_EVT_GSI) .map_err(|e| { 
LegacyDeviceError::EventFd(std::io::Error::from_raw_os_error(e.errno())) })?; @@ -246,7 +241,6 @@ mod tests { #[test] fn test_register_legacy_devices() { let (_, vm) = setup_vm_with_memory(0x1000); - let io_bus = vm_device::Bus::new(); vm.setup_irqchip().unwrap(); let mut ldm = PortIODeviceManager::new( Arc::new(Mutex::new(SerialDevice { @@ -264,6 +258,6 @@ mod tests { )), ) .unwrap(); - ldm.register_devices(&io_bus, vm.fd()).unwrap(); + ldm.register_devices(&vm).unwrap(); } } diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index f730dd5be0d..13ab13f47ea 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -12,31 +12,25 @@ use std::sync::{Arc, Mutex}; #[cfg(target_arch = "x86_64")] use acpi_tables::{Aml, aml}; -use kvm_ioctls::{IoEventAddress, VmFd}; +use kvm_ioctls::IoEventAddress; use linux_loader::cmdline as kernel_cmdline; #[cfg(target_arch = "x86_64")] use log::debug; -use log::info; use serde::{Deserialize, Serialize}; use vm_allocator::AllocPolicy; -use super::resources::ResourceAllocator; +use crate::Vm; use crate::arch::BOOT_DEVICE_MEM_START; #[cfg(target_arch = "aarch64")] use crate::arch::{RTC_MEM_START, SERIAL_MEM_START}; #[cfg(target_arch = "aarch64")] use crate::devices::legacy::{RTCDevice, SerialDevice}; use crate::devices::pseudo::BootTimer; -use crate::devices::virtio::balloon::Balloon; -use crate::devices::virtio::block::device::Block; use crate::devices::virtio::device::VirtioDevice; -use crate::devices::virtio::net::Net; -use crate::devices::virtio::rng::Entropy; use crate::devices::virtio::transport::mmio::MmioTransport; -use crate::devices::virtio::vsock::{TYPE_VSOCK, Vsock, VsockUnixBackend}; -use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG}; #[cfg(target_arch = "x86_64")] use crate::vstate::memory::GuestAddress; +use crate::vstate::resources::ResourceAllocator; /// Errors for MMIO device manager. 
#[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -128,7 +122,7 @@ pub struct MMIODevice { } /// Manages the complexities of registering a MMIO device. -#[derive(Debug)] +#[derive(Debug, Default)] pub struct MMIODeviceManager { /// VirtIO devices using an MMIO transport layer pub(crate) virtio_devices: HashMap<(u32, String), MMIODevice>, @@ -154,16 +148,7 @@ pub struct MMIODeviceManager { impl MMIODeviceManager { /// Create a new DeviceManager handling mmio devices (virtio net, block). pub fn new() -> MMIODeviceManager { - MMIODeviceManager { - virtio_devices: HashMap::new(), - boot_timer: None, - #[cfg(target_arch = "aarch64")] - rtc: None, - #[cfg(target_arch = "aarch64")] - serial: None, - #[cfg(target_arch = "x86_64")] - dsdt_data: vec![], - } + Default::default() } /// Allocates resources for a new device to be added. @@ -193,9 +178,8 @@ impl MMIODeviceManager { /// Register a virtio-over-MMIO device to be used via MMIO transport at a specific slot. pub fn register_mmio_virtio( &mut self, - vm: &VmFd, + vm: &Vm, device_id: String, - mmio_bus: &vm_device::Bus, device: MMIODevice, ) -> Result<(), MmioError> { // Our virtio devices are currently hardcoded to use a single IRQ. @@ -210,14 +194,15 @@ impl MMIODeviceManager { let io_addr = IoEventAddress::Mmio( device.resources.addr + u64::from(crate::devices::virtio::NOTIFY_REG_OFFSET), ); - vm.register_ioevent(queue_evt, &io_addr, u32::try_from(i).unwrap()) + vm.fd() + .register_ioevent(queue_evt, &io_addr, u32::try_from(i).unwrap()) .map_err(MmioError::RegisterIoEvent)?; } - vm.register_irqfd(&mmio_device.interrupt.irq_evt, irq.get()) + vm.register_irq(&mmio_device.interrupt.irq_evt, irq.get()) .map_err(MmioError::RegisterIrqFd)?; } - mmio_bus.insert( + vm.common.mmio_bus.insert( device.inner.clone(), device.resources.addr, device.resources.len, @@ -252,14 +237,13 @@ impl MMIODeviceManager { /// to the boot cmdline. 
pub fn register_mmio_virtio_for_boot( &mut self, - vm: &VmFd, - resource_allocator: &ResourceAllocator, + vm: &Vm, device_id: String, mmio_device: MmioTransport, _cmdline: &mut kernel_cmdline::Cmdline, ) -> Result<(), MmioError> { let device = MMIODevice { - resources: self.allocate_mmio_resources(resource_allocator, 1)?, + resources: self.allocate_mmio_resources(&vm.common.resource_allocator, 1)?, inner: Arc::new(Mutex::new(mmio_device)), }; @@ -275,7 +259,7 @@ impl MMIODeviceManager { device.resources.irq.unwrap().get(), )?; } - self.register_mmio_virtio(vm, device_id, &resource_allocator.mmio_bus, device)?; + self.register_mmio_virtio(vm, device_id, device)?; Ok(()) } @@ -284,8 +268,7 @@ impl MMIODeviceManager { /// otherwise allocate a new MMIO resources for it. pub fn register_mmio_serial( &mut self, - vm: &VmFd, - resource_allocator: &ResourceAllocator, + vm: &Vm, serial: Arc>, device_info_opt: Option, ) -> Result<(), MmioError> { @@ -294,7 +277,7 @@ impl MMIODeviceManager { let device_info = if let Some(device_info) = device_info_opt { device_info } else { - let gsi = resource_allocator.allocate_gsi(1)?; + let gsi = vm.common.resource_allocator.allocate_gsi(1)?; MMIODeviceInfo { addr: SERIAL_MEM_START, len: MMIO_LEN, @@ -302,7 +285,7 @@ impl MMIODeviceManager { } }; - vm.register_irqfd( + vm.register_irq( serial.lock().expect("Poisoned lock").serial.interrupt_evt(), device_info.irq.unwrap().get(), ) @@ -313,7 +296,7 @@ impl MMIODeviceManager { inner: serial, }; - resource_allocator.mmio_bus.insert( + vm.common.mmio_bus.insert( device.inner.clone(), device.resources.addr, device.resources.len, @@ -344,7 +327,7 @@ impl MMIODeviceManager { /// given as parameter, otherwise allocate a new MMIO resources for it. 
pub fn register_mmio_rtc( &mut self, - resource_allocator: &ResourceAllocator, + vm: &Vm, rtc: Arc>, device_info_opt: Option, ) -> Result<(), MmioError> { @@ -353,7 +336,7 @@ impl MMIODeviceManager { let device_info = if let Some(device_info) = device_info_opt { device_info } else { - let gsi = resource_allocator.allocate_gsi(1)?; + let gsi = vm.common.resource_allocator.allocate_gsi(1)?; MMIODeviceInfo { addr: RTC_MEM_START, len: MMIO_LEN, @@ -366,7 +349,7 @@ impl MMIODeviceManager { inner: rtc, }; - resource_allocator.mmio_bus.insert( + vm.common.mmio_bus.insert( device.inner.clone(), device.resources.addr, device.resources.len, @@ -449,79 +432,6 @@ impl MMIODeviceManager { Ok(()) } - /// Artificially kick devices as if they had external events. - pub fn kick_devices(&self) { - info!("Artificially kick devices."); - // We only kick virtio devices for now. - let _: Result<(), MmioError> = self.for_each_virtio_device(|virtio_type, id, device| { - let mmio_transport_locked = device.inner.lock().expect("Poisoned locked"); - let mut virtio = mmio_transport_locked.locked_device(); - match *virtio_type { - TYPE_BALLOON => { - let balloon = virtio.as_mut_any().downcast_mut::().unwrap(); - // If device is activated, kick the balloon queue(s) to make up for any - // pending or in-flight epoll events we may have not captured in snapshot. - // Stats queue doesn't need kicking as it is notified via a `timer_fd`. - if balloon.is_activated() { - info!("kick balloon {}.", id); - balloon.process_virtio_queues(); - } - } - TYPE_BLOCK => { - // We only care about kicking virtio block. - // If we need to kick vhost-user-block we can do nothing. - if let Some(block) = virtio.as_mut_any().downcast_mut::() { - // If device is activated, kick the block queue(s) to make up for any - // pending or in-flight epoll events we may have not captured in - // snapshot. 
No need to kick Ratelimiters - // because they are restored 'unblocked' so - // any inflight `timer_fd` events can be safely discarded. - if block.is_activated() { - info!("kick block {}.", id); - block.process_virtio_queues(); - } - } - } - TYPE_NET => { - let net = virtio.as_mut_any().downcast_mut::().unwrap(); - // If device is activated, kick the net queue(s) to make up for any - // pending or in-flight epoll events we may have not captured in snapshot. - // No need to kick Ratelimiters because they are restored 'unblocked' so - // any inflight `timer_fd` events can be safely discarded. - if net.is_activated() { - info!("kick net {}.", id); - net.process_virtio_queues(); - } - } - TYPE_VSOCK => { - // Vsock has complicated protocol that isn't resilient to any packet loss, - // so for Vsock we don't support connection persistence through snapshot. - // Any in-flight packets or events are simply lost. - // Vsock is restored 'empty'. - // The only reason we still `kick` it is to make guest process - // `TRANSPORT_RESET_EVENT` event we sent during snapshot creation. 
- let vsock = virtio - .as_mut_any() - .downcast_mut::>() - .unwrap(); - if vsock.is_activated() { - info!("kick vsock {id}."); - vsock.signal_used_queue(0).unwrap(); - } - } - TYPE_RNG => { - let entropy = virtio.as_mut_any().downcast_mut::().unwrap(); - if entropy.is_activated() { - info!("kick entropy {id}."); - entropy.process_virtio_queues(); - } - } - _ => (), - } - Ok(()) - }); - } - #[cfg(target_arch = "aarch64")] pub fn virtio_device_info(&self) -> Vec<&MMIODeviceInfo> { let mut device_info = Vec::new(); @@ -566,22 +476,15 @@ pub(crate) mod tests { impl MMIODeviceManager { pub(crate) fn register_virtio_test_device( &mut self, - vm: &VmFd, + vm: &Vm, guest_mem: GuestMemoryMmap, - resource_allocator: &ResourceAllocator, device: Arc>, cmdline: &mut kernel_cmdline::Cmdline, dev_id: &str, ) -> Result { let interrupt = Arc::new(IrqTrigger::new()); let mmio_device = MmioTransport::new(guest_mem, interrupt, device.clone(), false); - self.register_mmio_virtio_for_boot( - vm, - resource_allocator, - dev_id.to_string(), - mmio_device, - cmdline, - )?; + self.register_mmio_virtio_for_boot(vm, dev_id.to_string(), mmio_device, cmdline)?; Ok(self .get_virtio_device(device.lock().unwrap().device_type(), dev_id) .unwrap() @@ -688,7 +591,6 @@ pub(crate) mod tests { let mut vm = Vm::new(&kvm).unwrap(); vm.register_memory_regions(guest_mem).unwrap(); let mut device_manager = MMIODeviceManager::new(); - let resource_allocator = ResourceAllocator::new().unwrap(); let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); let dummy = Arc::new(Mutex::new(DummyDevice::new())); @@ -699,9 +601,8 @@ pub(crate) mod tests { device_manager .register_virtio_test_device( - vm.fd(), + &vm, vm.guest_memory().clone(), - &resource_allocator, dummy, &mut cmdline, "dummy", @@ -742,7 +643,6 @@ pub(crate) mod tests { let mut vm = Vm::new(&kvm).unwrap(); vm.register_memory_regions(guest_mem).unwrap(); let mut device_manager = MMIODeviceManager::new(); - let resource_allocator = 
ResourceAllocator::new().unwrap(); let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); #[cfg(target_arch = "x86_64")] @@ -753,9 +653,8 @@ pub(crate) mod tests { for _i in crate::arch::IRQ_BASE..=crate::arch::IRQ_MAX { device_manager .register_virtio_test_device( - vm.fd(), + &vm, vm.guest_memory().clone(), - &resource_allocator, Arc::new(Mutex::new(DummyDevice::new())), &mut cmdline, "dummy1", @@ -767,9 +666,8 @@ pub(crate) mod tests { "{}", device_manager .register_virtio_test_device( - vm.fd(), + &vm, vm.guest_memory().clone(), - &resource_allocator, Arc::new(Mutex::new(DummyDevice::new())), &mut cmdline, "dummy2" @@ -804,21 +702,13 @@ pub(crate) mod tests { vm.setup_irqchip(1).unwrap(); let mut device_manager = MMIODeviceManager::new(); - let resource_allocator = ResourceAllocator::new().unwrap(); let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); let dummy = Arc::new(Mutex::new(DummyDevice::new())); let type_id = dummy.lock().unwrap().device_type(); let id = String::from("foo"); let addr = device_manager - .register_virtio_test_device( - vm.fd(), - vm.guest_memory().clone(), - &resource_allocator, - dummy, - &mut cmdline, - &id, - ) + .register_virtio_test_device(&vm, vm.guest_memory().clone(), dummy, &mut cmdline, &id) .unwrap(); assert!(device_manager.get_virtio_device(type_id, &id).is_some()); assert_eq!( @@ -842,14 +732,7 @@ pub(crate) mod tests { let dummy2 = Arc::new(Mutex::new(DummyDevice::new())); let id2 = String::from("foo2"); device_manager - .register_virtio_test_device( - vm.fd(), - vm.guest_memory().clone(), - &resource_allocator, - dummy2, - &mut cmdline, - &id2, - ) + .register_virtio_test_device(&vm, vm.guest_memory().clone(), dummy2, &mut cmdline, &id2) .unwrap(); let mut count = 0; diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index 2922060bb13..fae7d12748d 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -5,20 +5,19 @@ // Use of this source 
code is governed by a BSD-style license that can be // found in the THIRD-PARTY file. +use std::convert::Infallible; use std::fmt::Debug; use std::sync::{Arc, Mutex}; use acpi::ACPIDeviceManager; use event_manager::{MutEventSubscriber, SubscriberOps}; -use kvm_ioctls::VmFd; #[cfg(target_arch = "x86_64")] use legacy::{LegacyDeviceError, PortIODeviceManager}; use linux_loader::loader::Cmdline; -use log::error; +use log::{error, info}; use mmio::{MMIODeviceManager, MmioError}; -use pci_mngr::{PciDevices, PciManagerError}; +use pci_mngr::{PciDevices, PciDevicesConstructorArgs, PciManagerError}; use persist::{ACPIDeviceManagerConstructorArgs, MMIODevManagerConstructorArgs}; -use resources::ResourceAllocator; use serde::{Deserialize, Serialize}; use utils::time::TimestampUs; use vmm_sys_util::eventfd::EventFd; @@ -31,12 +30,18 @@ use crate::devices::legacy::RTCDevice; use crate::devices::legacy::serial::SerialOut; use crate::devices::legacy::{IER_RDA_BIT, IER_RDA_OFFSET, SerialDevice}; use crate::devices::pseudo::BootTimer; +use crate::devices::virtio::balloon::Balloon; +use crate::devices::virtio::block::device::Block; use crate::devices::virtio::device::VirtioDevice; +use crate::devices::virtio::net::Net; +use crate::devices::virtio::rng::Entropy; use crate::devices::virtio::transport::mmio::{IrqTrigger, MmioTransport}; +use crate::devices::virtio::vsock::{TYPE_VSOCK, Vsock, VsockUnixBackend}; +use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG}; use crate::resources::VmResources; use crate::snapshot::Persist; use crate::vstate::memory::GuestMemoryMmap; -use crate::{EmulateSerialInitError, EventManager}; +use crate::{EmulateSerialInitError, EventManager, Vm}; /// ACPI device manager. pub mod acpi; @@ -48,8 +53,6 @@ pub mod mmio; pub mod pci_mngr; /// Device managers (de)serialization support. pub mod persist; -/// Resource manager for devices. 
-pub mod resources; #[derive(Debug, thiserror::Error, displaydoc::Display)] /// Error while creating a new [`DeviceManager`] @@ -65,41 +68,28 @@ pub enum DeviceManagerCreateError { #[derive(Debug, thiserror::Error, displaydoc::Display)] /// Error while attaching a VirtIO device -pub enum AttachMmioDeviceError { +pub enum AttachDeviceError { /// MMIO transport error: {0} MmioTransport(#[from] MmioError), /// Error inserting device in bus: {0} Bus(#[from] vm_device::BusError), -} - -#[derive(Debug, thiserror::Error, displaydoc::Display)] -/// Error while attaching the VMGenID device -pub enum AttachVmgenidError { /// Error creating VMGenID device: {0} CreateVmGenID(#[from] VmGenIdError), /// Error while registering VMGenID with KVM: {0} AttachVmGenID(#[from] kvm_ioctls::Error), -} - -#[cfg(target_arch = "aarch64")] -#[derive(Debug, thiserror::Error, displaydoc::Display)] -/// Error while attaching the VMGenID device -pub enum AttachLegacyMmioDeviceError { + #[cfg(target_arch = "aarch64")] /// Cmdline error Cmdline, + #[cfg(target_arch = "aarch64")] /// Error creating serial device: {0} CreateSerial(#[from] std::io::Error), - /// Error registering device: {0} - RegisterMMIODevice(#[from] MmioError), - /// Error inserting device in the Bus: {0} - Bus(#[from] vm_device::BusError), + /// Error attach PCI device: {0} + PciTransport(#[from] PciManagerError), } #[derive(Debug)] /// A manager of all peripheral devices of Firecracker pub struct DeviceManager { - /// Allocator for system memory and interrupt numbers - pub resource_allocator: Arc, /// MMIO devices pub mmio_devices: MMIODeviceManager, #[cfg(target_arch = "x86_64")] @@ -139,33 +129,38 @@ impl DeviceManager { Ok(serial) } + #[cfg(target_arch = "x86_64")] + fn create_legacy_devices( + event_manager: &mut EventManager, + vcpus_exit_evt: &EventFd, + vm: &Vm, + ) -> Result { + Self::set_stdout_nonblocking(); + + // Create serial device + let serial = Self::setup_serial_device(event_manager)?; + let reset_evt = 
vcpus_exit_evt + .try_clone() + .map_err(DeviceManagerCreateError::EventFd)?; + // Create keyboard emulator for reset event + let i8042 = Arc::new(Mutex::new(I8042Device::new(reset_evt)?)); + + // create pio dev manager with legacy devices + let mut legacy_devices = PortIODeviceManager::new(serial, i8042)?; + legacy_devices.register_devices(vm)?; + Ok(legacy_devices) + } + #[cfg_attr(target_arch = "aarch64", allow(unused))] pub fn new( event_manager: &mut EventManager, - vcpu_exit_evt: &EventFd, - vmfd: &VmFd, + vcpus_exit_evt: &EventFd, + vm: &Vm, ) -> Result { - let resource_allocator = Arc::new(ResourceAllocator::new()?); #[cfg(target_arch = "x86_64")] - let legacy_devices = { - Self::set_stdout_nonblocking(); - - // Create serial device - let serial = Self::setup_serial_device(event_manager)?; - let reset_evt = vcpu_exit_evt - .try_clone() - .map_err(DeviceManagerCreateError::EventFd)?; - // Create keyboard emulator for reset event - let i8042 = Arc::new(Mutex::new(I8042Device::new(reset_evt)?)); - - // create pio dev manager with legacy devices - let mut legacy_devices = PortIODeviceManager::new(serial, i8042)?; - legacy_devices.register_devices(&resource_allocator.pio_bus, vmfd)?; - legacy_devices - }; + let legacy_devices = Self::create_legacy_devices(event_manager, vcpus_exit_evt, vm)?; Ok(DeviceManager { - resource_allocator, mmio_devices: MMIODeviceManager::new(), #[cfg(target_arch = "x86_64")] legacy_devices, @@ -174,26 +169,41 @@ impl DeviceManager { }) } - /// Attaches a VirtioDevice device to the device manager and event manager. - pub(crate) fn attach_virtio_device( + /// Attaches an MMIO VirtioDevice device to the device manager and event manager. 
+ pub(crate) fn attach_mmio_virtio_device< + T: 'static + VirtioDevice + MutEventSubscriber + Debug, + >( &mut self, - mem: &GuestMemoryMmap, - vmfd: &VmFd, + vm: &Vm, id: String, device: Arc>, cmdline: &mut Cmdline, is_vhost_user: bool, - ) -> Result<(), AttachMmioDeviceError> { + ) -> Result<(), AttachDeviceError> { let interrupt = Arc::new(IrqTrigger::new()); // The device mutex mustn't be locked here otherwise it will deadlock. - let device = MmioTransport::new(mem.clone(), interrupt, device, is_vhost_user); - self.mmio_devices.register_mmio_virtio_for_boot( - vmfd, - &self.resource_allocator, - id, - device, - cmdline, - )?; + let device = + MmioTransport::new(vm.guest_memory().clone(), interrupt, device, is_vhost_user); + self.mmio_devices + .register_mmio_virtio_for_boot(vm, id, device, cmdline)?; + + Ok(()) + } + + /// Attaches a VirtioDevice device to the device manager and event manager. + pub(crate) fn attach_virtio_device( + &mut self, + vm: &Arc, + id: String, + device: Arc>, + cmdline: &mut Cmdline, + is_vhost_user: bool, + ) -> Result<(), AttachDeviceError> { + if self.pci_devices.pci_segment.is_some() { + self.pci_devices.attach_pci_virtio_device(vm, id, device)?; + } else { + self.attach_mmio_virtio_device(vm, id, device, cmdline, is_vhost_user)?; + } Ok(()) } @@ -201,12 +211,13 @@ impl DeviceManager { /// Attaches a [`BootTimer`] to the VM pub(crate) fn attach_boot_timer_device( &mut self, + vm: &Vm, request_ts: TimestampUs, - ) -> Result<(), AttachMmioDeviceError> { + ) -> Result<(), AttachDeviceError> { let boot_timer = Arc::new(Mutex::new(BootTimer::new(request_ts))); self.mmio_devices - .register_mmio_boot_timer(&self.resource_allocator.mmio_bus, boot_timer)?; + .register_mmio_boot_timer(&vm.common.mmio_bus, boot_timer)?; Ok(()) } @@ -214,47 +225,185 @@ impl DeviceManager { pub(crate) fn attach_vmgenid_device( &mut self, mem: &GuestMemoryMmap, - vmfd: &VmFd, - ) -> Result<(), AttachVmgenidError> { - let vmgenid = VmGenId::new(mem, 
&self.resource_allocator)?; - self.acpi_devices.attach_vmgenid(vmgenid, vmfd)?; + vm: &Vm, + ) -> Result<(), AttachDeviceError> { + let vmgenid = VmGenId::new(mem, &vm.common.resource_allocator)?; + self.acpi_devices.attach_vmgenid(vmgenid, vm)?; Ok(()) } #[cfg(target_arch = "aarch64")] pub(crate) fn attach_legacy_devices_aarch64( &mut self, - vmfd: &VmFd, + vm: &Vm, event_manager: &mut EventManager, cmdline: &mut Cmdline, - ) -> Result<(), AttachLegacyMmioDeviceError> { + ) -> Result<(), AttachDeviceError> { // Serial device setup. let cmdline_contains_console = cmdline .as_cstring() - .map_err(|_| AttachLegacyMmioDeviceError::Cmdline)? + .map_err(|_| AttachDeviceError::Cmdline)? .into_string() - .map_err(|_| AttachLegacyMmioDeviceError::Cmdline)? + .map_err(|_| AttachDeviceError::Cmdline)? .contains("console="); if cmdline_contains_console { // Make stdout non-blocking. Self::set_stdout_nonblocking(); let serial = Self::setup_serial_device(event_manager)?; - self.mmio_devices - .register_mmio_serial(vmfd, &self.resource_allocator, serial, None)?; + self.mmio_devices.register_mmio_serial(vm, serial, None)?; self.mmio_devices.add_mmio_serial_to_cmdline(cmdline)?; } let rtc = Arc::new(Mutex::new(RTCDevice::new())); - self.mmio_devices - .register_mmio_rtc(&self.resource_allocator, rtc, None)?; + self.mmio_devices.register_mmio_rtc(vm, rtc, None)?; Ok(()) } /// Enables PCIe support for Firecracker devices - pub fn enable_pci(&mut self) -> Result<(), PciManagerError> { - self.pci_devices - .attach_pci_segment(&self.resource_allocator) + pub fn enable_pci(&mut self, vm: &Arc) -> Result<(), PciManagerError> { + self.pci_devices.attach_pci_segment(vm) + } + + fn do_kick_device(virtio_device: Arc>) { + let mut device = virtio_device.lock().expect("Poisoned lock"); + match device.device_type() { + TYPE_BALLOON => { + let balloon = device.as_mut_any().downcast_mut::().unwrap(); + // If device is activated, kick the balloon queue(s) to make up for any + // pending or 
in-flight epoll events we may have not captured in snapshot. + // Stats queue doesn't need kicking as it is notified via a `timer_fd`. + if balloon.is_activated() { + info!("kick balloon {}.", balloon.id()); + balloon.process_virtio_queues(); + } + } + TYPE_BLOCK => { + // We only care about kicking virtio block. + // If we need to kick vhost-user-block we can do nothing. + if let Some(block) = device.as_mut_any().downcast_mut::() { + // If device is activated, kick the block queue(s) to make up for any + // pending or in-flight epoll events we may have not captured in + // snapshot. No need to kick Ratelimiters + // because they are restored 'unblocked' so + // any inflight `timer_fd` events can be safely discarded. + if block.is_activated() { + info!("kick block {}.", block.id()); + block.process_virtio_queues(); + } + } + } + TYPE_NET => { + let net = device.as_mut_any().downcast_mut::().unwrap(); + // If device is activated, kick the net queue(s) to make up for any + // pending or in-flight epoll events we may have not captured in snapshot. + // No need to kick Ratelimiters because they are restored 'unblocked' so + // any inflight `timer_fd` events can be safely discarded. + if net.is_activated() { + info!("kick net {}.", net.id()); + net.process_virtio_queues(); + } + } + TYPE_VSOCK => { + // Vsock has complicated protocol that isn't resilient to any packet loss, + // so for Vsock we don't support connection persistence through snapshot. + // Any in-flight packets or events are simply lost. + // Vsock is restored 'empty'. + // The only reason we still `kick` it is to make guest process + // `TRANSPORT_RESET_EVENT` event we sent during snapshot creation. 
+ let vsock = device + .as_mut_any() + .downcast_mut::>() + .unwrap(); + if vsock.is_activated() { + info!("kick vsock {}.", vsock.id()); + vsock.signal_used_queue(0).unwrap(); + } + } + TYPE_RNG => { + let entropy = device.as_mut_any().downcast_mut::().unwrap(); + if entropy.is_activated() { + info!("kick entropy {}.", entropy.id()); + entropy.process_virtio_queues(); + } + } + _ => (), + } + } + + /// Artificially kick VirtIO devices as if they had external events. + pub fn kick_virtio_devices(&self) { + info!("Artificially kick devices"); + // Go through MMIO VirtIO devices + let _: Result<(), MmioError> = self.mmio_devices.for_each_virtio_device(|_, _, device| { + let mmio_transport_locked = device.inner.lock().expect("Poisoned lock"); + Self::do_kick_device(mmio_transport_locked.device()); + Ok(()) + }); + // Go through PCI VirtIO devices + for device in self.pci_devices.virtio_devices.values() { + let virtio_device = device.lock().expect("Poisoned lock").virtio_device(); + Self::do_kick_device(virtio_device); + } + } + + fn do_mark_virtio_queue_memory_dirty( + device: Arc>, + mem: &GuestMemoryMmap, + ) { + // SAFETY: + // This should never fail as we mark pages only if device has already been activated, + // and the address validation was already performed on device activation. 
+ let locked_device = device.lock().expect("Poisoned lock"); + if locked_device.is_activated() { + locked_device.mark_queue_memory_dirty(mem).unwrap() + } + } + + /// Mark queue memory dirty for activated VirtIO devices + pub fn mark_virtio_queue_memory_dirty(&self, mem: &GuestMemoryMmap) { + // Go through MMIO VirtIO devices + let _: Result<(), Infallible> = self.mmio_devices.for_each_virtio_device(|_, _, device| { + let mmio_transport_locked = device.inner.lock().expect("Poisoned locked"); + Self::do_mark_virtio_queue_memory_dirty(mmio_transport_locked.device(), mem); + Ok(()) + }); + + // Go through PCI VirtIO devices + for device in self.pci_devices.virtio_devices.values() { + let virtio_device = device.lock().expect("Poisoned lock").virtio_device(); + Self::do_mark_virtio_queue_memory_dirty(virtio_device, mem); + } + } + + /// Get a VirtIO device of type `virtio_type` with ID `device_id` + pub fn get_virtio_device( + &self, + virtio_type: u32, + device_id: &str, + ) -> Option>> { + if self.pci_devices.pci_segment.is_some() { + let pci_device = self.pci_devices.get_virtio_device(virtio_type, device_id)?; + Some( + pci_device + .lock() + .expect("Poisoned lock") + .virtio_device() + .clone(), + ) + } else { + let mmio_device = self + .mmio_devices + .get_virtio_device(virtio_type, device_id)?; + Some( + mmio_device + .inner + .lock() + .expect("Poisoned lock") + .device() + .clone(), + ) + } } } @@ -283,19 +432,38 @@ pub enum DevicePersistError { SerialRestore(#[from] EmulateSerialInitError), /// Error inserting device in bus: {0} Bus(#[from] vm_device::BusError), + /// Error creating DeviceManager: {0} + DeviceManager(#[from] DeviceManagerCreateError), } pub struct DeviceRestoreArgs<'a> { pub mem: &'a GuestMemoryMmap, - pub vm: &'a VmFd, + pub vm: &'a Arc, pub event_manager: &'a mut EventManager, + pub vcpus_exit_evt: &'a EventFd, pub vm_resources: &'a mut VmResources, pub instance_id: &'a str, pub restored_from_file: bool, } -impl DeviceManager { - pub fn 
save(&self) -> DevicesState { +impl std::fmt::Debug for DeviceRestoreArgs<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("DeviceRestoreArgs") + .field("mem", &self.mem) + .field("vm", &self.vm) + .field("vm_resources", &self.vm_resources) + .field("instance_id", &self.instance_id) + .field("restored_from_file", &self.restored_from_file) + .finish() + } +} + +impl<'a> Persist<'a> for DeviceManager { + type State = DevicesState; + type ConstructorArgs = DeviceRestoreArgs<'a>; + type Error = DevicePersistError; + + fn save(&self) -> Self::State { DevicesState { mmio_state: self.mmio_devices.save(), acpi_state: self.acpi_devices.save(), @@ -303,6 +471,65 @@ impl DeviceManager { } } + fn restore( + constructor_args: Self::ConstructorArgs, + state: &Self::State, + ) -> std::result::Result { + // Setup legacy devices in case of x86 + #[cfg(target_arch = "x86_64")] + let legacy_devices = Self::create_legacy_devices( + constructor_args.event_manager, + constructor_args.vcpus_exit_evt, + constructor_args.vm, + )?; + + // Restore MMIO devices + let mmio_ctor_args = MMIODevManagerConstructorArgs { + mem: constructor_args.mem, + vm: constructor_args.vm, + event_manager: constructor_args.event_manager, + vm_resources: constructor_args.vm_resources, + instance_id: constructor_args.instance_id, + restored_from_file: constructor_args.restored_from_file, + }; + let mmio_devices = MMIODeviceManager::restore(mmio_ctor_args, &state.mmio_state)?; + + // Restore ACPI devices + let acpi_ctor_args = ACPIDeviceManagerConstructorArgs { + mem: constructor_args.mem, + vm: constructor_args.vm, + }; + let mut acpi_devices = ACPIDeviceManager::restore(acpi_ctor_args, &state.acpi_state)?; + acpi_devices.notify_vmgenid()?; + + // Restore PCI devices + let pci_ctor_args = PciDevicesConstructorArgs { + vm: constructor_args.vm.clone(), + mem: constructor_args.mem, + vm_resources: constructor_args.vm_resources, + instance_id: constructor_args.instance_id, 
+ restored_from_file: constructor_args.restored_from_file, + event_manager: constructor_args.event_manager, + }; + let pci_devices = PciDevices::restore(pci_ctor_args, &state.pci_state)?; + + let device_manager = DeviceManager { + mmio_devices, + #[cfg(target_arch = "x86_64")] + legacy_devices, + acpi_devices, + pci_devices, + }; + + // Restore serial. + // We need to do that after we restore mmio devices, otherwise it won't succeed in Aarch64 + device_manager.emulate_serial_init()?; + + Ok(device_manager) + } +} + +impl DeviceManager { /// Sets RDA bit in serial console pub fn emulate_serial_init(&self) -> Result<(), EmulateSerialInitError> { // When restoring from a previously saved state, there is no serial @@ -340,43 +567,6 @@ impl DeviceManager { Ok(()) } } - - pub fn restore( - &mut self, - state: &DevicesState, - restore_args: DeviceRestoreArgs, - ) -> Result<(), DevicePersistError> { - // Restore MMIO devices - let mmio_ctor_args = MMIODevManagerConstructorArgs { - mem: restore_args.mem, - vm: restore_args.vm, - event_manager: restore_args.event_manager, - resource_allocator: &self.resource_allocator, - vm_resources: restore_args.vm_resources, - instance_id: restore_args.instance_id, - restored_from_file: restore_args.restored_from_file, - }; - self.mmio_devices = MMIODeviceManager::restore(mmio_ctor_args, &state.mmio_state)?; - - // Restore serial. 
- // We need to do that after we restore mmio devices, otherwise it won't succeed in Aarch64 - self.emulate_serial_init()?; - - // Restore ACPI devices - let acpi_ctor_args = ACPIDeviceManagerConstructorArgs { - mem: restore_args.mem, - resource_allocator: &self.resource_allocator, - vm: restore_args.vm, - }; - self.acpi_devices = ACPIDeviceManager::restore(acpi_ctor_args, &state.acpi_state)?; - self.acpi_devices.notify_vmgenid()?; - - // Restore PCI devices - self.pci_devices - .restore(&state.pci_state, &self.resource_allocator)?; - - Ok(()) - } } #[cfg(test)] @@ -389,7 +579,6 @@ pub(crate) mod tests { let mmio_devices = MMIODeviceManager::new(); let acpi_devices = ACPIDeviceManager::new(); let pci_devices = PciDevices::new(); - let resource_allocator = Arc::new(ResourceAllocator::new().unwrap()); #[cfg(target_arch = "x86_64")] let legacy_devices = PortIODeviceManager::new( @@ -403,7 +592,6 @@ pub(crate) mod tests { .unwrap(); DeviceManager { - resource_allocator, mmio_devices, #[cfg(target_arch = "x86_64")] legacy_devices, @@ -422,7 +610,7 @@ pub(crate) mod tests { let mut cmdline = Cmdline::new(4096).unwrap(); let mut event_manager = EventManager::new().unwrap(); vmm.device_manager - .attach_legacy_devices_aarch64(vmm.vm.fd(), &mut event_manager, &mut cmdline) + .attach_legacy_devices_aarch64(&vmm.vm, &mut event_manager, &mut cmdline) .unwrap(); assert!(vmm.device_manager.mmio_devices.rtc.is_some()); assert!(vmm.device_manager.mmio_devices.serial.is_none()); @@ -430,7 +618,7 @@ pub(crate) mod tests { let mut vmm = default_vmm(); cmdline.insert("console", "/dev/blah").unwrap(); vmm.device_manager - .attach_legacy_devices_aarch64(vmm.vm.fd(), &mut event_manager, &mut cmdline) + .attach_legacy_devices_aarch64(&vmm.vm, &mut event_manager, &mut cmdline) .unwrap(); assert!(vmm.device_manager.mmio_devices.rtc.is_some()); assert!(vmm.device_manager.mmio_devices.serial.is_some()); diff --git a/src/vmm/src/device_manager/pci_mngr.rs 
b/src/vmm/src/device_manager/pci_mngr.rs index e9ada60cc1f..1fa8dea75d8 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -1,18 +1,48 @@ // Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 -use std::sync::Arc; +use std::collections::HashMap; +use std::fmt::Debug; +use std::sync::{Arc, Mutex}; +use event_manager::{MutEventSubscriber, SubscriberOps}; +use kvm_ioctls::{IoEventAddress, NoDatamatch}; +use log::{debug, error, warn}; +use pci::{PciBarRegionType, PciDevice, PciDeviceError, PciRootError}; use serde::{Deserialize, Serialize}; use vm_device::BusError; -use super::resources::ResourceAllocator; +use super::persist::{MmdsVersionState, SharedDeviceType}; use crate::devices::pci::PciSegment; +use crate::devices::virtio::balloon::Balloon; +use crate::devices::virtio::balloon::persist::{BalloonConstructorArgs, BalloonState}; +use crate::devices::virtio::block::device::Block; +use crate::devices::virtio::block::persist::{BlockConstructorArgs, BlockState}; +use crate::devices::virtio::device::VirtioDevice; +use crate::devices::virtio::net::Net; +use crate::devices::virtio::net::persist::{NetConstructorArgs, NetState}; +use crate::devices::virtio::rng::Entropy; +use crate::devices::virtio::rng::persist::{EntropyConstructorArgs, EntropyState}; +use crate::devices::virtio::transport::pci::device::{ + VirtioPciDevice, VirtioPciDeviceError, VirtioPciDeviceState, +}; +use crate::devices::virtio::vsock::persist::{ + VsockConstructorArgs, VsockState, VsockUdsConstructorArgs, +}; +use crate::devices::virtio::vsock::{TYPE_VSOCK, Vsock, VsockUnixBackend}; +use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG}; +use crate::resources::VmResources; +use crate::snapshot::Persist; +use crate::vstate::memory::GuestMemoryMmap; +use crate::vstate::vm::{InterruptError, MsiVectorGroup}; +use crate::{EventManager, Vm}; #[derive(Debug, Default)] pub struct 
PciDevices { /// PCIe segment of the VMM, if PCI is enabled. We currently support a single PCIe segment. pub pci_segment: Option, + /// All VirtIO PCI devices of the system + pub virtio_devices: HashMap<(u32, String), Arc>>, } #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -21,6 +51,16 @@ pub enum PciManagerError { ResourceAllocation(#[from] vm_allocator::Error), /// Bus error: {0} Bus(#[from] BusError), + /// PCI root error: {0} + PciRoot(#[from] PciRootError), + /// MSI error: {0} + Msi(#[from] InterruptError), + /// VirtIO PCI device error: {0} + VirtioPciDevice(#[from] VirtioPciDeviceError), + /// PCI device error: {0} + PciDeviceError(#[from] PciDeviceError), + /// KVM error: {0} + Kvm(#[from] vmm_sys_util::errno::Error), } impl PciDevices { @@ -28,42 +68,746 @@ impl PciDevices { Default::default() } - pub fn attach_pci_segment( - &mut self, - resource_allocator: &Arc, - ) -> Result<(), PciManagerError> { + pub fn attach_pci_segment(&mut self, vm: &Arc) -> Result<(), PciManagerError> { // We only support a single PCIe segment. Calling this function twice is a Firecracker // internal error. assert!(self.pci_segment.is_none()); // Currently we don't assign any IRQs to PCI devices. We will be using MSI-X interrupts // only. 
- let pci_segment = PciSegment::new(0, resource_allocator, &[0u8; 32])?; + let pci_segment = PciSegment::new(0, vm, &[0u8; 32])?; self.pci_segment = Some(pci_segment); Ok(()) } - pub fn save(&self) -> PciDevicesState { - PciDevicesState { - pci_enabled: self.pci_segment.is_some(), + pub(crate) fn attach_pci_virtio_device< + T: 'static + VirtioDevice + MutEventSubscriber + Debug, + >( + &mut self, + vm: &Arc, + id: String, + device: Arc>, + ) -> Result<(), PciManagerError> { + // We should only be reaching this point if PCI is enabled + let pci_segment = self.pci_segment.as_ref().unwrap(); + let pci_device_bdf = pci_segment.next_device_bdf()?; + debug!("Allocating BDF: {pci_device_bdf:?} for device"); + let mem = vm.guest_memory().clone(); + let resource_allocator = &vm.common.resource_allocator; + let device_type: u32 = device.lock().expect("Poisoned lock").device_type(); + + // Allocate one MSI vector per queue, plus one for configuration + let msix_num = + u16::try_from(device.lock().expect("Poisoned lock").queues().len() + 1).unwrap(); + + let msix_vectors = Arc::new(Vm::create_msix_group(vm.clone(), 0, msix_num)?); + + // Create the transport + let mut virtio_device = VirtioPciDevice::new( + id.clone(), + mem, + device, + msix_vectors, + pci_device_bdf.into(), + true, + None, + )?; + + // Allocate bars + let mut mmio32_allocator = resource_allocator + .mmio32_memory + .lock() + .expect("Poisoned lock"); + let mut mmio64_allocator = resource_allocator + .mmio64_memory + .lock() + .expect("Poisoned lock"); + + let bars = + virtio_device.allocate_bars(&mut mmio32_allocator, &mut mmio64_allocator, None)?; + + let virtio_device = Arc::new(Mutex::new(virtio_device)); + pci_segment + .pci_bus + .lock() + .expect("Poisoned lock") + .add_device(pci_device_bdf.device() as u32, virtio_device.clone())?; + + self.virtio_devices + .insert((device_type, id.clone()), virtio_device.clone()); + + for bar in &bars { + match bar.region_type() { + PciBarRegionType::IoRegion => { + 
#[cfg(target_arch = "x86_64")] + vm.pio_bus + .insert(virtio_device.clone(), bar.addr(), bar.size())?; + #[cfg(target_arch = "aarch64")] + log::error!("pci: We do not support I/O region allocation") + } + PciBarRegionType::Memory32BitRegion | PciBarRegionType::Memory64BitRegion => { + vm.common + .mmio_bus + .insert(virtio_device.clone(), bar.addr(), bar.size())?; + } + } } + + let locked_device = virtio_device.lock().expect("Poisoned lock"); + + let bar_addr = locked_device.config_bar_addr(); + for (i, queue_evt) in locked_device + .virtio_device() + .lock() + .expect("Poisoned lock") + .queue_events() + .iter() + .enumerate() + { + const NOTIFICATION_BAR_OFFSET: u64 = 0x6000; + const NOTIFY_OFF_MULTIPLIER: u64 = 4; + let notify_base = bar_addr + NOTIFICATION_BAR_OFFSET; + let io_addr = IoEventAddress::Mmio(notify_base + i as u64 * NOTIFY_OFF_MULTIPLIER); + vm.fd().register_ioevent(queue_evt, &io_addr, NoDatamatch)?; + } + + Ok(()) } - pub fn restore( + fn restore_pci_device( &mut self, - state: &PciDevicesState, - resource_allocator: &Arc, + vm: &Arc, + device: Arc>, + device_id: &str, + transport_state: &VirtioPciDeviceState, + event_manager: &mut EventManager, ) -> Result<(), PciManagerError> { - if state.pci_enabled { - self.attach_pci_segment(resource_allocator)?; + // We should only be reaching this point if PCI is enabled + let pci_segment = self.pci_segment.as_ref().unwrap(); + let msi_vector_group = Arc::new(MsiVectorGroup::restore( + vm.clone(), + &transport_state.msi_vector_group, + )?); + let device_type: u32 = device.lock().expect("Poisoned lock").device_type(); + + let virtio_device = Arc::new(Mutex::new(VirtioPciDevice::new_from_state( + device_id.to_string(), + vm.guest_memory().clone(), + device.clone(), + msi_vector_group, + true, + transport_state.clone(), + )?)); + + pci_segment + .pci_bus + .lock() + .expect("Poisoned lock") + .add_device( + transport_state.pci_device_bdf.device() as u32, + virtio_device.clone(), + )?; + + self.virtio_devices 
+ .insert((device_type, device_id.to_string()), virtio_device.clone()); + + let locked_device = virtio_device.lock().expect("Poisoned lock"); + for bar in &locked_device.bar_regions { + match bar.region_type() { + PciBarRegionType::IoRegion => { + debug!( + "Inserting I/O BAR region: {:#x}:{:#x}", + bar.addr(), + bar.size() + ); + #[cfg(target_arch = "x86_64")] + vm.pio_bus + .insert(virtio_device.clone(), bar.addr(), bar.size())?; + #[cfg(target_arch = "aarch64")] + log::error!("pci: We do not support I/O region allocation") + } + PciBarRegionType::Memory32BitRegion | PciBarRegionType::Memory64BitRegion => { + debug!( + "Inserting MMIO BAR region: {:#x}:{:#x}", + bar.addr(), + bar.size() + ); + vm.common + .mmio_bus + .insert(virtio_device.clone(), bar.addr(), bar.size())?; + } + } + } + + let bar_addr = locked_device.config_bar_addr(); + for (i, queue_evt) in locked_device + .virtio_device() + .lock() + .expect("Poisoned lock") + .queue_events() + .iter() + .enumerate() + { + const NOTIFICATION_BAR_OFFSET: u64 = 0x6000; + const NOTIFY_OFF_MULTIPLIER: u64 = 4; + let notify_base = bar_addr + NOTIFICATION_BAR_OFFSET; + let io_addr = IoEventAddress::Mmio(notify_base + i as u64 * NOTIFY_OFF_MULTIPLIER); + vm.fd().register_ioevent(queue_evt, &io_addr, NoDatamatch)?; } + event_manager.add_subscriber(device); Ok(()) } + + /// Gets the specified device. 
+ pub fn get_virtio_device( + &self, + device_type: u32, + device_id: &str, + ) -> Option<&Arc>> { + self.virtio_devices + .get(&(device_type, device_id.to_string())) + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VirtioDeviceState { + /// Device identifier + pub device_id: String, + /// Device BDF + pub pci_device_bdf: u32, + /// Device state + pub device_state: T, + /// Transport state + pub transport_state: VirtioPciDeviceState, } #[derive(Default, Debug, Clone, Serialize, Deserialize)] pub struct PciDevicesState { - pci_enabled: bool, + /// Whether PCI is enabled + pub pci_enabled: bool, + /// Block device states. + pub block_devices: Vec>, + /// Net device states. + pub net_devices: Vec>, + /// Vsock device state. + pub vsock_device: Option>, + /// Balloon device state. + pub balloon_device: Option>, + /// Mmds version. + pub mmds_version: Option, + /// Entropy device state. + pub entropy_device: Option>, +} + +pub struct PciDevicesConstructorArgs<'a> { + pub vm: Arc, + pub mem: &'a GuestMemoryMmap, + pub vm_resources: &'a mut VmResources, + pub instance_id: &'a str, + pub restored_from_file: bool, + pub event_manager: &'a mut EventManager, +} + +impl<'a> Debug for PciDevicesConstructorArgs<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("PciDevicesConstructorArgs") + .field("vm", &self.vm) + .field("mem", &self.mem) + .field("vm_resources", &self.vm_resources) + .field("instance_id", &self.instance_id) + .field("restored_from_file", &self.restored_from_file) + .finish() + } +} + +impl<'a> Persist<'a> for PciDevices { + type State = PciDevicesState; + type ConstructorArgs = PciDevicesConstructorArgs<'a>; + type Error = PciManagerError; + + fn save(&self) -> Self::State { + let mut state = PciDevicesState::default(); + if self.pci_segment.is_some() { + state.pci_enabled = true; + } else { + return state; + } + + for pci_dev in self.virtio_devices.values() { + let locked_pci_dev = 
pci_dev.lock().expect("Poisoned lock"); + let transport_state = locked_pci_dev.state(); + let virtio_dev = locked_pci_dev.virtio_device(); + let mut locked_virtio_dev = virtio_dev.lock().expect("Poisoned lock"); + + let pci_device_bdf = transport_state.pci_device_bdf.into(); + + match locked_virtio_dev.device_type() { + TYPE_BALLOON => { + let balloon_device = locked_virtio_dev + .as_any() + .downcast_ref::() + .unwrap(); + + let device_state = balloon_device.save(); + + state.balloon_device = Some(VirtioDeviceState { + device_id: balloon_device.id().to_string(), + pci_device_bdf, + device_state, + transport_state, + }); + } + TYPE_BLOCK => { + let block_dev = locked_virtio_dev + .as_mut_any() + .downcast_mut::() + .unwrap(); + if block_dev.is_vhost_user() { + warn!( + "Skipping vhost-user-block device. VhostUserBlock does not support \ + snapshotting yet" + ); + } else { + block_dev.prepare_save(); + let device_state = block_dev.save(); + state.block_devices.push(VirtioDeviceState { + device_id: block_dev.id().to_string(), + pci_device_bdf, + device_state, + transport_state, + }); + } + } + TYPE_NET => { + let net_dev = locked_virtio_dev + .as_mut_any() + .downcast_mut::() + .unwrap(); + if let (Some(mmds_ns), None) = + (net_dev.mmds_ns.as_ref(), state.mmds_version.as_ref()) + { + state.mmds_version = + Some(mmds_ns.mmds.lock().expect("Poisoned lock").version().into()); + } + net_dev.prepare_save(); + let device_state = net_dev.save(); + + state.net_devices.push(VirtioDeviceState { + device_id: net_dev.id().to_string(), + pci_device_bdf, + device_state, + transport_state, + }) + } + TYPE_VSOCK => { + let vsock_dev = locked_virtio_dev + .as_mut_any() + // Currently, VsockUnixBackend is the only implementation of VsockBackend. + .downcast_mut::>() + .unwrap(); + + // Send Transport event to reset connections if device + // is activated. 
+ if vsock_dev.is_activated() { + vsock_dev + .send_transport_reset_event() + .unwrap_or_else(|err| { + error!("Failed to send reset transport event: {:?}", err); + }); + } + + // Save state after potential notification to the guest. This + // way we save changes to the queue the notification can cause. + let vsock_state = VsockState { + backend: vsock_dev.backend().save(), + frontend: vsock_dev.save(), + }; + + state.vsock_device = Some(VirtioDeviceState { + device_id: vsock_dev.id().to_string(), + pci_device_bdf, + device_state: vsock_state, + transport_state, + }); + } + TYPE_RNG => { + let rng_dev = locked_virtio_dev + .as_mut_any() + .downcast_mut::() + .unwrap(); + let device_state = rng_dev.save(); + + state.entropy_device = Some(VirtioDeviceState { + device_id: rng_dev.id().to_string(), + pci_device_bdf, + device_state, + transport_state, + }) + } + _ => unreachable!(), + } + } + + state + } + + fn restore( + constructor_args: Self::ConstructorArgs, + state: &Self::State, + ) -> std::result::Result { + let mem = constructor_args.mem; + let mut pci_devices = PciDevices::new(); + if !state.pci_enabled { + return Ok(pci_devices); + } + + pci_devices.attach_pci_segment(&constructor_args.vm)?; + + if let Some(balloon_state) = &state.balloon_device { + let device = Arc::new(Mutex::new( + Balloon::restore( + BalloonConstructorArgs { + mem: mem.clone(), + restored_from_file: constructor_args.restored_from_file, + }, + &balloon_state.device_state, + ) + .unwrap(), + )); + + constructor_args + .vm_resources + .update_from_restored_device(SharedDeviceType::Balloon(device.clone())) + .unwrap(); + + pci_devices + .restore_pci_device( + &constructor_args.vm, + device, + &balloon_state.device_id, + &balloon_state.transport_state, + constructor_args.event_manager, + ) + .unwrap() + } + + for block_state in &state.block_devices { + let device = Arc::new(Mutex::new( + Block::restore( + BlockConstructorArgs { mem: mem.clone() }, + &block_state.device_state, + ) + .unwrap(), + 
)); + + constructor_args + .vm_resources + .update_from_restored_device(SharedDeviceType::VirtioBlock(device.clone())) + .unwrap(); + + pci_devices + .restore_pci_device( + &constructor_args.vm, + device, + &block_state.device_id, + &block_state.transport_state, + constructor_args.event_manager, + ) + .unwrap() + } + + // If the snapshot has the mmds version persisted, initialise the data store with it. + if let Some(mmds_version) = &state.mmds_version { + constructor_args + .vm_resources + .set_mmds_version(mmds_version.clone().into(), constructor_args.instance_id) + .unwrap(); + } else if state + .net_devices + .iter() + .any(|dev| dev.device_state.mmds_ns.is_some()) + { + // If there's at least one network device having an mmds_ns, it means + // that we are restoring from a version that did not persist the `MmdsVersionState`. + // Init with the default. + constructor_args.vm_resources.mmds_or_default(); + } + + for net_state in &state.net_devices { + let device = Arc::new(Mutex::new( + Net::restore( + NetConstructorArgs { + mem: mem.clone(), + mmds: constructor_args + .vm_resources + .mmds + .as_ref() + // Clone the Arc reference. 
+ .cloned(), + }, + &net_state.device_state, + ) + .unwrap(), + )); + + constructor_args + .vm_resources + .update_from_restored_device(SharedDeviceType::Network(device.clone())) + .unwrap(); + + pci_devices + .restore_pci_device( + &constructor_args.vm, + device, + &net_state.device_id, + &net_state.transport_state, + constructor_args.event_manager, + ) + .unwrap() + } + + if let Some(vsock_state) = &state.vsock_device { + let ctor_args = VsockUdsConstructorArgs { + cid: vsock_state.device_state.frontend.cid, + }; + let backend = + VsockUnixBackend::restore(ctor_args, &vsock_state.device_state.backend).unwrap(); + let device = Arc::new(Mutex::new( + Vsock::restore( + VsockConstructorArgs { + mem: mem.clone(), + backend, + }, + &vsock_state.device_state.frontend, + ) + .unwrap(), + )); + + constructor_args + .vm_resources + .update_from_restored_device(SharedDeviceType::Vsock(device.clone())) + .unwrap(); + + pci_devices + .restore_pci_device( + &constructor_args.vm, + device, + &vsock_state.device_id, + &vsock_state.transport_state, + constructor_args.event_manager, + ) + .unwrap() + } + + if let Some(entropy_state) = &state.entropy_device { + let ctor_args = EntropyConstructorArgs { mem: mem.clone() }; + + let device = Arc::new(Mutex::new( + Entropy::restore(ctor_args, &entropy_state.device_state).unwrap(), + )); + + constructor_args + .vm_resources + .update_from_restored_device(SharedDeviceType::Entropy(device.clone())) + .unwrap(); + + pci_devices + .restore_pci_device( + &constructor_args.vm, + device, + &entropy_state.device_id, + &entropy_state.transport_state, + constructor_args.event_manager, + ) + .unwrap() + } + + Ok(pci_devices) + } +} + +#[cfg(test)] +mod tests { + use vmm_sys_util::tempfile::TempFile; + + use super::*; + use crate::builder::tests::*; + use crate::device_manager; + use crate::devices::virtio::block::CacheType; + use crate::mmds::data_store::MmdsVersion; + use crate::resources::VmmConfig; + use crate::snapshot::Snapshot; + use 
crate::vmm_config::balloon::BalloonDeviceConfig; + use crate::vmm_config::entropy::EntropyDeviceConfig; + use crate::vmm_config::net::NetworkInterfaceConfig; + use crate::vmm_config::vsock::VsockDeviceConfig; + + #[test] + fn test_device_manager_persistence() { + let mut buf = vec![0; 65536]; + // These need to survive so the restored blocks find them. + let _block_files; + let mut tmp_sock_file = TempFile::new().unwrap(); + tmp_sock_file.remove().unwrap(); + // Set up a vmm with one of each device, and get the serialized DeviceStates. + { + let mut event_manager = EventManager::new().expect("Unable to create EventManager"); + let mut vmm = default_vmm(); + vmm.device_manager.enable_pci(&vmm.vm).unwrap(); + let mut cmdline = default_kernel_cmdline(); + + // Add a balloon device. + let balloon_cfg = BalloonDeviceConfig { + amount_mib: 123, + deflate_on_oom: false, + stats_polling_interval_s: 1, + }; + insert_balloon_device(&mut vmm, &mut cmdline, &mut event_manager, balloon_cfg); + // Add a block device. + let drive_id = String::from("root"); + let block_configs = vec![CustomBlockConfig::new( + drive_id, + true, + None, + true, + CacheType::Unsafe, + )]; + _block_files = + insert_block_devices(&mut vmm, &mut cmdline, &mut event_manager, block_configs); + // Add a net device. + let network_interface = NetworkInterfaceConfig { + iface_id: String::from("netif"), + host_dev_name: String::from("hostname"), + guest_mac: None, + rx_rate_limiter: None, + tx_rate_limiter: None, + }; + insert_net_device_with_mmds( + &mut vmm, + &mut cmdline, + &mut event_manager, + network_interface, + MmdsVersion::V2, + ); + // Add a vsock device. + let vsock_dev_id = "vsock"; + let vsock_config = VsockDeviceConfig { + vsock_id: Some(vsock_dev_id.to_string()), + guest_cid: 3, + uds_path: tmp_sock_file.as_path().to_str().unwrap().to_string(), + }; + insert_vsock_device(&mut vmm, &mut cmdline, &mut event_manager, vsock_config); + // Add an entropy device. 
+ let entropy_config = EntropyDeviceConfig::default(); + insert_entropy_device(&mut vmm, &mut cmdline, &mut event_manager, entropy_config); + + Snapshot::serialize(&mut buf.as_mut_slice(), &vmm.device_manager.save()).unwrap(); + } + + tmp_sock_file.remove().unwrap(); + + let mut event_manager = EventManager::new().expect("Unable to create EventManager"); + // Keep in mind we are re-creating here an empty DeviceManager. Restoring later on + // will create a new PciDevices manager different than vmm.pci_devices. We're doing + // this to avoid restoring the whole Vmm, since what we really need from Vmm is the Vm + // object and calling default_vmm() is the easiest way to create one. + let vmm = default_vmm(); + let device_manager_state: device_manager::DevicesState = + Snapshot::deserialize(&mut buf.as_slice()).unwrap(); + let vm_resources = &mut VmResources::default(); + let restore_args = PciDevicesConstructorArgs { + vm: vmm.vm.clone(), + mem: vmm.vm.guest_memory(), + vm_resources, + instance_id: "microvm-id", + restored_from_file: true, + event_manager: &mut event_manager, + }; + let _restored_dev_manager = + PciDevices::restore(restore_args, &device_manager_state.pci_state).unwrap(); + + let expected_vm_resources = format!( + r#"{{ + "balloon": {{ + "amount_mib": 123, + "deflate_on_oom": false, + "stats_polling_interval_s": 1 + }}, + "drives": [ + {{ + "drive_id": "root", + "partuuid": null, + "is_root_device": true, + "cache_type": "Unsafe", + "is_read_only": true, + "path_on_host": "{}", + "rate_limiter": null, + "io_engine": "Sync", + "socket": null + }} + ], + "boot-source": {{ + "kernel_image_path": "", + "initrd_path": null, + "boot_args": null + }}, + "cpu-config": null, + "logger": null, + "machine-config": {{ + "vcpu_count": 1, + "mem_size_mib": 128, + "smt": false, + "track_dirty_pages": false, + "huge_pages": "None" + }}, + "metrics": null, + "mmds-config": {{ + "version": "V2", + "network_interfaces": [ + "netif" + ], + "ipv4_address": 
"169.254.169.254" + }}, + "network-interfaces": [ + {{ + "iface_id": "netif", + "host_dev_name": "hostname", + "guest_mac": null, + "rx_rate_limiter": null, + "tx_rate_limiter": null + }} + ], + "vsock": {{ + "guest_cid": 3, + "uds_path": "{}" + }}, + "entropy": {{ + "rate_limiter": null + }} +}}"#, + _block_files.last().unwrap().as_path().to_str().unwrap(), + tmp_sock_file.as_path().to_str().unwrap() + ); + + assert_eq!( + vm_resources + .mmds + .as_ref() + .unwrap() + .lock() + .unwrap() + .version(), + MmdsVersion::V2 + ); + assert_eq!( + device_manager_state.pci_state.mmds_version.unwrap(), + MmdsVersion::V2.into() + ); + assert_eq!( + expected_vm_resources, + serde_json::to_string_pretty(&VmmConfig::from(&*vm_resources)).unwrap() + ); + } } diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index e3c7d2a8475..87358181df9 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -7,15 +7,11 @@ use std::fmt::{self, Debug}; use std::sync::{Arc, Mutex}; use event_manager::{MutEventSubscriber, SubscriberOps}; -use kvm_ioctls::VmFd; use log::{error, warn}; use serde::{Deserialize, Serialize}; -use vm_allocator::AllocPolicy; use super::acpi::ACPIDeviceManager; use super::mmio::*; -use super::resources::ResourceAllocator; -use crate::EventManager; #[cfg(target_arch = "aarch64")] use crate::arch::DeviceType; use crate::devices::acpi::vmgenid::{VMGenIDState, VMGenIdConstructorArgs, VmGenId, VmGenIdError}; @@ -45,12 +41,13 @@ use crate::devices::virtio::vsock::persist::{ use crate::devices::virtio::vsock::{ TYPE_VSOCK, Vsock, VsockError, VsockUnixBackend, VsockUnixBackendError, }; -use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG}; +use crate::devices::virtio::{ActivateError, TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG}; use crate::mmds::data_store::MmdsVersion; use crate::resources::{ResourcesError, VmResources}; use crate::snapshot::Persist; use 
crate::vmm_config::mmds::MmdsConfigError; use crate::vstate::memory::GuestMemoryMmap; +use crate::{EventManager, Vm}; /// Errors for (de)serialization of the MMIO device manager. #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -80,67 +77,17 @@ pub enum DevicePersistError { Entropy(#[from] EntropyError), /// Resource misconfiguration: {0}. Is the snapshot file corrupted? ResourcesError(#[from] ResourcesError), + /// Could not activate device: {0} + DeviceActivation(#[from] ActivateError), } -/// Holds the state of a balloon device connected to the MMIO space. +/// Holds the state of a MMIO VirtIO device #[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ConnectedBalloonState { +pub struct VirtioDeviceState { /// Device identifier. pub device_id: String, /// Device state. - pub device_state: BalloonState, - /// Mmio transport state. - pub transport_state: MmioTransportState, - /// VmmResources. - pub device_info: MMIODeviceInfo, -} - -/// Holds the state of a virtio block device connected to the MMIO space. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ConnectedBlockState { - /// Device identifier. - pub device_id: String, - /// Device state. - pub device_state: BlockState, - /// Mmio transport state. - pub transport_state: MmioTransportState, - /// VmmResources. - pub device_info: MMIODeviceInfo, -} - -/// Holds the state of a net device connected to the MMIO space. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ConnectedNetState { - /// Device identifier. - pub device_id: String, - /// Device state. - pub device_state: NetState, - /// Mmio transport state. - pub transport_state: MmioTransportState, - /// VmmResources. - pub device_info: MMIODeviceInfo, -} - -/// Holds the state of a vsock device connected to the MMIO space. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ConnectedVsockState { - /// Device identifier. - pub device_id: String, - /// Device state. 
- pub device_state: VsockState, - /// Mmio transport state. - pub transport_state: MmioTransportState, - /// VmmResources. - pub device_info: MMIODeviceInfo, -} - -/// Holds the state of an entropy device connected to the MMIO space. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ConnectedEntropyState { - /// Device identifier. - pub device_id: String, - /// Device state. - pub device_state: EntropyState, + pub device_state: T, /// Mmio transport state. pub transport_state: MmioTransportState, /// VmmResources. @@ -189,17 +136,17 @@ pub struct DeviceStates { // State of legacy devices in MMIO space. pub legacy_devices: Vec, /// Block device states. - pub block_devices: Vec, + pub block_devices: Vec>, /// Net device states. - pub net_devices: Vec, + pub net_devices: Vec>, /// Vsock device state. - pub vsock_device: Option, + pub vsock_device: Option>, /// Balloon device state. - pub balloon_device: Option, + pub balloon_device: Option>, /// Mmds version. pub mmds_version: Option, /// Entropy device state. 
- pub entropy_device: Option, + pub entropy_device: Option>, } /// A type used to extract the concrete `Arc>` for each of the device @@ -215,9 +162,8 @@ pub enum SharedDeviceType { pub struct MMIODevManagerConstructorArgs<'a> { pub mem: &'a GuestMemoryMmap, - pub vm: &'a VmFd, + pub vm: &'a Vm, pub event_manager: &'a mut EventManager, - pub resource_allocator: &'a ResourceAllocator, pub vm_resources: &'a mut VmResources, pub instance_id: &'a str, pub restored_from_file: bool, @@ -240,10 +186,10 @@ pub struct ACPIDeviceManagerState { vmgenid: Option, } +#[derive(Debug)] pub struct ACPIDeviceManagerConstructorArgs<'a> { pub mem: &'a GuestMemoryMmap, - pub resource_allocator: &'a ResourceAllocator, - pub vm: &'a VmFd, + pub vm: &'a Vm, } #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -274,7 +220,7 @@ impl<'a> Persist<'a> for ACPIDeviceManager { let vmgenid = VmGenId::restore( VMGenIdConstructorArgs { mem: constructor_args.mem, - resource_allocator: constructor_args.resource_allocator, + resource_allocator: &constructor_args.vm.common.resource_allocator, }, vmgenid_args, )?; @@ -312,20 +258,22 @@ impl<'a> Persist<'a> for MMIODeviceManager { let _: Result<(), ()> = self.for_each_virtio_device(|_, devid, device| { let mmio_transport_locked = device.inner.lock().expect("Poisoned lock"); let transport_state = mmio_transport_locked.save(); + let device_info = device.resources; + let device_id = devid.clone(); let mut locked_device = mmio_transport_locked.locked_device(); match locked_device.device_type() { TYPE_BALLOON => { - let balloon_state = locked_device + let device_state = locked_device .as_any() .downcast_ref::() .unwrap() .save(); - states.balloon_device = Some(ConnectedBalloonState { - device_id: devid.clone(), - device_state: balloon_state, + states.balloon_device = Some(VirtioDeviceState { + device_id, + device_state, transport_state, - device_info: device.resources, + device_info, }); } // Both virtio-block and vhost-user-block share same device 
type. @@ -338,16 +286,17 @@ impl<'a> Persist<'a> for MMIODeviceManager { ); } else { block.prepare_save(); - states.block_devices.push(ConnectedBlockState { - device_id: devid.clone(), - device_state: block.save(), + let device_state = block.save(); + states.block_devices.push(VirtioDeviceState { + device_id, + device_state, transport_state, - device_info: device.resources, - }) + device_info, + }); } } TYPE_NET => { - let net = locked_device.as_any().downcast_ref::().unwrap(); + let net = locked_device.as_mut_any().downcast_mut::().unwrap(); if let (Some(mmds_ns), None) = (net.mmds_ns.as_ref(), states.mmds_version.as_ref()) { @@ -355,11 +304,13 @@ impl<'a> Persist<'a> for MMIODeviceManager { Some(mmds_ns.mmds.lock().expect("Poisoned lock").version().into()); } - states.net_devices.push(ConnectedNetState { - device_id: devid.clone(), - device_state: net.save(), + net.prepare_save(); + let device_state = net.save(); + states.net_devices.push(VirtioDeviceState { + device_id, + device_state, transport_state, - device_info: device.resources, + device_info, }); } TYPE_VSOCK => { @@ -379,16 +330,16 @@ impl<'a> Persist<'a> for MMIODeviceManager { // Save state after potential notification to the guest. This // way we save changes to the queue the notification can cause. 
- let vsock_state = VsockState { + let device_state = VsockState { backend: vsock.backend().save(), frontend: vsock.save(), }; - states.vsock_device = Some(ConnectedVsockState { - device_id: devid.clone(), - device_state: vsock_state, + states.vsock_device = Some(VirtioDeviceState { + device_id, + device_state, transport_state, - device_info: device.resources, + device_info, }); } TYPE_RNG => { @@ -396,12 +347,13 @@ impl<'a> Persist<'a> for MMIODeviceManager { .as_mut_any() .downcast_mut::() .unwrap(); + let device_state = entropy.save(); - states.entropy_device = Some(ConnectedEntropyState { - device_id: devid.clone(), - device_state: entropy.save(), + states.entropy_device = Some(VirtioDeviceState { + device_id, + device_state, transport_state, - device_info: device.resources, + device_info, }); } _ => unreachable!(), @@ -432,17 +384,12 @@ impl<'a> Persist<'a> for MMIODeviceManager { .event_manager .add_subscriber(serial.clone()); - dev_manager.register_mmio_serial( - vm, - constructor_args.resource_allocator, - serial, - Some(state.device_info), - )?; + dev_manager.register_mmio_serial(vm, serial, Some(state.device_info))?; } if state.type_ == DeviceType::Rtc { let rtc = Arc::new(Mutex::new(RTCDevice::new())); dev_manager.register_mmio_rtc( - constructor_args.resource_allocator, + constructor_args.vm, rtc, Some(state.device_info), )?; @@ -451,19 +398,19 @@ impl<'a> Persist<'a> for MMIODeviceManager { } let mut restore_helper = |device: Arc>, + activated: bool, is_vhost_user: bool, as_subscriber: Arc>, id: &String, state: &MmioTransportState, - interrupt: Arc, device_info: &MMIODeviceInfo, - mmio_bus: &vm_device::Bus, event_manager: &mut EventManager| -> Result<(), Self::Error> { + let interrupt = Arc::new(IrqTrigger::new()); let restore_args = MmioTransportConstructorArgs { mem: mem.clone(), - interrupt, - device, + interrupt: interrupt.clone(), + device: device.clone(), is_vhost_user, }; let mmio_transport = Arc::new(Mutex::new( @@ -471,47 +418,30 @@ impl<'a> 
Persist<'a> for MMIODeviceManager { .map_err(|()| DevicePersistError::MmioTransport)?, )); - // We do not currently require exact re-allocation of IDs via - // `dev_manager.irq_allocator.allocate_id()` and currently cannot do - // this effectively as `IdAllocator` does not implement an exact - // match API. - // In the future we may require preserving `IdAllocator`'s state - // after snapshot restore so as to restore the exact interrupt IDs - // from the original device's state for implementing hot-plug. - // For now this is why we do not restore the state of the - // `IdAllocator` under `dev_manager`. - - constructor_args - .resource_allocator - .allocate_32bit_mmio_memory( - MMIO_LEN, - MMIO_LEN, - AllocPolicy::ExactMatch(device_info.addr), - ) - .map_err(|e| { - DevicePersistError::DeviceManager(super::mmio::MmioError::Allocator(e)) - })?; - dev_manager.register_mmio_virtio( vm, id.clone(), - mmio_bus, MMIODevice { resources: *device_info, inner: mmio_transport, }, )?; + if activated { + device + .lock() + .expect("Poisoned lock") + .activate(mem.clone(), interrupt)?; + } + event_manager.add_subscriber(as_subscriber); Ok(()) }; if let Some(balloon_state) = &state.balloon_device { - let interrupt = Arc::new(IrqTrigger::new()); let device = Arc::new(Mutex::new(Balloon::restore( BalloonConstructorArgs { mem: mem.clone(), - interrupt: interrupt.clone(), restored_from_file: constructor_args.restored_from_file, }, &balloon_state.device_state, @@ -523,24 +453,19 @@ impl<'a> Persist<'a> for MMIODeviceManager { restore_helper( device.clone(), + balloon_state.device_state.virtio_state.activated, false, device, &balloon_state.device_id, &balloon_state.transport_state, - interrupt, &balloon_state.device_info, - &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, )?; } for block_state in &state.block_devices { - let interrupt = Arc::new(IrqTrigger::new()); let device = Arc::new(Mutex::new(Block::restore( - BlockConstructorArgs { - mem: mem.clone(), 
- interrupt: interrupt.clone(), - }, + BlockConstructorArgs { mem: mem.clone() }, &block_state.device_state, )?)); @@ -550,13 +475,12 @@ impl<'a> Persist<'a> for MMIODeviceManager { restore_helper( device.clone(), + block_state.device_state.is_activated(), false, device, &block_state.device_id, &block_state.transport_state, - interrupt, &block_state.device_info, - &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, )?; } @@ -578,11 +502,9 @@ impl<'a> Persist<'a> for MMIODeviceManager { } for net_state in &state.net_devices { - let interrupt = Arc::new(IrqTrigger::new()); let device = Arc::new(Mutex::new(Net::restore( NetConstructorArgs { mem: mem.clone(), - interrupt: interrupt.clone(), mmds: constructor_args .vm_resources .mmds @@ -599,13 +521,12 @@ impl<'a> Persist<'a> for MMIODeviceManager { restore_helper( device.clone(), + net_state.device_state.virtio_state.activated, false, device, &net_state.device_id, &net_state.transport_state, - interrupt, &net_state.device_info, - &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, )?; } @@ -615,11 +536,9 @@ impl<'a> Persist<'a> for MMIODeviceManager { cid: vsock_state.device_state.frontend.cid, }; let backend = VsockUnixBackend::restore(ctor_args, &vsock_state.device_state.backend)?; - let interrupt = Arc::new(IrqTrigger::new()); let device = Arc::new(Mutex::new(Vsock::restore( VsockConstructorArgs { mem: mem.clone(), - interrupt: interrupt.clone(), backend, }, &vsock_state.device_state.frontend, @@ -631,20 +550,18 @@ impl<'a> Persist<'a> for MMIODeviceManager { restore_helper( device.clone(), + vsock_state.device_state.frontend.virtio_state.activated, false, device, &vsock_state.device_id, &vsock_state.transport_state, - interrupt, &vsock_state.device_info, - &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, )?; } if let Some(entropy_state) = &state.entropy_device { - let interrupt = Arc::new(IrqTrigger::new()); - let ctor_args = 
EntropyConstructorArgs::new(mem.clone(), interrupt.clone()); + let ctor_args = EntropyConstructorArgs { mem: mem.clone() }; let device = Arc::new(Mutex::new(Entropy::restore( ctor_args, @@ -657,13 +574,12 @@ impl<'a> Persist<'a> for MMIODeviceManager { restore_helper( device.clone(), + entropy_state.device_state.virtio_state.activated, false, device, &entropy_state.device_id, &entropy_state.transport_state, - interrupt, &entropy_state.device_info, - &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, )?; } @@ -678,6 +594,7 @@ mod tests { use super::*; use crate::builder::tests::*; + use crate::device_manager; use crate::devices::virtio::block::CacheType; use crate::resources::VmmConfig; use crate::snapshot::Snapshot; @@ -686,29 +603,8 @@ mod tests { use crate::vmm_config::net::NetworkInterfaceConfig; use crate::vmm_config::vsock::VsockDeviceConfig; - impl PartialEq for ConnectedBalloonState { - fn eq(&self, other: &ConnectedBalloonState) -> bool { - // Actual device state equality is checked by the device's tests. - self.transport_state == other.transport_state && self.device_info == other.device_info - } - } - - impl PartialEq for ConnectedBlockState { - fn eq(&self, other: &ConnectedBlockState) -> bool { - // Actual device state equality is checked by the device's tests. - self.transport_state == other.transport_state && self.device_info == other.device_info - } - } - - impl PartialEq for ConnectedNetState { - fn eq(&self, other: &ConnectedNetState) -> bool { - // Actual device state equality is checked by the device's tests. - self.transport_state == other.transport_state && self.device_info == other.device_info - } - } - - impl PartialEq for ConnectedVsockState { - fn eq(&self, other: &ConnectedVsockState) -> bool { + impl PartialEq for VirtioDeviceState { + fn eq(&self, other: &VirtioDeviceState) -> bool { // Actual device state equality is checked by the device's tests. 
self.transport_state == other.transport_state && self.device_info == other.device_info } @@ -720,6 +616,7 @@ mod tests { && self.block_devices == other.block_devices && self.net_devices == other.net_devices && self.vsock_device == other.vsock_device + && self.entropy_device == other.entropy_device } } @@ -748,11 +645,10 @@ mod tests { #[test] fn test_device_manager_persistence() { - let mut buf = vec![0; 16384]; + let mut buf = vec![0; 65536]; // These need to survive so the restored blocks find them. let _block_files; let mut tmp_sock_file = TempFile::new().unwrap(); - let resource_allocator = ResourceAllocator::new().unwrap(); tmp_sock_file.remove().unwrap(); // Set up a vmm with one of each device, and get the serialized DeviceStates. { @@ -812,19 +708,19 @@ mod tests { let mut event_manager = EventManager::new().expect("Unable to create EventManager"); let vmm = default_vmm(); - let device_states: DeviceStates = Snapshot::deserialize(&mut buf.as_slice()).unwrap(); + let device_manager_state: device_manager::DevicesState = + Snapshot::deserialize(&mut buf.as_slice()).unwrap(); let vm_resources = &mut VmResources::default(); let restore_args = MMIODevManagerConstructorArgs { mem: vmm.vm.guest_memory(), - vm: vmm.vm.fd(), + vm: &vmm.vm, event_manager: &mut event_manager, - resource_allocator: &resource_allocator, vm_resources, instance_id: "microvm-id", restored_from_file: true, }; let _restored_dev_manager = - MMIODeviceManager::restore(restore_args, &device_states).unwrap(); + MMIODeviceManager::restore(restore_args, &device_manager_state.mmio_state).unwrap(); let expected_vm_resources = format!( r#"{{ @@ -899,7 +795,10 @@ mod tests { .version(), MmdsVersion::V2 ); - assert_eq!(device_states.mmds_version.unwrap(), MmdsVersion::V2.into()); + assert_eq!( + device_manager_state.mmio_state.mmds_version.unwrap(), + MmdsVersion::V2.into() + ); assert_eq!( expected_vm_resources, serde_json::to_string_pretty(&VmmConfig::from(&*vm_resources)).unwrap() diff --git 
a/src/vmm/src/devices/acpi/vmgenid.rs b/src/vmm/src/devices/acpi/vmgenid.rs index df0656bfbcc..5c8d4ecbc51 100644 --- a/src/vmm/src/devices/acpi/vmgenid.rs +++ b/src/vmm/src/devices/acpi/vmgenid.rs @@ -11,9 +11,9 @@ use vm_superio::Trigger; use vmm_sys_util::eventfd::EventFd; use super::super::legacy::EventFdTrigger; -use crate::device_manager::resources::ResourceAllocator; use crate::snapshot::Persist; use crate::vstate::memory::{Bytes, GuestMemoryMmap}; +use crate::vstate::resources::ResourceAllocator; /// Bytes of memory we allocate for VMGenID device pub const VMGENID_MEM_SIZE: u64 = 16; @@ -152,11 +152,6 @@ impl<'a> Persist<'a> for VmGenId { constructor_args: Self::ConstructorArgs, state: &Self::State, ) -> std::result::Result { - constructor_args.resource_allocator.allocate_system_memory( - VMGENID_MEM_SIZE, - 8, - vm_allocator::AllocPolicy::ExactMatch(state.addr), - )?; Self::from_parts(GuestAddress(state.addr), state.gsi, constructor_args.mem) } } diff --git a/src/vmm/src/devices/pci/pci_segment.rs b/src/vmm/src/devices/pci/pci_segment.rs index 169ffdcba3b..c37763eab3a 100644 --- a/src/vmm/src/devices/pci/pci_segment.rs +++ b/src/vmm/src/devices/pci/pci_segment.rs @@ -21,8 +21,8 @@ use uuid::Uuid; use vm_allocator::AddressAllocator; use vm_device::{BusDeviceSync, BusError}; -use crate::arch::{PCI_MMCONFIG_START, PCI_MMIO_CONFIG_SIZE_PER_SEGMENT}; -use crate::device_manager::resources::ResourceAllocator; +use crate::arch::{ArchVm as Vm, PCI_MMCONFIG_START, PCI_MMIO_CONFIG_SIZE_PER_SEGMENT}; +use crate::vstate::resources::ResourceAllocator; pub struct PciSegment { pub(crate) id: u16, @@ -67,28 +67,21 @@ impl std::fmt::Debug for PciSegment { } impl PciSegment { - fn build( - id: u16, - resource_allocator: &Arc, - pci_irq_slots: &[u8; 32], - ) -> Result { + fn build(id: u16, vm: &Arc, pci_irq_slots: &[u8; 32]) -> Result { let pci_root = PciRoot::new(None); - let pci_bus = Arc::new(Mutex::new(PciBus::new( - pci_root, - resource_allocator.clone(), - ))); + let 
pci_bus = Arc::new(Mutex::new(PciBus::new(pci_root, vm.clone()))); let pci_config_mmio = Arc::new(Mutex::new(PciConfigMmio::new(Arc::clone(&pci_bus)))); let mmio_config_address = PCI_MMCONFIG_START + PCI_MMIO_CONFIG_SIZE_PER_SEGMENT * id as u64; - resource_allocator.mmio_bus.insert( + vm.common.mmio_bus.insert( Arc::clone(&pci_config_mmio) as Arc, mmio_config_address, PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, )?; - let mem32_allocator = resource_allocator.mmio32_memory.clone(); - let mem64_allocator = resource_allocator.mmio64_memory.clone(); + let mem32_allocator = vm.common.resource_allocator.mmio32_memory.clone(); + let mem64_allocator = vm.common.resource_allocator.mmio64_memory.clone(); let start_of_mem32_area = mem32_allocator.lock().unwrap().base(); let end_of_mem32_area = mem32_allocator.lock().unwrap().end(); @@ -119,13 +112,15 @@ impl PciSegment { #[cfg(target_arch = "x86_64")] pub(crate) fn new( id: u16, - resource_allocator: &Arc, + vm: &Arc, pci_irq_slots: &[u8; 32], ) -> Result { - let mut segment = Self::build(id, resource_allocator, pci_irq_slots)?; + use crate::Vm; + + let mut segment = Self::build(id, vm, pci_irq_slots)?; let pci_config_io = Arc::new(Mutex::new(PciConfigIo::new(Arc::clone(&segment.pci_bus)))); - resource_allocator.pio_bus.insert( + vm.pio_bus.insert( pci_config_io.clone(), PCI_CONFIG_IO_PORT, PCI_CONFIG_IO_PORT_SIZE, @@ -151,10 +146,10 @@ impl PciSegment { #[cfg(target_arch = "aarch64")] pub(crate) fn new( id: u16, - resource_allocator: &Arc, + vm: &Arc, pci_irq_slots: &[u8; 32], ) -> Result { - let segment = Self::build(id, resource_allocator, pci_irq_slots)?; + let segment = Self::build(id, vm, pci_irq_slots)?; info!( "pci: adding PCI segment: id={:#x}, PCI MMIO config address: {:#x}, mem32 area: \ [{:#x}-{:#x}], mem64 area: [{:#x}-{:#x}]", @@ -462,3 +457,100 @@ impl Aml for PciSegment { .append_aml_bytes(v) } } + +#[cfg(test)] +mod tests { + + use super::*; + use crate::arch; + use crate::builder::tests::default_vmm; + use 
crate::utils::u64_to_usize; + + #[test] + fn test_pci_segment_build() { + let vmm = default_vmm(); + let pci_irq_slots = &[0u8; 32]; + let pci_segment = PciSegment::new(0, &vmm.vm, pci_irq_slots).unwrap(); + + assert_eq!(pci_segment.id, 0); + assert_eq!( + pci_segment.start_of_mem32_area, + arch::MEM_32BIT_DEVICES_START + ); + assert_eq!( + pci_segment.end_of_mem32_area, + arch::MEM_32BIT_DEVICES_START + arch::MEM_32BIT_DEVICES_SIZE - 1 + ); + assert_eq!( + pci_segment.start_of_mem64_area, + arch::MEM_64BIT_DEVICES_START + ); + assert_eq!( + pci_segment.end_of_mem64_area, + arch::MEM_64BIT_DEVICES_START + arch::MEM_64BIT_DEVICES_SIZE - 1 + ); + assert_eq!(pci_segment.mmio_config_address, arch::PCI_MMCONFIG_START); + assert_eq!(pci_segment.proximity_domain, 0); + assert_eq!(pci_segment.pci_devices_up, 0); + assert_eq!(pci_segment.pci_devices_down, 0); + assert_eq!(pci_segment.pci_irq_slots, [0u8; 32]); + } + + #[cfg(target_arch = "x86_64")] + #[test] + fn test_io_bus() { + let vmm = default_vmm(); + let pci_irq_slots = &[0u8; 32]; + let pci_segment = PciSegment::new(0, &vmm.vm, pci_irq_slots).unwrap(); + + let mut data = [0u8; u64_to_usize(PCI_CONFIG_IO_PORT_SIZE)]; + vmm.vm.pio_bus.read(PCI_CONFIG_IO_PORT, &mut data).unwrap(); + + vmm.vm + .pio_bus + .read(PCI_CONFIG_IO_PORT + PCI_CONFIG_IO_PORT_SIZE, &mut data) + .unwrap_err(); + } + + #[test] + fn test_mmio_bus() { + let vmm = default_vmm(); + let pci_irq_slots = &[0u8; 32]; + let pci_segment = PciSegment::new(0, &vmm.vm, pci_irq_slots).unwrap(); + + let mut data = [0u8; u64_to_usize(PCI_MMIO_CONFIG_SIZE_PER_SEGMENT)]; + + vmm.vm + .common + .mmio_bus + .read(pci_segment.mmio_config_address, &mut data) + .unwrap(); + vmm.vm + .common + .mmio_bus + .read( + pci_segment.mmio_config_address + PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, + &mut data, + ) + .unwrap_err(); + } + + #[test] + fn test_next_device_bdf() { + let vmm = default_vmm(); + let pci_irq_slots = &[0u8; 32]; + let pci_segment = PciSegment::new(0, &vmm.vm, 
pci_irq_slots).unwrap(); + + // Start checking from device id 1, since 0 is allocated to the Root port. + for dev_id in 1..32 { + let bdf = pci_segment.next_device_bdf().unwrap(); + // In our case we have a single Segment with id 0, which has + // a single bus with id 0. Also, each device of ours has a + // single function. + assert_eq!(bdf, PciBdf::new(0, 0, dev_id, 0)); + } + + // We can only have 32 devices on a segment + pci_segment.next_device_bdf().unwrap_err(); + } +} diff --git a/src/vmm/src/devices/virtio/balloon/persist.rs b/src/vmm/src/devices/virtio/balloon/persist.rs index a6634d07170..15ae1e26b9e 100644 --- a/src/vmm/src/devices/virtio/balloon/persist.rs +++ b/src/vmm/src/devices/virtio/balloon/persist.rs @@ -87,7 +87,7 @@ pub struct BalloonState { stats_desc_index: Option, latest_stats: BalloonStatsState, config_space: BalloonConfigSpaceState, - virtio_state: VirtioDeviceState, + pub virtio_state: VirtioDeviceState, } /// Auxiliary structure for creating a device when resuming from a snapshot. @@ -95,8 +95,6 @@ pub struct BalloonState { pub struct BalloonConstructorArgs { /// Pointer to guest memory. pub mem: GuestMemoryMmap, - /// Interrupt used from the device. - pub interrupt: Arc, pub restored_from_file: bool, } @@ -154,25 +152,18 @@ impl Persist<'_> for Balloon { actual_pages: state.config_space.actual_pages, }; - if state.virtio_state.activated { - balloon.device_state = DeviceState::Activated(ActiveState { - mem: constructor_args.mem, - interrupt: constructor_args.interrupt, - }); - - if balloon.stats_enabled() { - // Restore the stats descriptor. - balloon.set_stats_desc_index(state.stats_desc_index); - - // Restart timer if needed. 
- let timer_state = TimerState::Periodic { - current: Duration::from_secs(u64::from(state.stats_polling_interval_s)), - interval: Duration::from_secs(u64::from(state.stats_polling_interval_s)), - }; - balloon - .stats_timer - .set_state(timer_state, SetTimeFlags::Default); - } + if state.virtio_state.activated && balloon.stats_enabled() { + // Restore the stats descriptor. + balloon.set_stats_desc_index(state.stats_desc_index); + + // Restart timer if needed. + let timer_state = TimerState::Periodic { + current: Duration::from_secs(u64::from(state.stats_polling_interval_s)), + interval: Duration::from_secs(u64::from(state.stats_polling_interval_s)), + }; + balloon + .stats_timer + .set_state(timer_state, SetTimeFlags::Default); } Ok(balloon) @@ -202,7 +193,6 @@ mod tests { let restored_balloon = Balloon::restore( BalloonConstructorArgs { mem: guest_mem, - interrupt: default_interrupt(), restored_from_file: true, }, &Snapshot::deserialize(&mut mem.as_slice()).unwrap(), diff --git a/src/vmm/src/devices/virtio/block/persist.rs b/src/vmm/src/devices/virtio/block/persist.rs index 57712a8fb3a..cb9a6471137 100644 --- a/src/vmm/src/devices/virtio/block/persist.rs +++ b/src/vmm/src/devices/virtio/block/persist.rs @@ -17,9 +17,17 @@ pub enum BlockState { VhostUser(VhostUserBlockState), } +impl BlockState { + pub fn is_activated(&self) -> bool { + match self { + BlockState::Virtio(virtio_block_state) => virtio_block_state.virtio_state.activated, + BlockState::VhostUser(vhost_user_block_state) => false, + } + } +} + /// Auxiliary structure for creating a device when resuming from a snapshot. 
#[derive(Debug)] pub struct BlockConstructorArgs { pub mem: GuestMemoryMmap, - pub interrupt: Arc, } diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index aa28a325e1c..bcfea7b6676 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -397,7 +397,7 @@ impl VirtioBlock { if queue.prepare_kick() { interrupt - .trigger(VirtioInterruptType::Queue(index)) + .trigger(VirtioInterruptType::Queue(0)) .unwrap_or_else(|_| { block_metrics.event_fails.inc(); }); diff --git a/src/vmm/src/devices/virtio/block/virtio/persist.rs b/src/vmm/src/devices/virtio/block/virtio/persist.rs index 57e4a11b9c1..1c7a1bce106 100644 --- a/src/vmm/src/devices/virtio/block/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/block/virtio/persist.rs @@ -58,7 +58,7 @@ pub struct VirtioBlockState { cache_type: CacheType, root_device: bool, disk_path: String, - virtio_state: VirtioDeviceState, + pub virtio_state: VirtioDeviceState, rate_limiter_state: RateLimiterState, file_engine_type: FileEngineTypeState, } @@ -111,15 +111,6 @@ impl Persist<'_> for VirtioBlock { let avail_features = state.virtio_state.avail_features; let acked_features = state.virtio_state.acked_features; - let device_state = if state.virtio_state.activated { - DeviceState::Activated(ActiveState { - mem: constructor_args.mem, - interrupt: constructor_args.interrupt, - }) - } else { - DeviceState::Inactive - }; - let config_space = ConfigSpace { capacity: disk_properties.nsectors.to_le(), }; @@ -132,7 +123,7 @@ impl Persist<'_> for VirtioBlock { queues, queue_evts, - device_state, + device_state: DeviceState::Inactive, id: state.id.clone(), partuuid: state.partuuid.clone(), @@ -227,10 +218,7 @@ mod tests { // Restore the block device. 
let restored_block = VirtioBlock::restore( - BlockConstructorArgs { - mem: guest_mem, - interrupt: default_interrupt(), - }, + BlockConstructorArgs { mem: guest_mem }, &Snapshot::deserialize(&mut mem.as_slice()).unwrap(), ) .unwrap(); diff --git a/src/vmm/src/devices/virtio/device.rs b/src/vmm/src/devices/virtio/device.rs index 083cd1bb54f..49ac1802447 100644 --- a/src/vmm/src/devices/virtio/device.rs +++ b/src/vmm/src/devices/virtio/device.rs @@ -148,7 +148,7 @@ pub trait VirtioDevice: AsAny + Send { /// Optionally deactivates this device and returns ownership of the guest memory map, interrupt /// event, and queue events. - fn reset(&mut self) -> Option<(EventFd, Vec)> { + fn reset(&mut self) -> Option<(Arc, Vec)> { None } diff --git a/src/vmm/src/devices/virtio/net/device.rs b/src/vmm/src/devices/virtio/net/device.rs index e8c0135263c..bf7a91e21f3 100755 --- a/src/vmm/src/devices/virtio/net/device.rs +++ b/src/vmm/src/devices/virtio/net/device.rs @@ -8,6 +8,7 @@ use std::collections::VecDeque; use std::mem::{self}; use std::net::Ipv4Addr; +use std::num::Wrapping; use std::ops::Deref; use std::sync::{Arc, Mutex}; @@ -930,6 +931,26 @@ impl Net { let _ = self.resume_rx(); let _ = self.process_tx(); } + + /// Prepare saving state + pub fn prepare_save(&mut self) { + // We shouldn't be messing with the queue if the device is not activated. + // Anyways, if it isn't there's nothing to prepare; we haven't parsed any + // descriptors yet from it and we can't have a deferred frame. 
+ if !self.is_activated() { + return; + } + + // Give potential deferred RX frame to guest + self.rx_buffer.finish_frame(&mut self.queues[RX_INDEX]); + // Reset the parsed available descriptors, so we will re-parse them + self.queues[RX_INDEX].next_avail -= + Wrapping(u16::try_from(self.rx_buffer.parsed_descriptors.len()).unwrap()); + self.rx_buffer.parsed_descriptors.clear(); + self.rx_buffer.iovec.clear(); + self.rx_buffer.used_bytes = 0; + self.rx_buffer.used_descriptors = 0; + } } impl VirtioDevice for Net { diff --git a/src/vmm/src/devices/virtio/net/persist.rs b/src/vmm/src/devices/virtio/net/persist.rs index 961b56556c8..6ef8ad842ac 100644 --- a/src/vmm/src/devices/virtio/net/persist.rs +++ b/src/vmm/src/devices/virtio/net/persist.rs @@ -30,27 +30,6 @@ pub struct NetConfigSpaceState { guest_mac: Option, } -/// Information about the parsed RX buffers -#[derive(Debug, Default, Clone, Serialize, Deserialize)] -pub struct RxBufferState { - // Number of iovecs we have parsed from the guest - parsed_descriptor_chains_nr: u16, - // Number of used descriptors - used_descriptors: u16, - // Number of used bytes - used_bytes: u32, -} - -impl RxBufferState { - fn from_rx_buffers(rx_buffer: &RxBuffers) -> Self { - RxBufferState { - parsed_descriptor_chains_nr: rx_buffer.parsed_descriptors.len().try_into().unwrap(), - used_descriptors: rx_buffer.used_descriptors, - used_bytes: rx_buffer.used_bytes, - } - } -} - /// Information about the network device that are saved /// at snapshot. #[derive(Debug, Clone, Serialize, Deserialize)] @@ -62,8 +41,7 @@ pub struct NetState { /// The associated MMDS network stack. pub mmds_ns: Option, config_space: NetConfigSpaceState, - virtio_state: VirtioDeviceState, - rx_buffers_state: RxBufferState, + pub virtio_state: VirtioDeviceState, } /// Auxiliary structure for creating a device when resuming from a snapshot. @@ -71,8 +49,6 @@ pub struct NetState { pub struct NetConstructorArgs { /// Pointer to guest memory. 
pub mem: GuestMemoryMmap, - /// Interrupt for the device. - pub interrupt: Arc, /// Pointer to the MMDS data store. pub mmds: Option>>, } @@ -108,7 +84,6 @@ impl Persist<'_> for Net { guest_mac: self.guest_mac, }, virtio_state: VirtioDeviceState::from_device(self), - rx_buffers_state: RxBufferState::from_rx_buffers(&self.rx_buffer), } } @@ -153,25 +128,6 @@ impl Persist<'_> for Net { net.avail_features = state.virtio_state.avail_features; net.acked_features = state.virtio_state.acked_features; - if state.virtio_state.activated { - let supported_flags: u32 = Net::build_tap_offload_features(net.acked_features); - net.tap - .set_offload(supported_flags) - .map_err(NetPersistError::TapSetOffload)?; - - net.device_state = DeviceState::Activated(ActiveState { - mem: constructor_args.mem, - interrupt: constructor_args.interrupt, - }); - - // Recreate `Net::rx_buffer`. We do it by re-parsing the RX queue. We're temporarily - // rolling back `next_avail` in the RX queue and call `parse_rx_descriptors`. 
- net.queues[RX_INDEX].next_avail -= state.rx_buffers_state.parsed_descriptor_chains_nr; - net.parse_rx_descriptors(); - net.rx_buffer.used_descriptors = state.rx_buffers_state.used_descriptors; - net.rx_buffer.used_bytes = state.rx_buffers_state.used_bytes; - } - Ok(net) } } @@ -215,7 +171,6 @@ mod tests { match Net::restore( NetConstructorArgs { mem: guest_mem, - interrupt: default_interrupt(), mmds: mmds_ds, }, &Snapshot::deserialize(&mut mem.as_slice()).unwrap(), diff --git a/src/vmm/src/devices/virtio/queue.rs b/src/vmm/src/devices/virtio/queue.rs index 686d3ee3da3..84cb60dd59e 100644 --- a/src/vmm/src/devices/virtio/queue.rs +++ b/src/vmm/src/devices/virtio/queue.rs @@ -262,7 +262,7 @@ impl Queue { pub fn new(max_size: u16) -> Queue { Queue { max_size, - size: 0, + size: max_size, ready: false, desc_table_address: GuestAddress(0), avail_ring_address: GuestAddress(0), @@ -712,6 +712,19 @@ impl Queue { new - used_event - Wrapping(1) < new - old } + + /// Resets the Virtio Queue + pub(crate) fn reset(&mut self) { + self.ready = false; + self.size = self.max_size; + self.desc_table_address = GuestAddress(0); + self.avail_ring_address = GuestAddress(0); + self.used_ring_address = GuestAddress(0); + self.next_avail = Wrapping(0); + self.next_used = Wrapping(0); + self.num_added = Wrapping(0); + self.uses_notif_suppression = false; + } } #[cfg(kani)] diff --git a/src/vmm/src/devices/virtio/rng/persist.rs b/src/vmm/src/devices/virtio/rng/persist.rs index 75db947c9c7..d266e259418 100644 --- a/src/vmm/src/devices/virtio/rng/persist.rs +++ b/src/vmm/src/devices/virtio/rng/persist.rs @@ -19,20 +19,13 @@ use crate::vstate::memory::GuestMemoryMmap; #[derive(Debug, Clone, Serialize, Deserialize)] pub struct EntropyState { - virtio_state: VirtioDeviceState, + pub virtio_state: VirtioDeviceState, rate_limiter_state: RateLimiterState, } #[derive(Debug)] pub struct EntropyConstructorArgs { - mem: GuestMemoryMmap, - interrupt: Arc, -} - -impl EntropyConstructorArgs { - pub fn 
new(mem: GuestMemoryMmap, interrupt: Arc) -> Self { - Self { mem, interrupt } - } + pub mem: GuestMemoryMmap, } #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -72,9 +65,6 @@ impl Persist<'_> for Entropy { let mut entropy = Entropy::new_with_queues(queues, rate_limiter)?; entropy.set_avail_features(state.virtio_state.avail_features); entropy.set_acked_features(state.virtio_state.acked_features); - if state.virtio_state.activated { - entropy.set_activated(constructor_args.mem, constructor_args.interrupt); - } Ok(entropy) } @@ -99,7 +89,7 @@ mod tests { let guest_mem = create_virtio_mem(); let restored = Entropy::restore( - EntropyConstructorArgs::new(guest_mem, default_interrupt()), + EntropyConstructorArgs { mem: guest_mem }, &Snapshot::deserialize(&mut mem.as_slice()).unwrap(), ) .unwrap(); diff --git a/src/vmm/src/devices/virtio/transport/mmio.rs b/src/vmm/src/devices/virtio/transport/mmio.rs index 9871cb0ed6e..54837694ed4 100644 --- a/src/vmm/src/devices/virtio/transport/mmio.rs +++ b/src/vmm/src/devices/virtio/transport/mmio.rs @@ -798,7 +798,7 @@ pub(crate) mod tests { assert_eq!(d.queue_select, 3); d.queue_select = 0; - assert_eq!(d.locked_device().queues()[0].size, 0); + assert_eq!(d.locked_device().queues()[0].size, 16); write_le_u32(&mut buf[..], 16); d.write(0x0, 0x38, &buf[..]); assert_eq!(d.locked_device().queues()[0].size, 16); diff --git a/src/vmm/src/devices/virtio/transport/mod.rs b/src/vmm/src/devices/virtio/transport/mod.rs index d41ad943aa2..c16a7adbe9d 100644 --- a/src/vmm/src/devices/virtio/transport/mod.rs +++ b/src/vmm/src/devices/virtio/transport/mod.rs @@ -8,6 +8,8 @@ use vmm_sys_util::eventfd::EventFd; /// MMIO transport for VirtIO devices pub mod mmio; +/// PCI transport for VirtIO devices +pub mod pci; /// Represents the types of interrupts used by VirtIO devices #[derive(Debug, Clone)] diff --git a/src/vmm/src/devices/virtio/transport/pci/common_config.rs b/src/vmm/src/devices/virtio/transport/pci/common_config.rs new file 
mode 100644 index 00000000000..6e52a1ca007 --- /dev/null +++ b/src/vmm/src/devices/virtio/transport/pci/common_config.rs @@ -0,0 +1,415 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +use std::sync::atomic::{AtomicU16, Ordering}; +use std::sync::{Arc, Mutex}; + +use byteorder::{ByteOrder, LittleEndian}; +use serde::{Deserialize, Serialize}; +use vm_memory::GuestAddress; + +use crate::devices::virtio::device::VirtioDevice; +use crate::devices::virtio::queue::Queue; +use crate::logger::{debug, error, info, trace, warn}; +pub const VIRTIO_PCI_COMMON_CONFIG_ID: &str = "virtio_pci_common_config"; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VirtioPciCommonConfigState { + pub driver_status: u8, + pub config_generation: u8, + pub device_feature_select: u32, + pub driver_feature_select: u32, + pub queue_select: u16, + pub msix_config: u16, + pub msix_queues: Vec, +} + +// The standard layout for the ring is a continuous chunk of memory which looks +// like this. We assume num is a power of 2. +// +// struct vring +// { +// // The actual descriptors (16 bytes each) +// struct vring_desc desc[num]; +// +// // A ring of available descriptor heads with free-running index. +// __virtio16 avail_flags; +// __virtio16 avail_idx; +// __virtio16 available[num]; +// __virtio16 used_event_idx; +// +// // Padding to the next align boundary. +// char pad[]; +// +// // A ring of used descriptor heads with free-running index. 
+// __virtio16 used_flags; +// __virtio16 used_idx; +// struct vring_used_elem used[num]; +// __virtio16 avail_event_idx; +// }; +// struct vring_desc { +// __virtio64 addr; +// __virtio32 len; +// __virtio16 flags; +// __virtio16 next; +// }; +// +// struct vring_avail { +// __virtio16 flags; +// __virtio16 idx; +// __virtio16 ring[]; +// }; +// +// // u32 is used here for ids for padding reasons. +// struct vring_used_elem { +// // Index of start of used descriptor chain. +// __virtio32 id; +// // Total length of the descriptor chain which was used (written to) +// __virtio32 len; +// }; +// +// Kernel header used for this reference: include/uapi/linux/virtio_ring.h +// Virtio Spec: https://docs.oasis-open.org/virtio/virtio/v1.2/csd01/virtio-v1.2-csd01.html +// +const VRING_DESC_ELEMENT_SIZE: usize = 16; +const VRING_AVAIL_ELEMENT_SIZE: usize = 2; +const VRING_USED_ELEMENT_SIZE: usize = 8; +#[derive(Debug)] +pub enum VringType { + Desc, + Avail, + Used, +} + +pub fn get_vring_size(t: VringType, queue_size: u16) -> u64 { + let (length_except_ring, element_size) = match t { + VringType::Desc => (0, VRING_DESC_ELEMENT_SIZE), + VringType::Avail => (6, VRING_AVAIL_ELEMENT_SIZE), + VringType::Used => (6, VRING_USED_ELEMENT_SIZE), + }; + (length_except_ring + element_size * queue_size as usize) as u64 +} + +/// Contains the data for reading and writing the common configuration structure of a virtio PCI +/// device. +/// +/// * Registers: +/// +/// ** About the whole device. +/// le32 device_feature_select; // 0x00 // read-write +/// le32 device_feature; // 0x04 // read-only for driver +/// le32 driver_feature_select; // 0x08 // read-write +/// le32 driver_feature; // 0x0C // read-write +/// le16 msix_config; // 0x10 // read-write +/// le16 num_queues; // 0x12 // read-only for driver +/// u8 device_status; // 0x14 // read-write (driver_status) +/// u8 config_generation; // 0x15 // read-only for driver +/// +/// ** About a specific virtqueue. 
+/// le16 queue_select; // 0x16 // read-write +/// le16 queue_size; // 0x18 // read-write, power of 2, or 0. +/// le16 queue_msix_vector; // 0x1A // read-write +/// le16 queue_enable; // 0x1C // read-write (Ready) +/// le16 queue_notify_off; // 0x1E // read-only for driver +/// le64 queue_desc; // 0x20 // read-write +/// le64 queue_avail; // 0x28 // read-write +/// le64 queue_used; // 0x30 // read-write +#[derive(Debug)] +pub struct VirtioPciCommonConfig { + pub driver_status: u8, + pub config_generation: u8, + pub device_feature_select: u32, + pub driver_feature_select: u32, + pub queue_select: u16, + pub msix_config: Arc, + pub msix_queues: Arc>>, +} + +impl VirtioPciCommonConfig { + pub fn new(state: VirtioPciCommonConfigState) -> Self { + VirtioPciCommonConfig { + driver_status: state.driver_status, + config_generation: state.config_generation, + device_feature_select: state.device_feature_select, + driver_feature_select: state.driver_feature_select, + queue_select: state.queue_select, + msix_config: Arc::new(AtomicU16::new(state.msix_config)), + msix_queues: Arc::new(Mutex::new(state.msix_queues)), + } + } + + pub fn state(&self) -> VirtioPciCommonConfigState { + VirtioPciCommonConfigState { + driver_status: self.driver_status, + config_generation: self.config_generation, + device_feature_select: self.device_feature_select, + driver_feature_select: self.driver_feature_select, + queue_select: self.queue_select, + msix_config: self.msix_config.load(Ordering::Acquire), + msix_queues: self.msix_queues.lock().unwrap().clone(), + } + } + + pub fn read(&mut self, offset: u64, data: &mut [u8], device: Arc>) { + assert!(data.len() <= 8); + + match data.len() { + 1 => { + let v = self.read_common_config_byte(offset); + data[0] = v; + } + 2 => { + let v = self.read_common_config_word(offset, device.lock().unwrap().queues()); + LittleEndian::write_u16(data, v); + } + 4 => { + let v = self.read_common_config_dword(offset, device); + LittleEndian::write_u32(data, v); + } + 
8 => { + let v = self.read_common_config_qword(offset); + LittleEndian::write_u64(data, v); + } + _ => error!("invalid data length for virtio read: len {}", data.len()), + } + } + + pub fn write(&mut self, offset: u64, data: &[u8], device: Arc>) { + assert!(data.len() <= 8); + + match data.len() { + 1 => self.write_common_config_byte(offset, data[0]), + 2 => self.write_common_config_word( + offset, + LittleEndian::read_u16(data), + device.lock().unwrap().queues_mut(), + ), + 4 => self.write_common_config_dword(offset, LittleEndian::read_u32(data), device), + 8 => self.write_common_config_qword( + offset, + LittleEndian::read_u64(data), + device.lock().unwrap().queues_mut(), + ), + _ => error!("invalid data length for virtio write: len {}", data.len()), + } + } + + fn read_common_config_byte(&self, offset: u64) -> u8 { + debug!("read_common_config_byte: offset 0x{:x}", offset); + // The driver is only allowed to do aligned, properly sized access. + match offset { + 0x14 => self.driver_status, + 0x15 => self.config_generation, + _ => { + warn!("invalid virtio config byte read: 0x{:x}", offset); + 0 + } + } + } + + fn write_common_config_byte(&mut self, offset: u64, value: u8) { + debug!("write_common_config_byte: offset 0x{offset:x}: {value:x}"); + match offset { + 0x14 => self.driver_status = value, + _ => { + warn!("invalid virtio config byte write: 0x{:x}", offset); + } + } + } + + fn read_common_config_word(&self, offset: u64, queues: &[Queue]) -> u16 { + debug!("read_common_config_word: offset 0x{:x}", offset); + match offset { + 0x10 => self.msix_config.load(Ordering::Acquire), + 0x12 => queues.len().try_into().unwrap(), // num_queues + 0x16 => self.queue_select, + 0x18 => self.with_queue(queues, |q| q.size).unwrap_or(0), + 0x1a => self.msix_queues.lock().unwrap()[self.queue_select as usize], + 0x1c => u16::from(self.with_queue(queues, |q| q.ready).unwrap_or(false)), + 0x1e => self.queue_select, // notify_off + _ => { + warn!("invalid virtio register word read: 
0x{:x}", offset); + 0 + } + } + } + + fn write_common_config_word(&mut self, offset: u64, value: u16, queues: &mut [Queue]) { + debug!("write_common_config_word: offset 0x{:x}", offset); + match offset { + 0x10 => self.msix_config.store(value, Ordering::Release), + 0x16 => self.queue_select = value, + 0x18 => self.with_queue_mut(queues, |q| q.size = value), + 0x1a => self.msix_queues.lock().unwrap()[self.queue_select as usize] = value, + 0x1c => self.with_queue_mut(queues, |q| { + q.ready = value == 1; + }), + _ => { + warn!("invalid virtio register word write: 0x{:x}", offset); + } + } + } + + fn read_common_config_dword(&self, offset: u64, device: Arc>) -> u32 { + debug!("read_common_config_dword: offset 0x{:x}", offset); + match offset { + 0x00 => self.device_feature_select, + 0x04 => { + let locked_device = device.lock().unwrap(); + // Only 64 bits of features (2 pages) are defined for now, so limit + // device_feature_select to avoid shifting by 64 or more bits. + if self.device_feature_select < 2 { + ((locked_device.avail_features() >> (self.device_feature_select * 32)) + & 0xffff_ffff) as u32 + } else { + 0 + } + } + 0x08 => self.driver_feature_select, + _ => { + warn!("invalid virtio register dword read: 0x{:x}", offset); + 0 + } + } + } + + fn write_common_config_dword( + &mut self, + offset: u64, + value: u32, + device: Arc>, + ) { + debug!("write_common_config_dword: offset 0x{:x}", offset); + fn hi(v: &mut GuestAddress, x: u32) { + *v = (*v & 0xffff_ffff) | (u64::from(x) << 32) + } + + fn lo(v: &mut GuestAddress, x: u32) { + *v = (*v & !0xffff_ffff) | u64::from(x) + } + + let mut locked_device = device.lock().unwrap(); + + match offset { + 0x00 => self.device_feature_select = value, + 0x08 => self.driver_feature_select = value, + 0x0c => locked_device.ack_features_by_page(self.driver_feature_select, value), + 0x20 => self.with_queue_mut(locked_device.queues_mut(), |q| { + lo(&mut q.desc_table_address, value) + }), + 0x24 => 
self.with_queue_mut(locked_device.queues_mut(), |q| { + hi(&mut q.desc_table_address, value) + }), + 0x28 => self.with_queue_mut(locked_device.queues_mut(), |q| { + lo(&mut q.avail_ring_address, value) + }), + 0x2c => self.with_queue_mut(locked_device.queues_mut(), |q| { + hi(&mut q.avail_ring_address, value) + }), + 0x30 => self.with_queue_mut(locked_device.queues_mut(), |q| { + lo(&mut q.used_ring_address, value) + }), + 0x34 => self.with_queue_mut(locked_device.queues_mut(), |q| { + hi(&mut q.used_ring_address, value) + }), + _ => { + warn!("invalid virtio register dword write: 0x{:x}", offset); + } + } + } + + fn read_common_config_qword(&self, _offset: u64) -> u64 { + debug!("read_common_config_qword: offset 0x{:x}", _offset); + 0 // Assume the guest has no reason to read write-only registers. + } + + fn write_common_config_qword(&mut self, offset: u64, value: u64, queues: &mut [Queue]) { + debug!("write_common_config_qword: offset 0x{:x}", offset); + + let low = Some((value & 0xffff_ffff) as u32); + let high = Some((value >> 32) as u32); + + match offset { + 0x20 => self.with_queue_mut(queues, |q| q.desc_table_address.0 = value), + 0x28 => self.with_queue_mut(queues, |q| q.avail_ring_address.0 = value), + 0x30 => self.with_queue_mut(queues, |q| q.used_ring_address.0 = value), + _ => { + warn!("invalid virtio register qword write: 0x{:x}", offset); + } + } + } + + fn with_queue(&self, queues: &[Queue], f: F) -> Option + where + F: FnOnce(&Queue) -> U, + { + queues.get(self.queue_select as usize).map(f) + } + + fn with_queue_mut(&self, queues: &mut [Queue], f: F) { + if let Some(queue) = queues.get_mut(self.queue_select as usize) { + f(queue); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::devices::virtio::transport::mmio::tests::DummyDevice; + + #[test] + fn write_base_regs() { + let mut regs = VirtioPciCommonConfig { + driver_status: 0xaa, + config_generation: 0x55, + device_feature_select: 0x0, + driver_feature_select: 0x0, + 
queue_select: 0xff, + msix_config: Arc::new(AtomicU16::new(0)), + msix_queues: Arc::new(Mutex::new(vec![0; 3])), + }; + + let dev = Arc::new(Mutex::new(DummyDevice::new())); + // Can set all bits of driver_status. + regs.write(0x14, &[0x55], dev.clone()); + let mut read_back = vec![0x00]; + regs.read(0x14, &mut read_back, dev.clone()); + assert_eq!(read_back[0], 0x55); + + // The config generation register is read only. + regs.write(0x15, &[0xaa], dev.clone()); + let mut read_back = vec![0x00]; + regs.read(0x15, &mut read_back, dev.clone()); + assert_eq!(read_back[0], 0x55); + + // Device features is read-only and passed through from the device. + regs.write(0x04, &[0, 0, 0, 0], dev.clone()); + let mut read_back = vec![0, 0, 0, 0]; + regs.read(0x04, &mut read_back, dev.clone()); + assert_eq!(LittleEndian::read_u32(&read_back), 0u32); + + // Feature select registers are read/write. + regs.write(0x00, &[1, 2, 3, 4], dev.clone()); + let mut read_back = vec![0, 0, 0, 0]; + regs.read(0x00, &mut read_back, dev.clone()); + assert_eq!(LittleEndian::read_u32(&read_back), 0x0403_0201); + regs.write(0x08, &[1, 2, 3, 4], dev.clone()); + let mut read_back = vec![0, 0, 0, 0]; + regs.read(0x08, &mut read_back, dev.clone()); + assert_eq!(LittleEndian::read_u32(&read_back), 0x0403_0201); + + // 'queue_select' can be read and written. + regs.write(0x16, &[0xaa, 0x55], dev.clone()); + let mut read_back = vec![0x00, 0x00]; + regs.read(0x16, &mut read_back, dev); + assert_eq!(read_back[0], 0xaa); + assert_eq!(read_back[1], 0x55); + } +} diff --git a/src/vmm/src/devices/virtio/transport/pci/device.rs b/src/vmm/src/devices/virtio/transport/pci/device.rs new file mode 100644 index 00000000000..c730be7f3bc --- /dev/null +++ b/src/vmm/src/devices/virtio/transport/pci/device.rs @@ -0,0 +1,1345 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// Copyright 2018 The Chromium OS Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +use std::any::Any; +use std::cmp; +use std::collections::HashMap; +use std::fmt::{Debug, Formatter}; +use std::io::Write; +use std::sync::atomic::{AtomicBool, AtomicU16, AtomicU32, AtomicUsize, Ordering}; +use std::sync::{Arc, Barrier, Mutex}; + +use anyhow::anyhow; +use pci::{ + BarReprogrammingParams, MsixCap, MsixConfig, MsixConfigState, PciBarConfiguration, + PciBarRegionType, PciBdf, PciCapability, PciCapabilityId, PciClassCode, PciConfiguration, + PciConfigurationState, PciDevice, PciDeviceError, PciHeaderType, PciMassStorageSubclass, + PciNetworkControllerSubclass, PciSubclass, +}; +use serde::{Deserialize, Serialize}; +use thiserror::Error; +use vm_allocator::{AddressAllocator, AllocPolicy, RangeInclusive}; +use vm_device::dma_mapping::ExternalDmaMapping; +use vm_device::interrupt::{InterruptIndex, InterruptSourceGroup, MsiIrqGroupConfig}; +use vm_device::{BusDevice, PciBarType, Resource}; +use vm_memory::{Address, ByteValued, GuestAddress, Le32}; +use vmm_sys_util::eventfd::EventFd; + +use crate::devices::virtio::device::VirtioDevice; +use crate::devices::virtio::queue::Queue; +use crate::devices::virtio::transport::pci::common_config::{ + VirtioPciCommonConfig, VirtioPciCommonConfigState, +}; +use crate::devices::virtio::transport::{VirtioInterrupt, VirtioInterruptType}; +use crate::devices::virtio::{TYPE_BLOCK, TYPE_NET}; +use crate::logger::{debug, error}; +use crate::snapshot::Persist; +use crate::utils::u64_to_usize; +use crate::vstate::memory::GuestMemoryMmap; +use crate::vstate::resources::ResourceAllocator; +use crate::vstate::vm::{InterruptError, MsiVectorGroup}; + +const DEVICE_INIT: u8 = 0x00; +const DEVICE_ACKNOWLEDGE: u8 = 0x01; +const DEVICE_DRIVER: u8 = 0x02; +const DEVICE_DRIVER_OK: u8 = 0x04; +const 
DEVICE_FEATURES_OK: u8 = 0x08; +const DEVICE_FAILED: u8 = 0x80; + +const VIRTIO_F_RING_INDIRECT_DESC: u32 = 28; +const VIRTIO_F_RING_EVENT_IDX: u32 = 29; +const VIRTIO_F_VERSION_1: u32 = 32; +const VIRTIO_F_IOMMU_PLATFORM: u32 = 33; +const VIRTIO_F_IN_ORDER: u32 = 35; +const VIRTIO_F_ORDER_PLATFORM: u32 = 36; +#[allow(dead_code)] +const VIRTIO_F_SR_IOV: u32 = 37; +const VIRTIO_F_NOTIFICATION_DATA: u32 = 38; + +/// Vector value used to disable MSI for a queue. +const VIRTQ_MSI_NO_VECTOR: u16 = 0xffff; + +enum PciCapabilityType { + Common = 1, + Notify = 2, + Isr = 3, + Device = 4, + Pci = 5, + SharedMemory = 8, +} + +// This offset represents the 2 bytes omitted from the VirtioPciCap structure +// as they are already handled through add_capability(). These 2 bytes are the +// fields cap_vndr (1 byte) and cap_next (1 byte) defined in the virtio spec. +const VIRTIO_PCI_CAP_OFFSET: usize = 2; + +#[allow(dead_code)] +#[repr(C, packed)] +#[derive(Debug, Clone, Copy, Default)] +struct VirtioPciCap { + cap_len: u8, // Generic PCI field: capability length + cfg_type: u8, // Identifies the structure. + pci_bar: u8, // Where to find it. + id: u8, // Multiple capabilities of the same type + padding: [u8; 2], // Pad to full dword. + offset: Le32, // Offset within bar. + length: Le32, // Length of the structure, in bytes. +} + +// SAFETY: All members are simple numbers and any value is valid. 
+unsafe impl ByteValued for VirtioPciCap {} + +impl PciCapability for VirtioPciCap { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } +} + +const VIRTIO_PCI_CAP_LEN_OFFSET: u8 = 2; + +impl VirtioPciCap { + pub fn new(cfg_type: PciCapabilityType, pci_bar: u8, offset: u32, length: u32) -> Self { + VirtioPciCap { + cap_len: u8::try_from(std::mem::size_of::()).unwrap() + + VIRTIO_PCI_CAP_LEN_OFFSET, + cfg_type: cfg_type as u8, + pci_bar, + id: 0, + padding: [0; 2], + offset: Le32::from(offset), + length: Le32::from(length), + } + } +} + +#[allow(dead_code)] +#[repr(C, packed)] +#[derive(Clone, Copy, Default)] +struct VirtioPciNotifyCap { + cap: VirtioPciCap, + notify_off_multiplier: Le32, +} +// SAFETY: All members are simple numbers and any value is valid. +unsafe impl ByteValued for VirtioPciNotifyCap {} + +impl PciCapability for VirtioPciNotifyCap { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } +} + +impl VirtioPciNotifyCap { + pub fn new( + cfg_type: PciCapabilityType, + pci_bar: u8, + offset: u32, + length: u32, + multiplier: Le32, + ) -> Self { + VirtioPciNotifyCap { + cap: VirtioPciCap { + cap_len: u8::try_from(std::mem::size_of::()).unwrap() + + VIRTIO_PCI_CAP_LEN_OFFSET, + cfg_type: cfg_type as u8, + pci_bar, + id: 0, + padding: [0; 2], + offset: Le32::from(offset), + length: Le32::from(length), + }, + notify_off_multiplier: multiplier, + } + } +} + +#[allow(dead_code)] +#[repr(C, packed)] +#[derive(Clone, Copy, Default)] +struct VirtioPciCap64 { + cap: VirtioPciCap, + offset_hi: Le32, + length_hi: Le32, +} +// SAFETY: All members are simple numbers and any value is valid. 
+unsafe impl ByteValued for VirtioPciCap64 {} + +impl PciCapability for VirtioPciCap64 { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } +} + +impl VirtioPciCap64 { + pub fn new(cfg_type: PciCapabilityType, pci_bar: u8, id: u8, offset: u64, length: u64) -> Self { + VirtioPciCap64 { + cap: VirtioPciCap { + cap_len: u8::try_from(std::mem::size_of::()).unwrap() + + VIRTIO_PCI_CAP_LEN_OFFSET, + cfg_type: cfg_type as u8, + pci_bar, + id, + padding: [0; 2], + offset: Le32::from((offset & 0xffff_ffff) as u32), + length: Le32::from((length & 0xffff_ffff) as u32), + }, + offset_hi: Le32::from((offset >> 32) as u32), + length_hi: Le32::from((length >> 32) as u32), + } + } +} + +#[allow(dead_code)] +#[repr(C, packed)] +#[derive(Debug, Clone, Copy, Default)] +struct VirtioPciCfgCap { + cap: VirtioPciCap, + pci_cfg_data: [u8; 4], +} +// SAFETY: All members are simple numbers and any value is valid. +unsafe impl ByteValued for VirtioPciCfgCap {} + +impl PciCapability for VirtioPciCfgCap { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } +} + +impl VirtioPciCfgCap { + fn new() -> Self { + VirtioPciCfgCap { + cap: VirtioPciCap::new(PciCapabilityType::Pci, 0, 0, 0), + ..Default::default() + } + } +} + +#[derive(Debug, Clone, Copy, Default)] +struct VirtioPciCfgCapInfo { + offset: usize, + cap: VirtioPciCfgCap, +} + +#[allow(dead_code)] +#[derive(Debug, Copy, Clone)] +pub enum PciVirtioSubclass { + NonTransitionalBase = 0xff, +} + +impl PciSubclass for PciVirtioSubclass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +// Allocate one bar for the structs pointed to by the capability structures. +// As per the PCI specification, because the same BAR shares MSI-X and non +// MSI-X structures, it is recommended to use 8KiB alignment for all those +// structures. 
+const COMMON_CONFIG_BAR_OFFSET: u64 = 0x0000; +const COMMON_CONFIG_SIZE: u64 = 56; +const ISR_CONFIG_BAR_OFFSET: u64 = 0x2000; +const ISR_CONFIG_SIZE: u64 = 1; +const DEVICE_CONFIG_BAR_OFFSET: u64 = 0x4000; +const DEVICE_CONFIG_SIZE: u64 = 0x1000; +const NOTIFICATION_BAR_OFFSET: u64 = 0x6000; +const NOTIFICATION_SIZE: u64 = 0x1000; +const MSIX_TABLE_BAR_OFFSET: u64 = 0x8000; +// The size is 256KiB because the table can hold up to 2048 entries, with each +// entry being 128 bits (4 DWORDS). +const MSIX_TABLE_SIZE: u64 = 0x40000; +const MSIX_PBA_BAR_OFFSET: u64 = 0x48000; +// The size is 2KiB because the Pending Bit Array has one bit per vector and it +// can support up to 2048 vectors. +const MSIX_PBA_SIZE: u64 = 0x800; +// The BAR size must be a power of 2. +const CAPABILITY_BAR_SIZE: u64 = 0x80000; +const VIRTIO_COMMON_BAR_INDEX: usize = 0; +const VIRTIO_SHM_BAR_INDEX: usize = 2; + +const NOTIFY_OFF_MULTIPLIER: u32 = 4; // A dword per notification address. + +const VIRTIO_PCI_VENDOR_ID: u16 = 0x1af4; +const VIRTIO_PCI_DEVICE_ID_BASE: u16 = 0x1040; // Add to device type to get device ID. 
+ +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct QueueState { + max_size: u16, + size: u16, + ready: bool, + desc_table: u64, + avail_ring: u64, + used_ring: u64, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VirtioPciDeviceState { + pub pci_device_bdf: PciBdf, + pub device_activated: bool, + pub interrupt_status: usize, + pub cap_pci_cfg_offset: usize, + pub cap_pci_cfg: Vec, + pub pci_configuration_state: PciConfigurationState, + pub pci_dev_state: VirtioPciCommonConfigState, + pub msix_state: MsixConfigState, + pub msi_vector_group: HashMap, + pub bar_configuration: Vec, +} + +#[derive(Debug, thiserror::Error, displaydoc::Display)] +pub enum VirtioPciDeviceError { + /// Failed creating VirtioPciDevice: {0} + CreateVirtioPciDevice(#[from] anyhow::Error), + /// Error creating MSI configuration: {0} + Msi(#[from] pci::MsixError), +} +pub type Result = std::result::Result; + +pub struct VirtioPciDevice { + id: String, + + // BDF assigned to the device + pci_device_bdf: PciBdf, + + // PCI configuration registers. + configuration: PciConfiguration, + + // virtio PCI common configuration + common_config: VirtioPciCommonConfig, + + // MSI-X config + msix_config: Option>>, + + // Number of MSI-X vectors + msix_num: u16, + + // Virtio device reference and status + device: Arc>, + device_activated: Arc, + + // PCI interrupts. + interrupt_status: Arc, + virtio_interrupt: Option>, + interrupt_source_group: Arc, + + // Guest memory + memory: GuestMemoryMmap, + + // Settings PCI BAR + settings_bar: u8, + + // Whether to use 64-bit bar location or 32-bit + use_64bit_bar: bool, + + // Add a dedicated structure to hold information about the very specific + // virtio-pci capability VIRTIO_PCI_CAP_PCI_CFG. This is needed to support + // the legacy/backward compatible mechanism of letting the guest access the + // other virtio capabilities without mapping the PCI BARs. 
This can be + // needed when the guest tries to early access the virtio configuration of + // a device. + cap_pci_cfg_info: VirtioPciCfgCapInfo, + + // Details of bar regions to free + pub bar_regions: Vec, + + // Optional DMA handler + dma_handler: Option>, +} + +impl Debug for VirtioPciDevice { + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + f.debug_struct("VirtioPciDevice") + .field("id", &self.id) + .finish() + } +} + +impl VirtioPciDevice { + /// Constructs a new PCI transport for the given virtio device. + #[allow(clippy::too_many_arguments)] + pub fn new( + id: String, + memory: GuestMemoryMmap, + device: Arc>, + msi_vectors: Arc, + pci_device_bdf: u32, + use_64bit_bar: bool, + dma_handler: Option>, + ) -> Result { + let locked_device = device.lock().unwrap(); + let msix_num = msi_vectors.num_vectors(); + let num_queues = locked_device.queues().len(); + let pci_device_id = + VIRTIO_PCI_DEVICE_ID_BASE + u16::try_from(locked_device.device_type()).unwrap(); + let msix_config = Arc::new(Mutex::new(MsixConfig::new( + msix_num, + msi_vectors.clone(), + pci_device_bdf, + None, + )?)); + + let (class, subclass) = match locked_device.device_type() { + TYPE_NET => ( + PciClassCode::NetworkController, + &PciNetworkControllerSubclass::EthernetController as &dyn PciSubclass, + ), + TYPE_BLOCK => ( + PciClassCode::MassStorage, + &PciMassStorageSubclass::MassStorage as &dyn PciSubclass, + ), + _ => ( + PciClassCode::Other, + &PciVirtioSubclass::NonTransitionalBase as &dyn PciSubclass, + ), + }; + + let configuration = PciConfiguration::new( + VIRTIO_PCI_VENDOR_ID, + pci_device_id, + 0x1, // For modern virtio-PCI devices + class, + subclass, + None, + PciHeaderType::Device, + VIRTIO_PCI_VENDOR_ID, + pci_device_id, + Some(msix_config.clone()), + None, + ); + + let common_config = VirtioPciCommonConfig::new(VirtioPciCommonConfigState { + driver_status: 0, + config_generation: 0, + device_feature_select: 0, + driver_feature_select: 0, + queue_select: 0, + 
msix_config: VIRTQ_MSI_NO_VECTOR, + msix_queues: vec![VIRTQ_MSI_NO_VECTOR; num_queues], + }); + let cap_pci_cfg_info = VirtioPciCfgCapInfo::default(); + + // Dropping the MutexGuard to unlock the VirtioDevice. This is required + // in the context of a restore given the device might require some + // activation, meaning it will require locking. Dropping the lock + // prevents from a subtle deadlock. + std::mem::drop(locked_device); + + let mut virtio_pci_device = VirtioPciDevice { + id, + pci_device_bdf: pci_device_bdf.into(), + configuration, + common_config, + msix_config: Some(msix_config), + msix_num, + device, + device_activated: Arc::new(AtomicBool::new(false)), + interrupt_status: Arc::new(AtomicUsize::new(0)), + virtio_interrupt: None, + memory, + settings_bar: 0, + use_64bit_bar, + interrupt_source_group: msi_vectors, + cap_pci_cfg_info, + bar_regions: vec![], + dma_handler, + }; + + if let Some(msix_config) = &virtio_pci_device.msix_config { + virtio_pci_device.virtio_interrupt = Some(Arc::new(VirtioInterruptMsix::new( + msix_config.clone(), + virtio_pci_device.common_config.msix_config.clone(), + virtio_pci_device.common_config.msix_queues.clone(), + virtio_pci_device.interrupt_source_group.clone(), + ))); + } + + Ok(virtio_pci_device) + } + + /// Rebuilds the PCI transport for `device` from a previously saved `VirtioPciDeviceState`. + /// + /// The MSI-X configuration, PCI configuration space, common configuration, BAR list and + /// `VIRTIO_PCI_CAP_PCI_CFG` capability are taken from `state`. If the saved device was + /// activated, the underlying virtio device is activated again before returning. + /// + /// Returns an error if the MSI-X configuration cannot be created, if the saved + /// `cap_pci_cfg` bytes cannot be read back as a capability, or if re-activation fails. + pub fn new_from_state( + id: String, + memory: GuestMemoryMmap, + device: Arc>, + msi_vectors: Arc, + use_64bit_bar: bool, + state: VirtioPciDeviceState, + ) -> Result { + let locked_device = device.lock().unwrap(); + let msix_num = msi_vectors.num_vectors(); + let pci_device_id = + VIRTIO_PCI_DEVICE_ID_BASE + u16::try_from(locked_device.device_type()).unwrap(); + + let msix_config = Arc::new(Mutex::new(MsixConfig::new( + msix_num, + msi_vectors.clone(), + state.pci_device_bdf.into(), + Some(state.msix_state), + )?)); + + let (class, subclass) = match locked_device.device_type() { + TYPE_NET => ( + PciClassCode::NetworkController, +
&PciNetworkControllerSubclass::EthernetController as &dyn PciSubclass, + ), + TYPE_BLOCK => ( + PciClassCode::MassStorage, + &PciMassStorageSubclass::MassStorage as &dyn PciSubclass, + ), + _ => ( + PciClassCode::Other, + &PciVirtioSubclass::NonTransitionalBase as &dyn PciSubclass, + ), + }; + + let configuration = PciConfiguration::new( + VIRTIO_PCI_VENDOR_ID, + pci_device_id, + 0x1, // For modern virtio-PCI devices + class, + subclass, + None, + PciHeaderType::Device, + VIRTIO_PCI_VENDOR_ID, + pci_device_id, + Some(msix_config.clone()), + Some(state.pci_configuration_state), + ); + + let common_config = VirtioPciCommonConfig::new(state.pci_dev_state); + let cap_pci_cfg_info = VirtioPciCfgCapInfo { + offset: state.cap_pci_cfg_offset, + // The bytes come from saved state, so report a malformed capability as an + // error instead of panicking. + cap: *VirtioPciCfgCap::from_slice(&state.cap_pci_cfg) + .ok_or_else(|| anyhow!("Invalid cap_pci_cfg in saved state"))?, + }; + + // Dropping the MutexGuard to unlock the VirtioDevice. This is required + // in the context of a restore given the device might require some + // activation, meaning it will require locking. Dropping the lock + // prevents from a subtle deadlock. 
+ std::mem::drop(locked_device); + + let mut virtio_pci_device = VirtioPciDevice { + id, + pci_device_bdf: state.pci_device_bdf, + configuration, + common_config, + msix_config: Some(msix_config), + msix_num, + device, + device_activated: Arc::new(AtomicBool::new(state.device_activated)), + interrupt_status: Arc::new(AtomicUsize::new(state.interrupt_status)), + virtio_interrupt: None, + memory: memory.clone(), + settings_bar: 0, + use_64bit_bar, + interrupt_source_group: msi_vectors, + cap_pci_cfg_info, + bar_regions: state.bar_configuration, + dma_handler: None, + }; + + if let Some(msix_config) = &virtio_pci_device.msix_config { + virtio_pci_device.virtio_interrupt = Some(Arc::new(VirtioInterruptMsix::new( + msix_config.clone(), + virtio_pci_device.common_config.msix_config.clone(), + virtio_pci_device.common_config.msix_queues.clone(), + virtio_pci_device.interrupt_source_group.clone(), + ))); + } + + if state.device_activated { + virtio_pci_device + .device + .lock() + .expect("Poisoned lock") + .activate( + memory, + virtio_pci_device.virtio_interrupt.as_ref().unwrap().clone(), + ) + // A failed re-activation must not be dropped silently: `device_activated` was + // restored as true above, so the transport would report an active device. + .map_err(|err| anyhow!("Failed to activate restored device: {err:?}"))?; + } + + Ok(virtio_pci_device) + } + + fn is_driver_ready(&self) -> bool { + let ready_bits = + (DEVICE_ACKNOWLEDGE | DEVICE_DRIVER | DEVICE_DRIVER_OK | DEVICE_FEATURES_OK); + self.common_config.driver_status == ready_bits + && self.common_config.driver_status & DEVICE_FAILED == 0 + } + + /// Determines if the driver has requested the device (re)init / reset itself + fn is_driver_init(&self) -> bool { + self.common_config.driver_status == DEVICE_INIT + } + + pub fn config_bar_addr(&self) -> u64 { + self.configuration.get_bar_addr(self.settings_bar as usize) + } + + fn add_pci_capabilities( + &mut self, + settings_bar: u8, + ) -> std::result::Result<(), PciDeviceError> { + // Add pointers to the different configuration structures from the PCI capabilities. 
+ let common_cap = VirtioPciCap::new( + PciCapabilityType::Common, + settings_bar, + COMMON_CONFIG_BAR_OFFSET.try_into().unwrap(), + COMMON_CONFIG_SIZE.try_into().unwrap(), + ); + self.configuration + .add_capability(&common_cap) + .map_err(PciDeviceError::CapabilitiesSetup)?; + + let isr_cap = VirtioPciCap::new( + PciCapabilityType::Isr, + settings_bar, + ISR_CONFIG_BAR_OFFSET.try_into().unwrap(), + ISR_CONFIG_SIZE.try_into().unwrap(), + ); + self.configuration + .add_capability(&isr_cap) + .map_err(PciDeviceError::CapabilitiesSetup)?; + + // TODO(dgreid) - set based on device's configuration size? + let device_cap = VirtioPciCap::new( + PciCapabilityType::Device, + settings_bar, + DEVICE_CONFIG_BAR_OFFSET.try_into().unwrap(), + DEVICE_CONFIG_SIZE.try_into().unwrap(), + ); + self.configuration + .add_capability(&device_cap) + .map_err(PciDeviceError::CapabilitiesSetup)?; + + let notify_cap = VirtioPciNotifyCap::new( + PciCapabilityType::Notify, + settings_bar, + NOTIFICATION_BAR_OFFSET.try_into().unwrap(), + NOTIFICATION_SIZE.try_into().unwrap(), + Le32::from(NOTIFY_OFF_MULTIPLIER), + ); + self.configuration + .add_capability(¬ify_cap) + .map_err(PciDeviceError::CapabilitiesSetup)?; + + let configuration_cap = VirtioPciCfgCap::new(); + self.cap_pci_cfg_info.offset = self + .configuration + .add_capability(&configuration_cap) + .map_err(PciDeviceError::CapabilitiesSetup)? 
+ + VIRTIO_PCI_CAP_OFFSET; + self.cap_pci_cfg_info.cap = configuration_cap; + + if self.msix_config.is_some() { + let msix_cap = MsixCap::new( + settings_bar, + self.msix_num, + MSIX_TABLE_BAR_OFFSET.try_into().unwrap(), + settings_bar, + MSIX_PBA_BAR_OFFSET.try_into().unwrap(), + ); + self.configuration + .add_capability(&msix_cap) + .map_err(PciDeviceError::CapabilitiesSetup)?; + } + + self.settings_bar = settings_bar; + Ok(()) + } + + fn read_cap_pci_cfg(&mut self, offset: usize, mut data: &mut [u8]) { + let cap_slice = self.cap_pci_cfg_info.cap.as_slice(); + let data_len = data.len(); + let cap_len = cap_slice.len(); + if offset + data_len > cap_len { + error!("Failed to read cap_pci_cfg from config space"); + return; + } + + if offset < std::mem::size_of::() { + if let Some(end) = offset.checked_add(data_len) { + // This write can't fail, offset and end are checked against config_len. + data.write_all(&cap_slice[offset..cmp::min(end, cap_len)]) + .unwrap(); + } + } else { + let bar_offset: u32 = + // SAFETY: we know self.cap_pci_cfg_info.cap.cap.offset is 32bits long. + unsafe { std::mem::transmute(self.cap_pci_cfg_info.cap.cap.offset) }; + self.read_bar(0, bar_offset as u64, data) + } + } + + fn write_cap_pci_cfg(&mut self, offset: usize, data: &[u8]) -> Option> { + let cap_slice = self.cap_pci_cfg_info.cap.as_mut_slice(); + let data_len = data.len(); + let cap_len = cap_slice.len(); + if offset + data_len > cap_len { + error!("Failed to write cap_pci_cfg to config space"); + return None; + } + + if offset < std::mem::size_of::() { + let (_, right) = cap_slice.split_at_mut(offset); + right[..data_len].copy_from_slice(data); + None + } else { + let bar_offset: u32 = + // SAFETY: we know self.cap_pci_cfg_info.cap.cap.offset is 32bits long. 
+ unsafe { std::mem::transmute(self.cap_pci_cfg_info.cap.cap.offset) }; + self.write_bar(0, bar_offset as u64, data) + } + } + + pub fn virtio_device(&self) -> Arc> { + self.device.clone() + } + + fn needs_activation(&self) -> bool { + !self.device_activated.load(Ordering::SeqCst) && self.is_driver_ready() + } + + pub fn dma_handler(&self) -> Option<&Arc> { + self.dma_handler.as_ref() + } + + pub fn state(&self) -> VirtioPciDeviceState { + VirtioPciDeviceState { + pci_device_bdf: self.pci_device_bdf, + device_activated: self.device_activated.load(Ordering::Acquire), + interrupt_status: self.interrupt_status.load(Ordering::Acquire), + cap_pci_cfg_offset: self.cap_pci_cfg_info.offset, + cap_pci_cfg: self.cap_pci_cfg_info.cap.bytes().to_vec(), + pci_configuration_state: self.configuration.state(), + pci_dev_state: self.common_config.state(), + msix_state: self + .msix_config + .as_ref() + .unwrap() + .lock() + .expect("Poisoned lock") + .state(), + msi_vector_group: self.interrupt_source_group.save(), + bar_configuration: self.bar_regions.clone(), + } + } +} + +pub struct VirtioInterruptMsix { + msix_config: Arc>, + config_vector: Arc, + queues_vectors: Arc>>, + interrupt_source_group: Arc, +} + +impl std::fmt::Debug for VirtioInterruptMsix { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("VirtioInterruptMsix") + .field("msix_config", &self.msix_config) + .field("config_vector", &self.config_vector) + .field("queues_vectors", &self.queues_vectors) + .finish() + } +} + +impl VirtioInterruptMsix { + pub fn new( + msix_config: Arc>, + config_vector: Arc, + queues_vectors: Arc>>, + interrupt_source_group: Arc, + ) -> Self { + VirtioInterruptMsix { + msix_config, + config_vector, + queues_vectors, + interrupt_source_group, + } + } +} + +impl VirtioInterrupt for VirtioInterruptMsix { + fn trigger(&self, int_type: VirtioInterruptType) -> std::result::Result<(), std::io::Error> { + let vector = match int_type { + VirtioInterruptType::Config => 
self.config_vector.load(Ordering::Acquire), + VirtioInterruptType::Queue(queue_index) => { + self.queues_vectors.lock().unwrap()[queue_index as usize] + } + }; + + if vector == VIRTQ_MSI_NO_VECTOR { + return Ok(()); + } + + let config = &mut self.msix_config.lock().unwrap(); + let entry = &config.table_entries[vector as usize]; + // In case the vector control register associated with the entry + // has its first bit set, this means the vector is masked and the + // device should not inject the interrupt. + // Instead, the Pending Bit Array table is updated to reflect there + // is a pending interrupt for this specific vector. + if config.masked() || entry.masked() { + config.set_pba_bit(vector, false); + return Ok(()); + } + + self.interrupt_source_group + .trigger(vector as InterruptIndex) + } + + fn notifier(&self, int_type: VirtioInterruptType) -> Option<&EventFd> { + let vector = match int_type { + VirtioInterruptType::Config => self.config_vector.load(Ordering::Acquire), + VirtioInterruptType::Queue(queue_index) => { + self.queues_vectors.lock().unwrap()[queue_index as usize] + } + }; + + self.interrupt_source_group + .notifier(vector as InterruptIndex) + } + + fn status(&self) -> Arc { + Arc::new(AtomicU32::new(0)) + } + + #[cfg(test)] + fn has_pending_interrupt(&self, interrupt_type: VirtioInterruptType) -> bool { + false + } +} + +impl PciDevice for VirtioPciDevice { + fn write_config_register( + &mut self, + reg_idx: usize, + offset: u64, + data: &[u8], + ) -> Option> { + // Handle the special case where the capability VIRTIO_PCI_CAP_PCI_CFG + // is accessed. This capability has a special meaning as it allows the + // guest to access other capabilities without mapping the PCI BAR. 
+ let base = reg_idx * 4; + if base + u64_to_usize(offset) >= self.cap_pci_cfg_info.offset + && base + u64_to_usize(offset) + data.len() + <= self.cap_pci_cfg_info.offset + self.cap_pci_cfg_info.cap.bytes().len() + { + let offset = base + u64_to_usize(offset) - self.cap_pci_cfg_info.offset; + self.write_cap_pci_cfg(offset, data) + } else { + self.configuration + .write_config_register(reg_idx, offset, data); + None + } + } + + fn read_config_register(&mut self, reg_idx: usize) -> u32 { + // Handle the special case where the capability VIRTIO_PCI_CAP_PCI_CFG + // is accessed. This capability has a special meaning as it allows the + // guest to access other capabilities without mapping the PCI BAR. + let base = reg_idx * 4; + if base >= self.cap_pci_cfg_info.offset + && base + 4 <= self.cap_pci_cfg_info.offset + self.cap_pci_cfg_info.cap.bytes().len() + { + let offset = base - self.cap_pci_cfg_info.offset; + let mut data = [0u8; 4]; + self.read_cap_pci_cfg(offset, &mut data); + u32::from_le_bytes(data) + } else { + self.configuration.read_reg(reg_idx) + } + } + + fn detect_bar_reprogramming( + &mut self, + reg_idx: usize, + data: &[u8], + ) -> Option { + self.configuration.detect_bar_reprogramming(reg_idx, data) + } + + fn allocate_bars( + &mut self, + mmio32_allocator: &mut AddressAllocator, + mmio64_allocator: &mut AddressAllocator, + _resources: Option>, + ) -> std::result::Result, PciDeviceError> { + let mut bars = Vec::new(); + let device_clone = self.device.clone(); + let device = device_clone.lock().unwrap(); + + let mut use_64bit_bar = self.use_64bit_bar; + + // Allocate the virtio-pci capability BAR. 
+ // See http://docs.oasis-open.org/virtio/virtio/v1.0/cs04/virtio-v1.0-cs04.html#x1-740004 + let (virtio_pci_bar_addr, region_type) = if use_64bit_bar { + let region_type = PciBarRegionType::Memory64BitRegion; + let addr = mmio64_allocator + .allocate( + CAPABILITY_BAR_SIZE, + CAPABILITY_BAR_SIZE, + AllocPolicy::FirstMatch, + ) + .unwrap() + .start(); + (addr, region_type) + } else { + let region_type = PciBarRegionType::Memory32BitRegion; + let addr = mmio32_allocator + .allocate( + CAPABILITY_BAR_SIZE, + CAPABILITY_BAR_SIZE, + AllocPolicy::FirstMatch, + ) + .unwrap() + .start(); + (addr, region_type) + }; + + let bar = PciBarConfiguration::default() + .set_index(VIRTIO_COMMON_BAR_INDEX) + .set_address(virtio_pci_bar_addr) + .set_size(CAPABILITY_BAR_SIZE) + .set_region_type(region_type); + + // The creation of the PCI BAR and its associated capabilities must + // happen only during the creation of a brand new VM. When a VM is + // restored from a known state, the BARs are already created with the + // right content, therefore we don't need to go through this codepath. + self.configuration + .add_pci_bar(&bar) + .map_err(|e| PciDeviceError::IoRegistrationFailed(virtio_pci_bar_addr, e))?; + + // Once the BARs are allocated, the capabilities can be added to the PCI configuration. + self.add_pci_capabilities(VIRTIO_COMMON_BAR_INDEX.try_into().unwrap())?; + + bars.push(bar); + + self.bar_regions.clone_from(&bars); + + Ok(bars) + } + + fn free_bars( + &mut self, + mmio32_allocator: &mut AddressAllocator, + mmio64_allocator: &mut AddressAllocator, + ) -> std::result::Result<(), PciDeviceError> { + for bar in self.bar_regions.drain(..) 
{ + let range = RangeInclusive::new(bar.addr(), bar.addr() + bar.size()).unwrap(); + match bar.region_type() { + PciBarRegionType::Memory32BitRegion => { + mmio32_allocator.free(&range); + } + PciBarRegionType::Memory64BitRegion => { + mmio64_allocator.free(&range); + } + _ => error!("Unexpected PCI bar type"), + } + } + Ok(()) + } + + fn move_bar( + &mut self, + old_base: u64, + new_base: u64, + ) -> std::result::Result<(), std::io::Error> { + // We only update our idea of the bar in order to support free_bars() above. + // The majority of the reallocation is done inside DeviceManager. + for bar in self.bar_regions.iter_mut() { + if bar.addr() == old_base { + *bar = bar.set_address(new_base); + } + } + + Ok(()) + } + + fn read_bar(&mut self, _base: u64, offset: u64, data: &mut [u8]) { + match offset { + o if o < COMMON_CONFIG_BAR_OFFSET + COMMON_CONFIG_SIZE => { + self.common_config + .read(o - COMMON_CONFIG_BAR_OFFSET, data, self.device.clone()) + } + o if (ISR_CONFIG_BAR_OFFSET..ISR_CONFIG_BAR_OFFSET + ISR_CONFIG_SIZE).contains(&o) => { + if let Some(v) = data.get_mut(0) { + // Reading this register resets it to 0. + *v = self + .interrupt_status + .swap(0, Ordering::AcqRel) + .try_into() + .unwrap(); + } + } + o if (DEVICE_CONFIG_BAR_OFFSET..DEVICE_CONFIG_BAR_OFFSET + DEVICE_CONFIG_SIZE) + .contains(&o) => + { + let device = self.device.lock().unwrap(); + device.read_config(o - DEVICE_CONFIG_BAR_OFFSET, data); + } + o if (NOTIFICATION_BAR_OFFSET..NOTIFICATION_BAR_OFFSET + NOTIFICATION_SIZE) + .contains(&o) => + { + // Handled with ioeventfds. 
+ } + o if (MSIX_TABLE_BAR_OFFSET..MSIX_TABLE_BAR_OFFSET + MSIX_TABLE_SIZE).contains(&o) => { + if let Some(msix_config) = &self.msix_config { + msix_config + .lock() + .unwrap() + .read_table(o - MSIX_TABLE_BAR_OFFSET, data); + } + } + o if (MSIX_PBA_BAR_OFFSET..MSIX_PBA_BAR_OFFSET + MSIX_PBA_SIZE).contains(&o) => { + if let Some(msix_config) = &self.msix_config { + msix_config + .lock() + .unwrap() + .read_pba(o - MSIX_PBA_BAR_OFFSET, data); + } + } + _ => (), + } + } + + /// Handles a guest write of `data` at `offset` within the capability BAR. + /// + /// The write is dispatched by offset to the common configuration, ISR status, device + /// configuration, notification, MSI-X table or MSI-X PBA region. Afterwards the virtio + /// device is activated if `needs_activation()` holds, and reset if it is activated and + /// the driver status is back to `DEVICE_INIT`. Always returns `None`. + fn write_bar(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { + match offset { + o if o < COMMON_CONFIG_BAR_OFFSET + COMMON_CONFIG_SIZE => { + self.common_config + .write(o - COMMON_CONFIG_BAR_OFFSET, data, self.device.clone()) + } + o if (ISR_CONFIG_BAR_OFFSET..ISR_CONFIG_BAR_OFFSET + ISR_CONFIG_SIZE).contains(&o) => { + if let Some(v) = data.first() { + self.interrupt_status + .fetch_and(!(*v as usize), Ordering::AcqRel); + } + } + o if (DEVICE_CONFIG_BAR_OFFSET..DEVICE_CONFIG_BAR_OFFSET + DEVICE_CONFIG_SIZE) + .contains(&o) => + { + let mut device = self.device.lock().unwrap(); + device.write_config(o - DEVICE_CONFIG_BAR_OFFSET, data); + } + o if (NOTIFICATION_BAR_OFFSET..NOTIFICATION_BAR_OFFSET + NOTIFICATION_SIZE) + .contains(&o) => + { + // Handled with ioeventfds. 
+ error!("Unexpected write to notification BAR: offset = 0x{:x}", o); + } + o if (MSIX_TABLE_BAR_OFFSET..MSIX_TABLE_BAR_OFFSET + MSIX_TABLE_SIZE).contains(&o) => { + if let Some(msix_config) = &self.msix_config { + msix_config + .lock() + .unwrap() + .write_table(o - MSIX_TABLE_BAR_OFFSET, data); + } + } + o if (MSIX_PBA_BAR_OFFSET..MSIX_PBA_BAR_OFFSET + MSIX_PBA_SIZE).contains(&o) => { + if let Some(msix_config) = &self.msix_config { + msix_config + .lock() + .unwrap() + .write_pba(o - MSIX_PBA_BAR_OFFSET, data); + } + } + _ => (), + }; + + // Try and activate the device if the driver status has changed + if self.needs_activation() { + debug!("Activating device"); + self.virtio_device() + .lock() + .unwrap() + .activate( + self.memory.clone(), + Arc::clone(self.virtio_interrupt.as_ref().unwrap()), + ) + .unwrap_or_else(|err| error!("Error activating device: {err:?}")); + // NOTE(review): the device is marked activated even when activate() reported an + // error above - confirm this is intended. + self.device_activated.store(true, Ordering::SeqCst); + } else { + debug!("Device doesn't need activation"); + } + + // Device has been reset by the driver + if self.device_activated.load(Ordering::SeqCst) && self.is_driver_init() { + let mut device = self.device.lock().unwrap(); + let reset_result = device.reset(); + match reset_result { + Some((virtio_interrupt, mut _queue_evts)) => { + // Upon reset the device returns its interrupt EventFD + self.virtio_interrupt = Some(virtio_interrupt); + self.device_activated.store(false, Ordering::SeqCst); + + // Reset queue readiness (changes queue_enable), queue sizes + // and selected_queue as per spec for reset. + // Reuse the guard already held on `self.device`: locking the same + // `std::sync::Mutex` again from this thread would deadlock or panic. + device.queues_mut().iter_mut().for_each(Queue::reset); + self.common_config.queue_select = 0; + } + None => { + error!("Attempt to reset device when not implemented in underlying device"); + self.common_config.driver_status = DEVICE_FAILED; + } + } + } + + None + } + + fn id(&self) -> Option { + Some(self.id.clone()) + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } +} + +impl BusDevice 
for VirtioPciDevice { + fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) { + self.read_bar(base, offset, data) + } + + fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option> { + self.write_bar(base, offset, data) + } +} + +#[cfg(test)] +mod tests { + use std::sync::{Arc, Mutex}; + + use event_manager::MutEventSubscriber; + use linux_loader::loader::Cmdline; + use pci::{PciBdf, PciClassCode, PciDevice, PciSubclass}; + + use super::VirtioPciDevice; + use crate::Vm; + use crate::arch::MEM_64BIT_DEVICES_START; + use crate::builder::tests::default_vmm; + use crate::devices::virtio::device::VirtioDevice; + use crate::devices::virtio::rng::Entropy; + use crate::devices::virtio::transport::pci::device::PciVirtioSubclass; + use crate::rate_limiter::RateLimiter; + + // Attaches an entropy device over PCI and checks the type 0 configuration header + // registers it exposes. + #[test] + fn test_pci_device_config() { + let mut vmm = default_vmm(); + vmm.device_manager.enable_pci(&vmm.vm); + let entropy = Arc::new(Mutex::new(Entropy::new(RateLimiter::default()).unwrap())); + vmm.device_manager + .attach_virtio_device( + &vmm.vm, + "rng".to_string(), + entropy.clone(), + &mut Cmdline::new(1024).unwrap(), + false, + ) + .unwrap(); + + let device = vmm + .device_manager + .pci_devices + .get_virtio_device(entropy.lock().unwrap().device_type(), "rng") + .unwrap(); + + let mut locked_virtio_pci_device = device.lock().unwrap(); + + // For more information for the values we are checking here look into the VirtIO spec here: + // https://docs.oasis-open.org/virtio/virtio/v1.1/csprd01/virtio-v1.1-csprd01.html#x1-1220007 + // and PCI Header type 0 layout here: https://wiki.osdev.org/PCI#Configuration_Space + + // | 16 bits | 16 bits | + // |-----------|-----------| + // register 0x0: | Device ID | Vendor ID | + // + // Vendor ID of VirtIO devices is 0x1af4 + let reg0 = locked_virtio_pci_device.read_config_register(0); + assert_eq!(reg0 & 0xffff, 0x1af4); + // VirtIO PCI device IDs are in the range [0x1000, 0x107f]. (We are not using transitional + // device IDs). 
+ let devid = reg0 >> 16; + assert!( + (0x1000..=0x107f).contains(&devid), + "Device ID check: {:#x} >= 0x1000 && {:#x} <= 0x107f", + devid, + devid + ); + + // | 16 bits | 16 bits | + // |------------|-----------| + // register 0x1: | Status | Command | + // We offer the capabilities list (bit 4 of status register) at offset 0x34 + let reg1 = locked_virtio_pci_device.read_config_register(1); + assert_eq!(reg1, 0x0010_0000); + + // | 8 bits | 8 bits | 8 bits | 8 bits | + // register 0x2: | Class code | Subclass | Prog IF | Revision ID | + // + // Class code: PciClassCode::NetworkController for net, PciClassCode::MassStorage for block, + // PciClassCode::Other for everything else (including the entropy device used here) + // Subclass: EthernetController for net, MassStorage for block, + // PciVirtioSubclass::NonTransitionalBase (0xff) for everything else + // Prog IF: A register defining some programmable interface register. 0 for VirtIO devices + // Revision ID: 0x1 for modern VirtIO devices + let reg2 = locked_virtio_pci_device.read_config_register(2); + assert_eq!(reg2, 0xffff_0001); + let class_code = ((reg2 >> 24) & 0xff) as u8; + assert_eq!(class_code, PciClassCode::Other.get_register_value()); + let subclass = ((reg2 >> 16) & 0xff) as u8; + assert_eq!( + subclass, + PciVirtioSubclass::NonTransitionalBase.get_register_value() + ); + let prog_if = ((reg2 >> 8) & 0xff) as u8; + assert_eq!(prog_if, 0); + let revision_id = reg2 & 0xff; + assert_eq!(revision_id, 0x1); + + // | 8 bits | 8 bits | 8 bits | 8 bits | + // register 0x3: | BIST | Header Type | Latency timer | Cache line size | + // + // BIST: status and control for self test of PCI devices. 
Always 0 for VirtIO devices + // HeaderType: 0x0 for general devices + // LatencyTimer: Latency timer in units of PCI bus clocks, 0 for VirtIO + // Cache Line size: 0 for VirtIO devices + let reg3 = locked_virtio_pci_device.read_config_register(3); + assert_eq!(reg3, 0x0); + + // register 0xa: Cardbus CIS pointer + // + // We don't emulate CardBus + let reg10 = locked_virtio_pci_device.read_config_register(0xa); + assert_eq!(reg10, 0); + + // | 16 bits | 16 bits | + // register 0xb: | Subsystem ID | Subsystem vendor ID| + // + // For us Subsystem ID is same as device ID and subsystem vendor ID is same as vendor ID + // (reg 0x0) + let reg11 = locked_virtio_pci_device.read_config_register(0xb); + assert_eq!(reg11, reg0); + + // register 0xc: Expansion ROM base address: 0x0 for us + let reg12 = locked_virtio_pci_device.read_config_register(0xc); + assert_eq!(reg12, 0); + + // | 24 bits | 8 bits | + // register 0xd: | Reserved | Capabilities pointer | + // + // NOTE(review): the assertion below only checks the top 8 bits; the reserved field + // drawn above is the upper 24 bits (reg13 >> 8) - confirm whether that was intended. 
+ let reg13 = locked_virtio_pci_device.read_config_register(0xd); + assert_eq!(reg13 >> 24, 0); + + // register 0xe: Reserved + let reg14 = locked_virtio_pci_device.read_config_register(0xe); + assert_eq!(reg14, 0); + + // | 8 bits | 8 bits | 8 bits | 8 bits | + // register 0xf: | max latency | min grant | Interrupt pin | Interrupt line | + // + // We don't specify any of those + let reg15 = locked_virtio_pci_device.read_config_register(0xf); + assert_eq!(reg15, 0); + } + + #[test] + fn test_reading_bars() { + let mut vmm = default_vmm(); + vmm.device_manager.enable_pci(&vmm.vm); + let entropy = Arc::new(Mutex::new(Entropy::new(RateLimiter::default()).unwrap())); + vmm.device_manager + .attach_virtio_device( + &vmm.vm, + "rng".to_string(), + entropy.clone(), + &mut Cmdline::new(1024).unwrap(), + false, + ) + .unwrap(); + + let device = vmm + .device_manager + .pci_devices + .get_virtio_device(entropy.lock().unwrap().device_type(), "rng") + .unwrap(); + + let mut locked_virtio_pci_device = device.lock().unwrap(); + + // 
According to OSdev wiki (https://wiki.osdev.org/PCI#Configuration_Space): + // + // When you want to retrieve the actual base address of a BAR, be sure to mask the lower + // bits. For 16-bit Memory Space BARs, you calculate (BAR[x] & 0xFFF0). For 32-bit Memory + // Space BARs, you calculate (BAR[x] & 0xFFFFFFF0). For 64-bit Memory Space BARs, you + // calculate ((BAR[x] & 0xFFFFFFF0) + ((BAR[x + 1] & 0xFFFFFFFF) << 32)) For I/O Space + // BARs, you calculate (BAR[x] & 0xFFFFFFFC). + + // We are allocating a single 64-bit MMIO bar for VirtIO capabilities list. As a result, we + // are using the first two BAR registers from the configuration space. + // + // The BAR address layout is as follows: + // + // | Bits 31-4 | Bit 3 | Bits 2-1 | Bit 0 | + // | 16-Byte Aligned Base Address | Prefetchable | Type | Always 0 | + // + // For 64-bit addresses though a second BAR is used to hold the upper 32 bits + // of the address. Prefetchable and type will be help in the lower bits of the + // first bar along with the lower 32-bits of the address which is always 16-bytes + // aligned. + let bar_addr_lo = locked_virtio_pci_device.read_config_register(0x4); + let bar_addr_hi = locked_virtio_pci_device.read_config_register(0x5); + let bar_addr = bar_addr_lo as u64 + ((bar_addr_hi as u64) << 32); + + // Bit 0 always 0 + assert_eq!(bar_addr & 0x1, 0); + // Type is 0x2 meaning 64-bit BAR + assert_eq!((bar_addr & 0x6) >> 1, 2); + // The actual address of the BAR should be the first available address of our 64-bit MMIO + // region + assert_eq!(bar_addr & 0xffff_ffff_ffff_fff0, MEM_64BIT_DEVICES_START); + + // Reading the BAR size is a bit more convoluted. According to OSDev wiki: + // + // To determine the amount of address space needed by a PCI device, you must save the + // original value of the BAR, write a value of all 1's to the register, then read it back. 
+ // The amount of memory can then be determined by masking the information bits, performing + // a bitwise NOT ('~' in C), and incrementing the value by 1. + + locked_virtio_pci_device.write_config_register(0x4, 0, &[0xff, 0xff, 0xff, 0xff]); + // Read the lower size bits and mask out the last 4 bits include Prefetchable, Type and + // hardwired-0 + let bar_size_lo = locked_virtio_pci_device.read_config_register(0x4) as u64 & 0xfffffff0; + locked_virtio_pci_device.write_config_register(0x5, 0, &[0xff, 0xff, 0xff, 0xff]); + let bar_size_hi = locked_virtio_pci_device.read_config_register(0x5) as u64; + let bar_size = !((bar_size_hi << 32) | bar_size_lo) + 1; + + // We create a capabilities BAR region of 0x80000 bytes + assert_eq!(bar_size, 0x80000); + } +} diff --git a/src/vmm/src/devices/virtio/transport/pci/mod.rs b/src/vmm/src/devices/virtio/transport/pci/mod.rs new file mode 100644 index 00000000000..520b52274b3 --- /dev/null +++ b/src/vmm/src/devices/virtio/transport/pci/mod.rs @@ -0,0 +1,5 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +pub mod common_config; +pub mod device; diff --git a/src/vmm/src/devices/virtio/vhost_user.rs b/src/vmm/src/devices/virtio/vhost_user.rs index 53e479ef652..4766c96edb7 100644 --- a/src/vmm/src/devices/virtio/vhost_user.rs +++ b/src/vmm/src/devices/virtio/vhost_user.rs @@ -922,10 +922,10 @@ pub(crate) mod tests { // the backend. 
let expected_config = VringData { index: 0, - size: 0, + size: 69, config: VringConfigData { queue_max_size: 69, - queue_size: 0, + queue_size: 69, flags: 0, desc_table_addr: guest_memory .get_host_address(queue.desc_table_address) diff --git a/src/vmm/src/devices/virtio/vsock/persist.rs b/src/vmm/src/devices/virtio/vsock/persist.rs index 9d2fd61d9d5..6775707da3e 100644 --- a/src/vmm/src/devices/virtio/vsock/persist.rs +++ b/src/vmm/src/devices/virtio/vsock/persist.rs @@ -31,7 +31,7 @@ pub struct VsockState { pub struct VsockFrontendState { /// Context Identifier. pub cid: u64, - virtio_state: VirtioDeviceState, + pub virtio_state: VirtioDeviceState, } /// An enum for the serializable backend state types. @@ -53,8 +53,6 @@ pub struct VsockUdsState { pub struct VsockConstructorArgs { /// Pointer to guest memory. pub mem: GuestMemoryMmap, - /// Interrupt to use for the device. - pub interrupt: Arc, /// The vsock Unix Backend. pub backend: B, } @@ -123,14 +121,7 @@ where vsock.acked_features = state.virtio_state.acked_features; vsock.avail_features = state.virtio_state.avail_features; - vsock.device_state = if state.virtio_state.activated { - DeviceState::Activated(ActiveState { - mem: constructor_args.mem, - interrupt: constructor_args.interrupt, - }) - } else { - DeviceState::Inactive - }; + vsock.device_state = DeviceState::Inactive; Ok(vsock) } } @@ -193,7 +184,6 @@ pub(crate) mod tests { let mut restored_device = Vsock::restore( VsockConstructorArgs { mem: ctx.mem.clone(), - interrupt: default_interrupt(), backend: match restored_state.backend { VsockBackendState::Uds(uds_state) => { assert_eq!(uds_state.path, "test".to_owned()); diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 01ef9547d82..4549c79857a 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -126,6 +126,7 @@ use devices::acpi::vmgenid::VmGenIdError; use devices::virtio::device::VirtioDevice; use event_manager::{EventManager as BaseEventManager, EventOps, Events, MutEventSubscriber}; 
use seccomp::BpfProgram; +use snapshot::Persist; use userfaultfd::Uffd; use vmm_sys_util::epoll::EventSet; use vmm_sys_util::eventfd::EventFd; @@ -299,8 +300,9 @@ pub struct Vmm { // Guest VM core resources. kvm: Kvm, /// VM object - pub vm: Vm, + pub vm: Arc, // Save UFFD in order to keep it open in the Firecracker process, as well. + #[allow(unused)] uffd: Option, vcpus_handles: Vec, // Used by Vcpus and devices to initiate teardown; Vmm should never write here. @@ -371,10 +373,9 @@ impl Vmm { self.vcpus_handles.reserve(vcpu_count); for mut vcpu in vcpus.drain(..) { - vcpu.set_mmio_bus(self.device_manager.resource_allocator.mmio_bus.clone()); + vcpu.set_mmio_bus(self.vm.common.mmio_bus.clone()); #[cfg(target_arch = "x86_64")] - vcpu.kvm_vcpu - .set_pio_bus(self.device_manager.resource_allocator.pio_bus.clone()); + vcpu.kvm_vcpu.set_pio_bus(self.vm.pio_bus.clone()); self.vcpus_handles .push(vcpu.start_threaded(vcpu_seccomp_filter.clone(), barrier.clone())?); @@ -388,7 +389,7 @@ impl Vmm { /// Sends a resume command to the vCPUs. pub fn resume_vm(&mut self) -> Result<(), VmmError> { - self.device_manager.mmio_devices.kick_devices(); + self.device_manager.kick_virtio_devices(); // Send the events. self.vcpus_handles diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index 6fd5ca89081..067d1083853 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -166,21 +166,8 @@ pub fn create_snapshot( // We need to mark queues as dirty again for all activated devices. The reason we // do it here is because we don't mark pages as dirty during runtime // for queue objects. - // SAFETY: - // This should never fail as we only mark pages only if device has already been activated, - // and the address validation was already performed on device activation. 
vmm.device_manager - .mmio_devices - .for_each_virtio_device(|_, _, device| { - let mmio_dev_locked = device.inner.lock().expect("Poisoned lock"); - let d = mmio_dev_locked.locked_device(); - if d.is_activated() { - d.mark_queue_memory_dirty(vmm.vm.guest_memory()) - } else { - Ok(()) - } - }) - .unwrap(); + .mark_virtio_queue_memory_dirty(vmm.vm.guest_memory()); Ok(()) } @@ -599,6 +586,7 @@ mod tests { #[cfg(target_arch = "aarch64")] use crate::construct_kvm_mpidrs; use crate::devices::virtio::block::CacheType; + use crate::snapshot::Persist; use crate::vmm_config::balloon::BalloonDeviceConfig; use crate::vmm_config::net::NetworkInterfaceConfig; use crate::vmm_config::vsock::tests::default_config; diff --git a/src/vmm/src/vstate/mod.rs b/src/vmm/src/vstate/mod.rs index 47458835e04..f4fa25914d0 100644 --- a/src/vmm/src/vstate/mod.rs +++ b/src/vmm/src/vstate/mod.rs @@ -5,6 +5,8 @@ pub mod kvm; /// Module with GuestMemory implementation. pub mod memory; +/// Resource manager for devices. +pub mod resources; /// Module with Vcpu implementation. pub mod vcpu; /// Module with Vm implementation. diff --git a/src/vmm/src/device_manager/resources.rs b/src/vmm/src/vstate/resources.rs similarity index 55% rename from src/vmm/src/device_manager/resources.rs rename to src/vmm/src/vstate/resources.rs index 249d0507ba8..3d8d8016e97 100644 --- a/src/vmm/src/device_manager/resources.rs +++ b/src/vmm/src/vstate/resources.rs @@ -1,14 +1,15 @@ // Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 +use std::convert::Infallible; use std::sync::{Arc, Mutex}; -use pci::DeviceRelocation; +use serde::{Deserialize, Serialize}; pub use vm_allocator::AllocPolicy; use vm_allocator::{AddressAllocator, IdAllocator}; -use vm_device::Bus; use crate::arch; +use crate::snapshot::Persist; /// A resource manager for (de)allocating interrupt lines (GSIs) and guest memory /// @@ -19,19 +20,14 @@ use crate::arch; /// * Memory allocations in the MMIO address space #[derive(Debug)] pub struct ResourceAllocator { - // Allocator for device interrupt lines + /// Allocator for device interrupt lines pub gsi_allocator: Arc>, - // Allocator for memory in the 32-bit MMIO address space + /// Allocator for memory in the 32-bit MMIO address space pub mmio32_memory: Arc>, - // Allocator for memory in the 64-bit MMIO address space + /// Allocator for memory in the 64-bit MMIO address space pub mmio64_memory: Arc>, - // Memory allocator for system data + /// Memory allocator for system data pub system_memory: Arc>, - /// MMIO bus - pub mmio_bus: Arc, - #[cfg(target_arch = "x86_64")] - /// Port IO bus - pub pio_bus: Arc, } impl ResourceAllocator { @@ -51,9 +47,6 @@ impl ResourceAllocator { arch::SYSTEM_MEM_START, arch::SYSTEM_MEM_SIZE, )?)), - mmio_bus: Arc::new(Bus::new()), - #[cfg(target_arch = "x86_64")] - pio_bus: Arc::new(Bus::new()), }) } @@ -152,23 +145,74 @@ impl ResourceAllocator { } } -impl DeviceRelocation for ResourceAllocator { - fn move_bar( - &self, - _old_base: u64, - _new_base: u64, - _len: u64, - _pci_dev: &mut dyn pci::PciDevice, - _region_type: pci::PciBarRegionType, - ) -> Result<(), std::io::Error> { - todo!() +impl<'a> Persist<'a> for ResourceAllocator { + type State = ResourceAllocatorState; + type ConstructorArgs = (); + type Error = Infallible; + + fn save(&self) -> Self::State { + ResourceAllocatorState { + gsi_allocator: self.gsi_allocator.clone(), + mmio32_memory: self.mmio32_memory.clone(), + mmio64_memory: 
self.mmio64_memory.clone(), + system_memory: self.system_memory.clone(), + } + } + + fn restore( + _constructor_args: Self::ConstructorArgs, + state: &Self::State, + ) -> std::result::Result { + Ok(ResourceAllocator { + gsi_allocator: state.gsi_allocator.clone(), + mmio32_memory: state.mmio32_memory.clone(), + mmio64_memory: state.mmio64_memory.clone(), + system_memory: state.system_memory.clone(), + }) + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +/// State of a ResourceAllocator +pub struct ResourceAllocatorState { + /// Allocator for device interrupt lines + pub gsi_allocator: Arc>, + /// Allocator for memory in the 32-bit MMIO address space + pub mmio32_memory: Arc>, + /// Allocator for memory in the 64-bit MMIO address space + pub mmio64_memory: Arc>, + /// Memory allocator for system data + pub system_memory: Arc>, +} + +impl Default for ResourceAllocatorState { + fn default() -> Self { + Self { + gsi_allocator: Arc::new(Mutex::new( + IdAllocator::new(arch::IRQ_BASE, arch::IRQ_MAX).unwrap(), + )), + mmio32_memory: Arc::new(Mutex::new( + AddressAllocator::new(arch::MEM_32BIT_DEVICES_START, arch::MEM_32BIT_DEVICES_SIZE) + .unwrap(), + )), + mmio64_memory: Arc::new(Mutex::new( + AddressAllocator::new(arch::MEM_64BIT_DEVICES_START, arch::MEM_64BIT_DEVICES_SIZE) + .unwrap(), + )), + system_memory: Arc::new(Mutex::new( + AddressAllocator::new(arch::SYSTEM_MEM_START, arch::SYSTEM_MEM_SIZE).unwrap(), + )), + } } } #[cfg(test)] mod tests { - use super::ResourceAllocator; - use crate::arch; + use vm_allocator::AllocPolicy; + + use super::{ResourceAllocator, ResourceAllocatorState}; + use crate::arch::{self, IRQ_BASE}; + use crate::snapshot::{Persist, Snapshot}; const MAX_IRQS: u32 = arch::IRQ_MAX - arch::IRQ_BASE + 1; @@ -210,4 +254,61 @@ mod tests { assert_eq!(allocator.allocate_gsi(1), Ok(vec![i])); } } + + fn clone_allocator(allocator: &ResourceAllocator) -> ResourceAllocator { + let mut buf = vec![0u8; 1024]; + Snapshot::serialize(&mut 
buf.as_mut_slice(), &allocator.save()).unwrap(); + let restored_state: ResourceAllocatorState = + Snapshot::deserialize(&mut buf.as_slice()).unwrap(); + ResourceAllocator::restore((), &restored_state).unwrap() + } + + #[test] + fn test_save_restore() { + let allocator0 = ResourceAllocator::new().unwrap(); + let gsi_0 = allocator0.allocate_gsi(1).unwrap()[0]; + assert_eq!(gsi_0, IRQ_BASE); + + let allocator1 = clone_allocator(&allocator0); // must not hand out gsi_0 again + let gsi_1 = allocator1.allocate_gsi(1).unwrap()[0]; + assert_eq!(gsi_1, IRQ_BASE + 1); + let mmio32_mem = allocator1 + .allocate_32bit_mmio_memory(0x42, 1, AllocPolicy::FirstMatch) + .unwrap(); + assert_eq!(mmio32_mem, arch::MEM_32BIT_DEVICES_START); + let mmio64_mem = allocator1 + .allocate_64bit_mmio_memory(0x42, 1, AllocPolicy::FirstMatch) + .unwrap(); + assert_eq!(mmio64_mem, arch::MEM_64BIT_DEVICES_START); + let system_mem = allocator1 + .allocate_system_memory(0x42, 1, AllocPolicy::FirstMatch) + .unwrap(); + assert_eq!(system_mem, arch::SYSTEM_MEM_START); + + let allocator2 = clone_allocator(&allocator1); // must inherit allocator1's allocations + allocator2 + .allocate_32bit_mmio_memory(0x42, 1, AllocPolicy::ExactMatch(mmio32_mem)) + .unwrap_err(); + allocator2 + .allocate_64bit_mmio_memory(0x42, 1, AllocPolicy::ExactMatch(mmio64_mem)) + .unwrap_err(); + allocator2 + .allocate_system_memory(0x42, 1, AllocPolicy::ExactMatch(system_mem)) + .unwrap_err(); + + let gsi_2 = allocator2.allocate_gsi(1).unwrap()[0]; + assert_eq!(gsi_2, IRQ_BASE + 2); + let mmio32_mem = allocator2 + .allocate_32bit_mmio_memory(0x42, 1, AllocPolicy::FirstMatch) + .unwrap(); + assert_eq!(mmio32_mem, arch::MEM_32BIT_DEVICES_START + 0x42); + let mmio64_mem = allocator2 + .allocate_64bit_mmio_memory(0x42, 1, AllocPolicy::FirstMatch) + .unwrap(); + assert_eq!(mmio64_mem, arch::MEM_64BIT_DEVICES_START + 0x42); + let system_mem = allocator2 + .allocate_system_memory(0x42, 1, AllocPolicy::FirstMatch) + .unwrap(); + assert_eq!(system_mem, arch::SYSTEM_MEM_START + 0x42); + } } diff --git
a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index 7a8965a4b9a..9af0cb75e61 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -9,23 +9,235 @@ use std::collections::HashMap; use std::fs::OpenOptions; use std::io::Write; use std::path::Path; -use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::{Arc, Mutex}; -use kvm_bindings::{KVM_MEM_LOG_DIRTY_PAGES, kvm_userspace_memory_region}; -use kvm_ioctls::VmFd; +#[cfg(target_arch = "x86_64")] +use kvm_bindings::KVM_IRQCHIP_IOAPIC; +use kvm_bindings::{ + KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI, KVM_MEM_LOG_DIRTY_PAGES, KVM_MSI_VALID_DEVID, + KvmIrqRouting, kvm_irq_routing_entry, kvm_userspace_memory_region, +}; +use kvm_ioctls::{IoEventAddress, NoDatamatch, VmFd}; +use log::debug; +use pci::{DeviceRelocation, PciBarRegionType}; +use serde::{Deserialize, Serialize}; +use vm_allocator::RangeInclusive; +use vm_device::interrupt::{InterruptSourceGroup, MsiIrqSourceConfig}; +use vmm_sys_util::errno; use vmm_sys_util::eventfd::EventFd; pub use crate::arch::{ArchVm as Vm, ArchVmError, VmState}; +use crate::devices::virtio::transport::pci::device::VirtioPciDevice; use crate::logger::info; use crate::persist::CreateSnapshotError; +use crate::snapshot::Persist; use crate::utils::u64_to_usize; use crate::vmm_config::snapshot::SnapshotType; use crate::vstate::memory::{ Address, GuestMemory, GuestMemoryExtension, GuestMemoryMmap, GuestMemoryRegion, GuestRegionMmap, }; +use crate::vstate::resources::ResourceAllocator; use crate::vstate::vcpu::VcpuError; use crate::{DirtyBitmap, Vcpu, mem_size_mib}; +#[derive(Debug, thiserror::Error, displaydoc::Display)] +/// Errors related with Firecracker interrupts +pub enum InterruptError { + /// Error allocating resources: {0} + Allocator(#[from] vm_allocator::Error), + /// EventFd error: {0} + EventFd(std::io::Error), + /// FamStruct error: {0} + FamStruct(#[from] vmm_sys_util::fam::Error), + /// KVM error: {0} + Kvm(#[from] 
kvm_ioctls::Error), +} + +#[derive(Debug, Serialize, Deserialize)] +/// A struct representing an interrupt line used by some device of the microVM +pub struct RoutingEntry { + entry: kvm_irq_routing_entry, + masked: bool, +} + +/// Type that describes an allocated interrupt +#[derive(Debug)] +pub struct MsiVector { + /// GSI used for this vector + pub gsi: u32, + /// EventFd used for this vector + pub event_fd: EventFd, + /// Flag determining whether the vector is enabled + pub enabled: AtomicBool, +} + +impl MsiVector { + /// Create a new [`MsiVector`] of a particular type + pub fn new(gsi: u32, enabled: bool) -> Result { + Ok(MsiVector { + gsi, + event_fd: EventFd::new(libc::EFD_NONBLOCK).map_err(InterruptError::EventFd)?, + enabled: AtomicBool::new(enabled), + }) + } +} + +impl MsiVector { + /// Enable vector + fn enable(&self, vmfd: &VmFd) -> Result<(), errno::Error> { + if !self.enabled.load(Ordering::Acquire) { + vmfd.register_irqfd(&self.event_fd, self.gsi)?; + self.enabled.store(true, Ordering::Release); + } + + Ok(()) + } + + /// Disable vector + fn disable(&self, vmfd: &VmFd) -> Result<(), errno::Error> { + if self.enabled.load(Ordering::Acquire) { + vmfd.unregister_irqfd(&self.event_fd, self.gsi)?; + self.enabled.store(false, Ordering::Release); + } + + Ok(()) + } +} + +#[derive(Debug)] +/// MSI interrupts created for a VirtIO device +pub struct MsiVectorGroup { + vm: Arc, + irq_routes: HashMap, +} + +impl MsiVectorGroup { + /// Returns the number of vectors in this group + pub fn num_vectors(&self) -> u16 { + u16::try_from(self.irq_routes.len()).unwrap() + } +} + +impl<'a> Persist<'a> for MsiVectorGroup { + type State = HashMap; + type ConstructorArgs = Arc; + type Error = InterruptError; + + fn save(&self) -> Self::State { + // We don't save the "enabled" state of the MSI interrupt. 
PCI devices store the MSI-X + // configuration and make sure that the vector is enabled during the restore path if it was + // initially enabled + self.irq_routes + .iter() + .map(|(id, route)| (*id, route.gsi)) + .collect() + } + + fn restore( + constructor_args: Self::ConstructorArgs, + state: &Self::State, + ) -> std::result::Result { + let mut irq_routes = HashMap::new(); + + for (id, gsi) in state { + irq_routes.insert(*id, MsiVector::new(*gsi, false)?); + } + + Ok(MsiVectorGroup { + vm: constructor_args, + irq_routes, + }) + } +} + +impl InterruptSourceGroup for MsiVectorGroup { + fn enable(&self) -> vm_device::interrupt::Result<()> { + for (_, route) in self.irq_routes.iter() { + route.enable(&self.vm.common.fd)?; + } + + Ok(()) + } + + fn disable(&self) -> vm_device::interrupt::Result<()> { + for (_, route) in self.irq_routes.iter() { + route.disable(&self.vm.common.fd)?; + } + + Ok(()) + } + + fn trigger( + &self, + index: vm_device::interrupt::InterruptIndex, + ) -> vm_device::interrupt::Result<()> { + if let Some(route) = self.irq_routes.get(&index) { + return route.event_fd.write(1); + } + + Err(std::io::Error::other(format!( + "trigger: invalid interrupt index {index}" + ))) + } + + fn notifier(&self, index: vm_device::interrupt::InterruptIndex) -> Option<&EventFd> { + self.irq_routes.get(&index).map(|route| &route.event_fd) + } + + fn update( + &self, + index: vm_device::interrupt::InterruptIndex, + config: vm_device::interrupt::InterruptSourceConfig, + masked: bool, + set_gsi: bool, + ) -> vm_device::interrupt::Result<()> { + let msi_config = match config { + vm_device::interrupt::InterruptSourceConfig::LegacyIrq(_) => { + return Err(std::io::Error::other( + "MSI-x update: invalid configuration type", + )); + } + vm_device::interrupt::InterruptSourceConfig::MsiIrq(config) => config, + }; + + if let Some(route) = self.irq_routes.get(&index) { + // When an interrupt is masked the GSI will not be passed to KVM through + // KVM_SET_GSI_ROUTING. 
So, call [`disable()`] to unregister the interrupt file + // descriptor before passing the interrupt routes to KVM + if masked { + route.disable(&self.vm.common.fd)?; + } + + self.vm.register_msi(route, masked, msi_config)?; + if set_gsi { + self.vm + .set_gsi_routes() + .map_err(|err| std::io::Error::other(format!("MSI-X update: {err}")))? + } + + // Assign KVM_IRQFD after KVM_SET_GSI_ROUTING to avoid + // panic on kernel which does not have commit a80ced6ea514 + // (KVM: SVM: fix panic on out-of-bounds guest IRQ). + if !masked { + route.enable(&self.vm.common.fd)?; + } + + return Ok(()); + } + + Err(std::io::Error::other(format!( + "MSI-X update: invalid vector index {index}" + ))) + } + + fn set_gsi(&self) -> vm_device::interrupt::Result<()> { + self.vm + .set_gsi_routes() + .map_err(|err| std::io::Error::other(format!("MSI-X update: {err}"))) + } +} + /// Architecture independent parts of a VM. #[derive(Debug)] pub struct VmCommon { @@ -34,6 +246,12 @@ pub struct VmCommon { max_memslots: usize, /// The guest memory of this Vm. pub guest_memory: GuestMemoryMmap, + /// Interrupts used by Vm's devices + pub interrupts: Arc>>, + /// Allocator for VM resources + pub resource_allocator: Arc, + /// MMIO bus + pub mmio_bus: Arc, } /// Errors associated with the wrappers over KVM ioctls. 
@@ -55,6 +273,8 @@ pub enum VmError { NotEnoughMemorySlots, /// Memory Error: {0} VmMemory(#[from] vm_memory::Error), + /// ResourceAllocator error: {0} + ResourceAllocator(#[from] vm_allocator::Error), } /// Contains Vm functions that are usable across CPU architectures @@ -101,6 +321,9 @@ impl Vm { fd, max_memslots: kvm.max_nr_memslots(), guest_memory: GuestMemoryMmap::default(), + interrupts: Arc::new(Mutex::new(HashMap::new())), + resource_allocator: Arc::new(ResourceAllocator::new()?), + mmio_bus: Arc::new(vm_device::Bus::new()), }) } @@ -276,14 +499,232 @@ impl Vm { file.sync_all() .map_err(|err| MemoryBackingFile("sync_all", err)) } + + /// Register a device IRQ + pub fn register_irq(&self, fd: &EventFd, gsi: u32) -> Result<(), errno::Error> { + self.common.fd.register_irqfd(fd, gsi)?; + + let mut entry = kvm_irq_routing_entry { + gsi, + type_: KVM_IRQ_ROUTING_IRQCHIP, + ..Default::default() + }; + #[cfg(target_arch = "x86_64")] + { + entry.u.irqchip.irqchip = KVM_IRQCHIP_IOAPIC; + } + #[cfg(target_arch = "aarch64")] + { + entry.u.irqchip.irqchip = 0; + } + entry.u.irqchip.pin = gsi; + + self.common + .interrupts + .lock() + .expect("Poisoned lock") + .insert( + gsi, + RoutingEntry { + entry, + masked: false, + }, + ); + Ok(()) + } + + /// Register an MSI device interrupt + pub fn register_msi( + &self, + route: &MsiVector, + masked: bool, + config: MsiIrqSourceConfig, + ) -> Result<(), errno::Error> { + let mut entry = kvm_irq_routing_entry { + gsi: route.gsi, + type_: KVM_IRQ_ROUTING_MSI, + ..Default::default() + }; + entry.u.msi.address_lo = config.low_addr; + entry.u.msi.address_hi = config.high_addr; + entry.u.msi.data = config.data; + + if self.common.fd.check_extension(kvm_ioctls::Cap::MsiDevid) { + // On AArch64, there is a limitation on the range of the 'devid', + // it cannot be greater than 65535 (the max of u16). + // + // BDF cannot be used directly, because 'segment' is in high + // 16 bits.
The layout of the u32 BDF is: + // |---- 16 bits ----|-- 8 bits --|-- 5 bits --|-- 3 bits --| + // | segment | bus | device | function | + // + // Now that we support 1 bus only in a segment, we can build a + // 'devid' by replacing the 'bus' bits with the low 8 bits of + // 'segment' data. + // This way we can resolve the range checking problem and give + // different `devid` to all the devices. Limitation is that at + // most 256 segments can be supported. + // + let modified_devid = ((config.devid & 0x00ff_0000) >> 8) | (config.devid & 0xff); + + entry.flags = KVM_MSI_VALID_DEVID; + entry.u.msi.__bindgen_anon_1.devid = modified_devid; + } + + self.common + .interrupts + .lock() + .expect("Poisoned lock") + .insert(route.gsi, RoutingEntry { entry, masked }); + + Ok(()) + } + + /// Create a group of MSI-X interrupts + pub fn create_msix_group( + vm: Arc, + base: u32, + count: u16, + ) -> Result { + debug!("Creating new MSI group with {count} vectors"); + let mut irq_routes = HashMap::with_capacity(count as usize); + for (i, gsi) in vm + .common + .resource_allocator + .allocate_gsi(count as u32)?
+ .iter() + .enumerate() + { + irq_routes.insert( + u32::try_from(i).unwrap() + base, + MsiVector::new(*gsi, false)?, + ); + } + + Ok(MsiVectorGroup { vm, irq_routes }) + } + + /// Set GSI routes to KVM + pub fn set_gsi_routes(&self) -> Result<(), InterruptError> { + let entries = self.common.interrupts.lock().expect("Poisoned lock"); + let mut routes = KvmIrqRouting::new(0)?; + + for (_, entry) in entries.iter() { + if entry.masked { + continue; + } + routes.push(entry.entry)?; + } + + self.common.fd.set_gsi_routing(&routes)?; + Ok(()) + } +} + +impl DeviceRelocation for Vm { + fn move_bar( + &self, + old_base: u64, + new_base: u64, + len: u64, + pci_dev: &mut dyn pci::PciDevice, + region_type: pci::PciBarRegionType, + ) -> Result<(), std::io::Error> { + debug!("pci: moving BAR from {old_base:#x}:{len:#x} to {new_base:#x}:{len:#x}"); + match region_type { + PciBarRegionType::IoRegion => { + #[cfg(target_arch = "x86_64")] + // We do not allocate IO addresses, we just hard-code them, no need to handle + // (re)allocations.
Just update PIO bus + self.pio_bus + .update_range(old_base, len, new_base, len) + .map_err(std::io::Error::other)?; + + #[cfg(target_arch = "aarch64")] + return Err(std::io::Error::other( + "pci: IO regions not supported on Aarch64", + )); + } + PciBarRegionType::Memory32BitRegion | PciBarRegionType::Memory64BitRegion => { + let old_range = + RangeInclusive::new(old_base, old_base + len - 1).map_err(|_| { + std::io::Error::other("pci: invalid old range for device relocation") + })?; + let allocator = if region_type == PciBarRegionType::Memory32BitRegion { + &self.common.resource_allocator.mmio32_memory + } else { + &self.common.resource_allocator.mmio64_memory + }; + + allocator + .lock() + .expect("Poisoned lock") + .free(&old_range) + .map_err(|_| { + std::io::Error::other("pci: failed deallocating old MMIO range") + })?; + + allocator + .lock() + .expect("Poisoned lock") + .allocate(len, len, vm_allocator::AllocPolicy::ExactMatch(new_base)) + .map_err(|_| std::io::Error::other("pci: failed allocating new MMIO range"))?; + + // Re-key the device on the MMIO bus from the old range to the new one + self.common + .mmio_bus + .update_range(old_base, len, new_base, len) + .map_err(std::io::Error::other)?; + } + } + + let any_dev = pci_dev.as_any_mut(); + if let Some(virtio_pci_dev) = any_dev.downcast_ref::() { + let bar_addr = virtio_pci_dev.config_bar_addr(); + if bar_addr == new_base { + for (i, queue_evt) in virtio_pci_dev + .virtio_device() + .lock() + .expect("Poisoned lock") + .queue_events() + .iter() + .enumerate() + { + const NOTIFICATION_BAR_OFFSET: u64 = 0x6000; + const NOTIFY_OFF_MULTIPLIER: u64 = 4; + let notify_base = old_base + NOTIFICATION_BAR_OFFSET; + let io_addr = + IoEventAddress::Mmio(notify_base + i as u64 * NOTIFY_OFF_MULTIPLIER); + self.common + .fd + .unregister_ioevent(queue_evt, &io_addr, NoDatamatch)?; + + let notify_base = new_base + NOTIFICATION_BAR_OFFSET; + let io_addr = + IoEventAddress::Mmio(notify_base + i as u64 * NOTIFY_OFF_MULTIPLIER); + self.common + .fd + .register_ioevent(queue_evt, &io_addr,
NoDatamatch)?; + } + } + } + + pci_dev.move_bar(old_base, new_base) + } } #[cfg(test)] pub(crate) mod tests { + use std::ops::DerefMut; + + use pci::PciBdf; + use vm_allocator::AllocPolicy; + use vm_device::interrupt::{InterruptSourceConfig, LegacyIrqSourceConfig}; use vm_memory::GuestAddress; use vm_memory::mmap::MmapRegionBuilder; use super::*; + use crate::device_manager::mmio::tests::DummyDevice; use crate::test_utils::single_region_mem_raw; use crate::utils::mib_to_bytes; use crate::vstate::kvm::Kvm; @@ -391,4 +832,423 @@ pub(crate) mod tests { assert_eq!(vcpu_vec.len(), vcpu_count as usize); } + + fn enable_irqchip(vm: &mut Vm) { + #[cfg(target_arch = "x86_64")] + vm.setup_irqchip().unwrap(); + #[cfg(target_arch = "aarch64")] + vm.setup_irqchip(1).unwrap(); + } + + fn create_msix_group(vm: &Arc) -> MsiVectorGroup { + Vm::create_msix_group(vm.clone(), 0, 4).unwrap() + } + + #[test] + fn test_msi_vector_group_new() { + let (_, vm) = setup_vm_with_memory(mib_to_bytes(128)); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + assert_eq!(msix_group.num_vectors(), 4); + } + + #[test] + fn test_msi_vector_group_enable_disable() { + let (_, mut vm) = setup_vm_with_memory(mib_to_bytes(128)); + enable_irqchip(&mut vm); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + + // Initially all vectors are disabled + for route in msix_group.irq_routes.values() { + assert!(!route.enabled.load(Ordering::Acquire)) + } + + // Enable works + msix_group.enable().unwrap(); + for route in msix_group.irq_routes.values() { + assert!(route.enabled.load(Ordering::Acquire)); + } + // Enabling an enabled group doesn't error out + msix_group.enable().unwrap(); + + // Disable works + msix_group.disable().unwrap(); + for route in msix_group.irq_routes.values() { + assert!(!route.enabled.load(Ordering::Acquire)) + } + // Disabling a disabled group doesn't error out + msix_group.disable().unwrap(); + } + + #[test] + fn test_msi_vector_group_trigger() { + let (_, mut vm) =
setup_vm_with_memory(mib_to_bytes(128)); + enable_irqchip(&mut vm); + + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + + // We can now trigger all vectors + for i in 0..4 { + msix_group.trigger(i).unwrap() + } + + // We can't trigger an invalid vector + msix_group.trigger(4).unwrap_err(); + } + + #[test] + fn test_msi_vector_group_notifier() { + let (_, vm) = setup_vm_with_memory(mib_to_bytes(128)); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + + for i in 0..4 { + assert!(msix_group.notifier(i).is_some()); + } + + assert!(msix_group.notifier(4).is_none()); + } + + #[test] + fn test_msi_vector_group_update_wrong_config() { + let (_, vm) = setup_vm_with_memory(mib_to_bytes(128)); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + let irq_config = LegacyIrqSourceConfig { irqchip: 0, pin: 0 }; + msix_group + .update(0, InterruptSourceConfig::LegacyIrq(irq_config), true, true) + .unwrap_err(); + } + + #[test] + fn test_msi_vector_group_update_invalid_vector() { + let (_, mut vm) = setup_vm_with_memory(mib_to_bytes(128)); + enable_irqchip(&mut vm); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + let config = InterruptSourceConfig::MsiIrq(MsiIrqSourceConfig { + high_addr: 0x42, + low_addr: 0x12, + data: 0x12, + devid: 0xafa, + }); + msix_group.update(0, config, true, true).unwrap(); + msix_group.update(4, config, true, true).unwrap_err(); + } + + #[test] + fn test_msi_vector_group_update() { + let (_, mut vm) = setup_vm_with_memory(mib_to_bytes(128)); + enable_irqchip(&mut vm); + let vm = Arc::new(vm); + assert!(vm.common.interrupts.lock().unwrap().is_empty()); + let msix_group = create_msix_group(&vm); + + // Set some configuration for the vectors. 
Initially all are masked + let mut config = MsiIrqSourceConfig { + high_addr: 0x42, + low_addr: 0x13, + data: 0x12, + devid: 0xafa, + }; + for i in 0..4 { + config.data = 0x12 * i; + msix_group + .update(i, InterruptSourceConfig::MsiIrq(config), true, false) + .unwrap(); + } + + // All vectors should be disabled + for vector in msix_group.irq_routes.values() { + assert!(!vector.enabled.load(Ordering::Acquire)); + } + + for i in 0..4 { + let gsi = crate::arch::IRQ_BASE + i; + let interrupts = vm.common.interrupts.lock().unwrap(); + let kvm_route = interrupts.get(&gsi).unwrap(); + assert!(kvm_route.masked); + assert_eq!(kvm_route.entry.gsi, gsi); + assert_eq!(kvm_route.entry.type_, KVM_IRQ_ROUTING_MSI); + // SAFETY: because we know we setup MSI routes. + unsafe { + assert_eq!(kvm_route.entry.u.msi.address_hi, 0x42); + assert_eq!(kvm_route.entry.u.msi.address_lo, 0x13); + assert_eq!(kvm_route.entry.u.msi.data, 0x12 * i); + } + } + + // Simply enabling the vectors should not update the registered IRQ routes + msix_group.enable().unwrap(); + for i in 0..4 { + let gsi = crate::arch::IRQ_BASE + i; + let interrupts = vm.common.interrupts.lock().unwrap(); + let kvm_route = interrupts.get(&gsi).unwrap(); + assert!(kvm_route.masked); + assert_eq!(kvm_route.entry.gsi, gsi); + assert_eq!(kvm_route.entry.type_, KVM_IRQ_ROUTING_MSI); + // SAFETY: because we know we setup MSI routes. 
+ unsafe { + assert_eq!(kvm_route.entry.u.msi.address_hi, 0x42); + assert_eq!(kvm_route.entry.u.msi.address_lo, 0x13); + assert_eq!(kvm_route.entry.u.msi.data, 0x12 * i); + } + } + + // Updating the config of a vector should enable its route (and only its route) + config.data = 0; + msix_group + .update(0, InterruptSourceConfig::MsiIrq(config), false, true) + .unwrap(); + for i in 0..4 { + let gsi = crate::arch::IRQ_BASE + i; + let interrupts = vm.common.interrupts.lock().unwrap(); + let kvm_route = interrupts.get(&gsi).unwrap(); + assert_eq!(kvm_route.masked, i != 0); + assert_eq!(kvm_route.entry.gsi, gsi); + assert_eq!(kvm_route.entry.type_, KVM_IRQ_ROUTING_MSI); + // SAFETY: because we know we setup MSI routes. + unsafe { + assert_eq!(kvm_route.entry.u.msi.address_hi, 0x42); + assert_eq!(kvm_route.entry.u.msi.address_lo, 0x13); + assert_eq!(kvm_route.entry.u.msi.data, 0x12 * i); + } + } + } + + #[cfg(target_arch = "x86_64")] + #[test] + fn test_msi_vector_group_set_gsi_without_ioapic() { + // Setting GSI routes without IOAPIC setup should fail on x86. 
Apparently, it doesn't fail + // on Aarch64 + let (_, vm) = setup_vm_with_memory(mib_to_bytes(128)); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + let err = msix_group.set_gsi().unwrap_err(); + assert_eq!( + format!("{err}"), + "MSI-X update: KVM error: Invalid argument (os error 22)" + ); + } + + #[test] + fn test_msi_vector_group_set_gsi() { + let (_, mut vm) = setup_vm_with_memory(mib_to_bytes(128)); + enable_irqchip(&mut vm); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + + msix_group.set_gsi().unwrap(); + } + + #[test] + fn test_msi_vector_group_persistence() { + let (_, mut vm) = setup_vm_with_memory(mib_to_bytes(128)); + enable_irqchip(&mut vm); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + + msix_group.enable().unwrap(); + let state = msix_group.save(); + let restored_group = MsiVectorGroup::restore(vm, &state).unwrap(); + + assert_eq!(msix_group.num_vectors(), restored_group.num_vectors()); + // Even if an MSI group is enabled, we don't save it as such. During restoration, the PCI + // transport will make sure the correct config is set for the vectors and enable them + // accordingly. 
+ for (id, vector) in msix_group.irq_routes { + let new_vector = restored_group.irq_routes.get(&id).unwrap(); + assert_eq!(vector.gsi, new_vector.gsi); + assert!(!new_vector.enabled.load(Ordering::Acquire)); + } + } + + fn new_virtio_pci_device(vm: &Arc) -> Arc> { + let dummy = Arc::new(Mutex::new(DummyDevice::new())); + let msi_vectors = Arc::new(Vm::create_msix_group(vm.clone(), 0, 2).unwrap()); + Arc::new(Mutex::new( + VirtioPciDevice::new( + "dummy".to_string(), + vm.guest_memory().clone(), + dummy, + msi_vectors, + PciBdf::new(0, 0, 1, 0).into(), + true, + None, + ) + .unwrap(), + )) + } + + #[cfg(target_arch = "aarch64")] + #[test] + fn test_device_relocation_no_io_on_arm() { + let (_, vm) = setup_vm_with_memory(mib_to_bytes(128)); + let vm = Arc::new(vm); + let old_base = 0x42; + let new_base = 0x84; + let len = 0x1312000; + let virtio_dev = new_virtio_pci_device(&vm); + let mut virtio_dev_locked = virtio_dev.lock().unwrap(); + let pci_dev = virtio_dev_locked.deref_mut(); + + vm.move_bar(old_base, new_base, len, pci_dev, PciBarRegionType::IoRegion) + .unwrap_err(); + } + + #[test] + fn test_device_relocation_bad_ranges() { + let (_, vm) = setup_vm_with_memory(mib_to_bytes(128)); + let vm = Arc::new(vm); + let virtio_dev = new_virtio_pci_device(&vm); + let mut virtio_dev_locked = virtio_dev.lock().unwrap(); + let pci_dev = virtio_dev_locked.deref_mut(); + + // Old region would overflow + vm.move_bar(0, 0x12, u64::MAX, pci_dev, PciBarRegionType::IoRegion) + .unwrap_err(); + // New region would overflow + vm.move_bar(0x13, 0, u64::MAX, pci_dev, PciBarRegionType::IoRegion) + .unwrap_err(); + } + + #[test] + fn test_device_relocation_old_region_not_allocated() { + let (_, vm) = setup_vm_with_memory(mib_to_bytes(128)); + let vm = Arc::new(vm); + let virtio_dev = new_virtio_pci_device(&vm); + let mut virtio_dev_locked = virtio_dev.lock().unwrap(); + let pci_dev = virtio_dev_locked.deref_mut(); + + let err = vm + .move_bar( + 0x12, + 0x13, + 0x42, + pci_dev, + 
PciBarRegionType::Memory32BitRegion, + ) + .unwrap_err(); + assert_eq!(format!("{err}"), "pci: failed deallocating old MMIO range"); + } + + #[test] + fn test_device_relocation_new_region_allocated() { + let (_, vm) = setup_vm_with_memory(mib_to_bytes(128)); + let vm = Arc::new(vm); + let virtio_dev = new_virtio_pci_device(&vm); + let mut virtio_dev_locked = virtio_dev.lock().unwrap(); + let pci_dev = virtio_dev_locked.deref_mut(); + + // Allocate old range and add it to bus + let old_base = vm + .common + .resource_allocator + .allocate_32bit_mmio_memory(0x1000, 0x1000, AllocPolicy::FirstMatch) + .unwrap(); + vm.common + .mmio_bus + .insert(virtio_dev.clone(), 0x1000, 0x1000) + .unwrap(); + + // Also allocate new region. This should cause relocation to fail + let new_base = vm + .common + .resource_allocator + .allocate_32bit_mmio_memory(0x1000, 0x1000, AllocPolicy::FirstMatch) + .unwrap(); + + let err = vm + .move_bar( + old_base, + new_base, + 0x1000, + pci_dev, + PciBarRegionType::Memory32BitRegion, + ) + .unwrap_err(); + assert_eq!(format!("{err}"), "pci: failed allocating new MMIO range"); + } + + #[cfg(target_arch = "x86_64")] + #[test] + fn test_device_relocation_io_device() { + let (_, vm) = setup_vm_with_memory(mib_to_bytes(128)); + let vm = Arc::new(vm); + let virtio_dev = new_virtio_pci_device(&vm); + let mut virtio_dev_locked = virtio_dev.lock().unwrap(); + let pci_dev = virtio_dev_locked.deref_mut(); + + let err = vm + .move_bar(0x12, 0x13, 0x42000, pci_dev, PciBarRegionType::IoRegion) + .unwrap_err(); + assert_eq!(format!("{err}"), "bus_error: MissingAddressRange"); + + // If, instead, we add the device in the PIO bus, everything should work fine + vm.pio_bus + .insert(virtio_dev.clone(), 0x12, 0x42000) + .unwrap(); + + vm.move_bar(0x12, 0x13, 0x42000, pci_dev, PciBarRegionType::IoRegion) + .unwrap() + } + + #[test] + fn test_device_relocation_mmio_device() { + let (_, vm) = setup_vm_with_memory(mib_to_bytes(128)); + let vm = Arc::new(vm); + + let 
virtio_dev = new_virtio_pci_device(&vm); + let mut virtio_dev_locked = virtio_dev.lock().unwrap(); + let pci_dev = virtio_dev_locked.deref_mut(); + + let old_base = vm + .common + .resource_allocator + .allocate_64bit_mmio_memory(0x8000, 0x1000, AllocPolicy::FirstMatch) + .unwrap(); + + let err = vm + .move_bar( + old_base, + old_base + 0x8000, + 0x8000, + pci_dev, + PciBarRegionType::Memory64BitRegion, + ) + .unwrap_err(); + assert_eq!(format!("{err}"), "bus_error: MissingAddressRange"); + + // Need to reset the allocator here. Erroring out left it to a limbo state (old range is + // deallocated, new range is allocated). + vm.common + .resource_allocator + .mmio64_memory + .lock() + .unwrap() + .free(&RangeInclusive::new(old_base + 0x8000, old_base + 0x8000 + 0x8000 - 1).unwrap()) + .unwrap(); + vm.common + .resource_allocator + .allocate_64bit_mmio_memory(0x8000, 0x1000, AllocPolicy::FirstMatch) + .unwrap(); + + // If we add the device to the MMIO bus, everything should work fine + vm.common + .mmio_bus + .insert(virtio_dev.clone(), old_base, 0x8000) + .unwrap(); + + println!("old base: {old_base:#x}"); + vm.move_bar( + old_base, + old_base + 0x8000, + 0x8000, + pci_dev, + PciBarRegionType::Memory64BitRegion, + ) + .unwrap(); + } } diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py index a1f46fd89c2..168069bb2d2 100644 --- a/tests/framework/microvm.py +++ b/tests/framework/microvm.py @@ -481,6 +481,7 @@ def dimensions(self): "rootfs": self.rootfs_file.name, "vcpus": str(self.vcpus_count), "guest_memory": f"{self.mem_size_bytes / (1024 * 1024)}MB", + "pci": self.pci_enabled, } @property diff --git a/tests/integration_tests/performance/test_block_ab.py b/tests/integration_tests/performance/test_block_ab.py index dfd0728084a..7fe9216e559 100644 --- a/tests/integration_tests/performance/test_block_ab.py +++ b/tests/integration_tests/performance/test_block_ab.py @@ -168,6 +168,7 @@ def test_block_performance( fio_mode, fio_block_size, fio_engine, + 
pci_enabled, io_engine, metrics, results_dir, @@ -176,7 +177,7 @@ def test_block_performance( Execute block device emulation benchmarking scenarios. """ vm = microvm_factory.build(guest_kernel_acpi, rootfs, monitor_memory=False) - vm.spawn(log_level="Info", emit_metrics=True) + vm.spawn(log_level="Info", emit_metrics=True, pci=pci_enabled) vm.basic_config(vcpu_count=vcpus, mem_size_mib=GUEST_MEM_MIB) vm.add_net_iface() # Add a secondary block device for benchmark tests. diff --git a/tests/integration_tests/performance/test_boottime.py b/tests/integration_tests/performance/test_boottime.py index 7708451ec7f..4eb9a267475 100644 --- a/tests/integration_tests/performance/test_boottime.py +++ b/tests/integration_tests/performance/test_boottime.py @@ -95,12 +95,12 @@ def to_ms(v, unit): def launch_vm_with_boot_timer( - microvm_factory, guest_kernel_acpi, rootfs_rw, vcpu_count, mem_size_mib + microvm_factory, guest_kernel_acpi, rootfs_rw, vcpu_count, mem_size_mib, pci_enabled ): """Launches a microVM with guest-timer and returns the reported metrics for it""" vm = microvm_factory.build(guest_kernel_acpi, rootfs_rw) vm.jailer.extra_args.update({"boot-timer": None}) - vm.spawn() + vm.spawn(pci=pci_enabled) vm.basic_config( vcpu_count=vcpu_count, mem_size_mib=mem_size_mib, @@ -116,9 +116,11 @@ def launch_vm_with_boot_timer( return (vm, boot_time_us, cpu_boot_time_us) -def test_boot_timer(microvm_factory, guest_kernel_acpi, rootfs): +def test_boot_timer(microvm_factory, guest_kernel_acpi, rootfs, pci_enabled): """Tests that the boot timer device works""" - launch_vm_with_boot_timer(microvm_factory, guest_kernel_acpi, rootfs, 1, 128) + launch_vm_with_boot_timer( + microvm_factory, guest_kernel_acpi, rootfs, 1, 128, pci_enabled + ) @pytest.mark.parametrize( @@ -127,13 +129,24 @@ def test_boot_timer(microvm_factory, guest_kernel_acpi, rootfs): ) @pytest.mark.nonci def test_boottime( - microvm_factory, guest_kernel_acpi, rootfs_rw, vcpu_count, mem_size_mib, metrics + 
microvm_factory, + guest_kernel_acpi, + rootfs_rw, + vcpu_count, + mem_size_mib, + pci_enabled, + metrics, ): """Test boot time with different guest configurations""" for i in range(10): vm, boot_time_us, cpu_boot_time_us = launch_vm_with_boot_timer( - microvm_factory, guest_kernel_acpi, rootfs_rw, vcpu_count, mem_size_mib + microvm_factory, + guest_kernel_acpi, + rootfs_rw, + vcpu_count, + mem_size_mib, + pci_enabled, ) if i == 0: diff --git a/tests/integration_tests/performance/test_network_ab.py b/tests/integration_tests/performance/test_network_ab.py index 3355d54c2bc..4c2deba0041 100644 --- a/tests/integration_tests/performance/test_network_ab.py +++ b/tests/integration_tests/performance/test_network_ab.py @@ -38,7 +38,7 @@ def consume_ping_output(ping_putput, request_per_round): @pytest.fixture -def network_microvm(request, microvm_factory, guest_kernel_acpi, rootfs): +def network_microvm(request, microvm_factory, guest_kernel_acpi, rootfs, pci_enabled): """Creates a microvm with the networking setup used by the performance tests in this file. This fixture receives its vcpu count via indirect parameterization""" @@ -46,7 +46,7 @@ def network_microvm(request, microvm_factory, guest_kernel_acpi, rootfs): guest_vcpus = request.param vm = microvm_factory.build(guest_kernel_acpi, rootfs, monitor_memory=False) - vm.spawn(log_level="Info", emit_metrics=True) + vm.spawn(log_level="Info", emit_metrics=True, pci=pci_enabled) vm.basic_config(vcpu_count=guest_vcpus, mem_size_mib=guest_mem_mib) vm.add_net_iface() vm.start()