From 17d9ae464de524293810c412a8e6ae7e213c2365 Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Wed, 26 Mar 2025 13:01:56 -0600 Subject: [PATCH 01/18] Fix docstring for BlueprintZoneImageSource --- nexus/types/src/deployment.rs | 2 +- openapi/nexus-internal.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 769f74a46f7..16502409892 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -960,7 +960,7 @@ impl fmt::Display for BlueprintZoneDisposition { } } -/// Where a blueprint's image source is located. +/// Where the zone's image source is located. /// /// This is the blueprint version of [`OmicronZoneImageSource`]. #[derive( diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index a9c6d4fa074..6ce81159b8a 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -2786,7 +2786,7 @@ ] }, "BlueprintZoneImageSource": { - "description": "Where a blueprint's image source is located.\n\nThis is the blueprint version of [`OmicronZoneImageSource`].", + "description": "Where the zone's image source is located.\n\nThis is the blueprint version of [`OmicronZoneImageSource`].", "oneOf": [ { "description": "This zone's image source is whatever happens to be on the sled's \"install\" dataset.\n\nThis is whatever was put in place at the factory or by the latest MUPdate. The image used here can vary by sled and even over time (if the sled gets MUPdated again).\n\nHistorically, this was the only source for zone images. In an system with automated control-plane-driven update we expect to only use this variant in emergencies where the system had to be recovered via MUPdate.", From a87f75524b69136b8d15ce81f8007786a40abe7a Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Wed, 26 Mar 2025 13:03:34 -0600 Subject: [PATCH 02/18] Plumb target release TUF repo through planner --- Cargo.lock | 1 + nexus/db-queries/src/db/datastore/update.rs | 36 +++++++++++++++++++- nexus/reconfigurator/execution/src/dns.rs | 1 + nexus/reconfigurator/planning/Cargo.toml | 1 + nexus/reconfigurator/planning/src/system.rs | 4 +++ nexus/reconfigurator/preparation/src/lib.rs | 19 ++++++++++- nexus/src/app/update.rs | 2 +- nexus/src/external_api/http_entrypoints.rs | 2 +- nexus/types/src/deployment/planning_input.rs | 13 +++++++ 9 files changed, 75 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a14b71ecddc..6b5fb5eff54 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6441,6 +6441,7 @@ dependencies = [ "oxnet", "proptest", "rand 0.8.5", + "semver 1.0.25", "sled-agent-client", "slog", "slog-error-chain", diff --git a/nexus/db-queries/src/db/datastore/update.rs b/nexus/db-queries/src/db/datastore/update.rs index cd0127cf979..6a9b62f2ef0 100644 --- a/nexus/db-queries/src/db/datastore/update.rs +++ b/nexus/db-queries/src/db/datastore/update.rs @@ -22,6 +22,7 @@ use omicron_common::api::external::{ self, CreateResult, DataPageParams, Generation, ListResultVec, LookupResult, LookupType, ResourceType, TufRepoInsertStatus, }; +use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::TufRepoKind; use omicron_uuid_kinds::TypedUuid; use swrite::{SWrite, swrite}; @@ -106,8 +107,41 @@ impl DataStore { }) } + /// Returns a TUF repo description. 
+ pub async fn update_tuf_repo_get_by_id( + &self, + opctx: &OpContext, + repo_id: TypedUuid, + ) -> LookupResult { + opctx.authorize(authz::Action::Read, &authz::FLEET).await?; + + use nexus_db_schema::schema::tuf_repo::dsl; + + let conn = self.pool_connection_authorized(opctx).await?; + let repo_id = repo_id.into_untyped_uuid(); + let repo = dsl::tuf_repo + .filter(dsl::id.eq(repo_id)) + .select(TufRepo::as_select()) + .first_async::(&*conn) + .await + .map_err(|e| { + public_error_from_diesel( + e, + ErrorHandler::NotFoundByLookup( + ResourceType::TufRepo, + LookupType::ById(repo_id), + ), + ) + })?; + + let artifacts = artifacts_for_repo(repo.id.into(), &conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + Ok(TufRepoDescription { repo, artifacts }) + } + /// Returns the TUF repo description corresponding to this system version. - pub async fn update_tuf_repo_get( + pub async fn update_tuf_repo_get_by_version( &self, opctx: &OpContext, system_version: SemverVersion, diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index ad93ee5a1bf..2c77e99145e 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -1424,6 +1424,7 @@ mod test { target_crucible_pantry_zone_count: CRUCIBLE_PANTRY_REDUNDANCY, clickhouse_policy: None, oximeter_read_policy: OximeterReadPolicy::new(1), + tuf_repo: None, log, } .build() diff --git a/nexus/reconfigurator/planning/Cargo.toml b/nexus/reconfigurator/planning/Cargo.toml index 695e65eb584..465c838741d 100644 --- a/nexus/reconfigurator/planning/Cargo.toml +++ b/nexus/reconfigurator/planning/Cargo.toml @@ -29,6 +29,7 @@ omicron-uuid-kinds.workspace = true once_cell.workspace = true oxnet.workspace = true rand.workspace = true +semver.workspace = true sled-agent-client.workspace = true slog.workspace = true slog-error-chain.workspace = true diff --git a/nexus/reconfigurator/planning/src/system.rs b/nexus/reconfigurator/planning/src/system.rs index ad389795f50..f4b947ecc59 100644 --- a/nexus/reconfigurator/planning/src/system.rs +++ b/nexus/reconfigurator/planning/src/system.rs @@ -44,6 +44,7 @@ use omicron_common::address::SLED_PREFIX; use omicron_common::address::get_sled_address; use omicron_common::api::external::ByteCount; use omicron_common::api::external::Generation; +use omicron_common::api::external::TufRepoDescription; use omicron_common::disk::DiskIdentity; use omicron_common::disk::DiskVariant; use omicron_common::policy::INTERNAL_DNS_REDUNDANCY; @@ -96,6 +97,7 @@ pub struct SystemDescription { external_dns_version: Generation, clickhouse_policy: Option, oximeter_read_policy: OximeterReadPolicy, + tuf_repo: Option, } impl SystemDescription { @@ -175,6 +177,7 @@ impl SystemDescription { external_dns_version: Generation::new(), clickhouse_policy: None, oximeter_read_policy: OximeterReadPolicy::new(1), + tuf_repo: None, } } @@ -427,6 +430,7 @@ impl SystemDescription { .target_crucible_pantry_zone_count, clickhouse_policy: self.clickhouse_policy.clone(), oximeter_read_policy: self.oximeter_read_policy.clone(), + tuf_repo: self.tuf_repo.clone(), }; let mut builder = PlanningInputBuilder::new( policy, diff --git a/nexus/reconfigurator/preparation/src/lib.rs b/nexus/reconfigurator/preparation/src/lib.rs index 497d0353124..48da2704e69 100644 --- a/nexus/reconfigurator/preparation/src/lib.rs +++ b/nexus/reconfigurator/preparation/src/lib.rs @@ -39,6 +39,7 @@ use omicron_common::address::SLED_PREFIX; use 
omicron_common::api::external::Error; use omicron_common::api::external::InternalContext; use omicron_common::api::external::LookupType; +use omicron_common::api::external::TufRepoDescription; use omicron_common::disk::DiskIdentity; use omicron_common::policy::BOUNDARY_NTP_REDUNDANCY; use omicron_common::policy::COCKROACHDB_REDUNDANCY; @@ -78,6 +79,7 @@ pub struct PlanningInputFromDb<'a> { pub cockroachdb_settings: &'a CockroachDbSettings, pub clickhouse_policy: Option, pub oximeter_read_policy: OximeterReadPolicy, + pub tuf_repo: Option, pub log: &'a Logger, } @@ -140,11 +142,24 @@ impl PlanningInputFromDb<'_> { .cockroachdb_settings(opctx) .await .internal_context("fetching cockroachdb settings")?; - let clickhouse_policy = datastore .clickhouse_policy_get_latest(opctx) .await .internal_context("fetching clickhouse policy")?; + let target_release = datastore + .target_release_get_current(opctx) + .await + .internal_context("fetching current target release")?; + let tuf_repo = match target_release.tuf_repo_id { + None => None, + Some(repo_id) => Some( + datastore + .update_tuf_repo_get_by_id(opctx, repo_id.into()) + .await + .internal_context("fetching target release repo")? + .into_external(), + ), + }; let oximeter_read_policy = datastore .oximeter_read_policy_get_latest(opctx) @@ -171,6 +186,7 @@ impl PlanningInputFromDb<'_> { cockroachdb_settings: &cockroachdb_settings, clickhouse_policy, oximeter_read_policy, + tuf_repo, } .build() .internal_context("assembling planning_input")?; @@ -194,6 +210,7 @@ impl PlanningInputFromDb<'_> { .target_crucible_pantry_zone_count, clickhouse_policy: self.clickhouse_policy.clone(), oximeter_read_policy: self.oximeter_read_policy.clone(), + tuf_repo: self.tuf_repo.clone(), }; let mut builder = PlanningInputBuilder::new( policy, diff --git a/nexus/src/app/update.rs b/nexus/src/app/update.rs index d0e1d05e5d8..5ba1b9e0a40 100644 --- a/nexus/src/app/update.rs +++ b/nexus/src/app/update.rs @@ -88,7 +88,7 @@ impl super::Nexus { let tuf_repo_description = self .db_datastore - .update_tuf_repo_get(opctx, system_version.into()) + .update_tuf_repo_get_by_version(opctx, system_version.into()) .await .map_err(HttpError::from)?; diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index 3b6010ff4d0..ed0089f4c8d 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -6575,7 +6575,7 @@ impl NexusExternalApi for NexusExternalApiImpl { // Fetch the TUF repo metadata and update the target release. let tuf_repo_id = nexus .datastore() - .update_tuf_repo_get(&opctx, system_version.into()) + .update_tuf_repo_get_by_version(&opctx, system_version.into()) .await? 
.repo .id; diff --git a/nexus/types/src/deployment/planning_input.rs b/nexus/types/src/deployment/planning_input.rs index d6faffd6961..e4b300b39bb 100644 --- a/nexus/types/src/deployment/planning_input.rs +++ b/nexus/types/src/deployment/planning_input.rs @@ -23,6 +23,7 @@ use omicron_common::address::IpRange; use omicron_common::address::Ipv6Subnet; use omicron_common::address::SLED_PREFIX; use omicron_common::api::external::Generation; +use omicron_common::api::external::TufRepoDescription; use omicron_common::api::internal::shared::SourceNatConfigError; use omicron_common::disk::DiskIdentity; use omicron_common::policy::SINGLE_NODE_CLICKHOUSE_REDUNDANCY; @@ -152,6 +153,10 @@ impl PlanningInput { .unwrap_or(0) } + pub fn tuf_repo(&self) -> Option<&TufRepoDescription> { + self.policy.tuf_repo.as_ref() + } + pub fn service_ip_pool_ranges(&self) -> &[IpRange] { &self.policy.service_ip_pool_ranges } @@ -918,6 +923,13 @@ pub struct Policy { /// Eventually we will only allow reads from a cluster and this policy will /// no longer exist. pub oximeter_read_policy: OximeterReadPolicy, + + /// Desired system software release repository. + /// + /// New zones will use artifacts in this repo as their image sources, + /// and at most one extant zone may be modified to use it or replaced + /// with one that does. + pub tuf_repo: Option, } #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] @@ -1087,6 +1099,7 @@ impl PlanningInputBuilder { target_crucible_pantry_zone_count: 0, clickhouse_policy: None, oximeter_read_policy: OximeterReadPolicy::new(1), + tuf_repo: None, }, internal_dns_version: Generation::new(), external_dns_version: Generation::new(), From 851128a72f8f11c1e17e959c813beb8c252449d9 Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Wed, 26 Mar 2025 13:04:01 -0600 Subject: [PATCH 03/18] Plan zone updates from TUF repo --- .../output/cmd-expunge-newly-added-stdout | 1 + nexus-sled-agent-shared/src/inventory.rs | 19 + .../planning/src/blueprint_builder/builder.rs | 65 +- nexus/reconfigurator/planning/src/planner.rs | 562 +++++++++++++++++- 4 files changed, 632 insertions(+), 15 deletions(-) diff --git a/dev-tools/reconfigurator-cli/tests/output/cmd-expunge-newly-added-stdout b/dev-tools/reconfigurator-cli/tests/output/cmd-expunge-newly-added-stdout index 6c0d2f67e32..c1c471f0476 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmd-expunge-newly-added-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmd-expunge-newly-added-stdout @@ -628,6 +628,7 @@ INFO sufficient InternalDns zones exist in plan, desired_count: 3, current_count INFO added zone to sled, sled_id: a88790de-5962-4871-8686-61c1fd5b7094, kind: ExternalDns INFO sufficient Nexus zones exist in plan, desired_count: 3, current_count: 3 INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0 +INFO all zones up-to-date INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify generated blueprint 9c998c1d-1a7b-440a-ae0c-40f781dea6e2 based on parent blueprint 366b0b68-d80e-4bc1-abd3-dc69837847e0 diff --git a/nexus-sled-agent-shared/src/inventory.rs b/nexus-sled-agent-shared/src/inventory.rs index 5b8fdb39133..29c5924ed10 100644 --- a/nexus-sled-agent-shared/src/inventory.rs +++ b/nexus-sled-agent-shared/src/inventory.rs @@ -617,6 +617,25 @@ impl ZoneKind { ZoneKind::Oximeter => "oximeter", } } + + /// Return a string used as an artifact name for control-plane zones. + /// This is **not guaranteed** to be stable. 
+ pub fn artifact_name(self) -> &'static str { + match self { + ZoneKind::BoundaryNtp => "ntp", + ZoneKind::Clickhouse => "clickhouse", + ZoneKind::ClickhouseKeeper => "clickhouse_keeper", + ZoneKind::ClickhouseServer => "clickhouse", + ZoneKind::CockroachDb => "cockroachdb", + ZoneKind::Crucible => "crucible-zone", + ZoneKind::CruciblePantry => "crucible-pantry-zone", + ZoneKind::ExternalDns => "external-dns", + ZoneKind::InternalDns => "internal-dns", + ZoneKind::InternalNtp => "ntp", + ZoneKind::Nexus => "nexus", + ZoneKind::Oximeter => "oximeter", + } + } } /// Where Sled Agent should get the image for a zone. diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs index 864b0511845..46989d3f75a 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs @@ -33,6 +33,7 @@ use nexus_types::deployment::BlueprintSledConfig; use nexus_types::deployment::BlueprintZoneConfig; use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneImageSource; +use nexus_types::deployment::BlueprintZoneImageVersion; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::ClickhouseClusterConfig; use nexus_types::deployment::CockroachDbPreserveDowngrade; @@ -82,6 +83,7 @@ use std::net::Ipv6Addr; use std::net::SocketAddr; use std::net::SocketAddrV6; use thiserror::Error; +use tufaceous_artifact::KnownArtifactKind; use super::ClickhouseZonesThatShouldBeRunning; use super::clickhouse::ClickhouseAllocator; @@ -1159,13 +1161,14 @@ impl<'a> BlueprintBuilder<'a> { gz_address: dns_subnet.gz_address(), gz_address_index, }); + let image_source = self.zone_image_source(zone_type.kind()); let zone = BlueprintZoneConfig { disposition: BlueprintZoneDisposition::InService, id: self.rng.sled_rng(sled_id).next_zone(), filesystem_pool: zpool, zone_type, - image_source: BlueprintZoneImageSource::InstallDataset, + image_source, }; self.sled_add_zone(sled_id, zone) @@ -1211,13 +1214,14 @@ impl<'a> BlueprintBuilder<'a> { dns_address, nic, }); + let image_source = self.zone_image_source(zone_type.kind()); let zone = BlueprintZoneConfig { disposition: BlueprintZoneDisposition::InService, id, filesystem_pool: pool_name, zone_type, - image_source: BlueprintZoneImageSource::InstallDataset, + image_source, }; self.sled_add_zone(sled_id, zone) } @@ -1250,13 +1254,14 @@ impl<'a> BlueprintBuilder<'a> { }); let filesystem_pool = self.sled_select_zpool(sled_id, zone_type.kind())?; + let image_source = self.zone_image_source(zone_type.kind()); let zone = BlueprintZoneConfig { disposition: BlueprintZoneDisposition::InService, id: self.rng.sled_rng(sled_id).next_zone(), filesystem_pool, zone_type, - image_source: BlueprintZoneImageSource::InstallDataset, + image_source, }; self.sled_add_zone(sled_id, zone)?; @@ -1402,13 +1407,14 @@ impl<'a> BlueprintBuilder<'a> { }); let filesystem_pool = self.sled_select_zpool(sled_id, zone_type.kind())?; + let image_source = self.zone_image_source(zone_type.kind()); let zone = BlueprintZoneConfig { disposition: BlueprintZoneDisposition::InService, id: nexus_id, filesystem_pool, zone_type, - image_source: BlueprintZoneImageSource::InstallDataset, + image_source, }; self.sled_add_zone(sled_id, zone) } @@ -1427,13 +1433,14 @@ impl<'a> BlueprintBuilder<'a> { }); let filesystem_pool = self.sled_select_zpool(sled_id, zone_type.kind())?; + let image_source = 
self.zone_image_source(zone_type.kind()); let zone = BlueprintZoneConfig { disposition: BlueprintZoneDisposition::InService, id: oximeter_id, filesystem_pool, zone_type, - image_source: BlueprintZoneImageSource::InstallDataset, + image_source, }; self.sled_add_zone(sled_id, zone) } @@ -1451,13 +1458,14 @@ impl<'a> BlueprintBuilder<'a> { ); let filesystem_pool = self.sled_select_zpool(sled_id, zone_type.kind())?; + let image_source = self.zone_image_source(zone_type.kind()); let zone = BlueprintZoneConfig { disposition: BlueprintZoneDisposition::InService, id: pantry_id, filesystem_pool, zone_type, - image_source: BlueprintZoneImageSource::InstallDataset, + image_source, }; self.sled_add_zone(sled_id, zone) } @@ -1485,13 +1493,14 @@ impl<'a> BlueprintBuilder<'a> { dataset: OmicronZoneDataset { pool_name: pool_name.clone() }, }); let filesystem_pool = pool_name; + let image_source = self.zone_image_source(zone_type.kind()); let zone = BlueprintZoneConfig { disposition: BlueprintZoneDisposition::InService, id: zone_id, filesystem_pool, zone_type, - image_source: BlueprintZoneImageSource::InstallDataset, + image_source, }; self.sled_add_zone(sled_id, zone) } @@ -1511,13 +1520,14 @@ impl<'a> BlueprintBuilder<'a> { address, dataset: OmicronZoneDataset { pool_name: pool_name.clone() }, }); + let image_source = self.zone_image_source(zone_type.kind()); let zone = BlueprintZoneConfig { disposition: BlueprintZoneDisposition::InService, id, filesystem_pool: pool_name, zone_type, - image_source: BlueprintZoneImageSource::InstallDataset, + image_source, }; self.sled_add_zone(sled_id, zone) } @@ -1539,13 +1549,14 @@ impl<'a> BlueprintBuilder<'a> { }, ); let filesystem_pool = pool_name; + let image_source = self.zone_image_source(zone_type.kind()); let zone = BlueprintZoneConfig { disposition: BlueprintZoneDisposition::InService, id: zone_id, filesystem_pool, zone_type, - image_source: BlueprintZoneImageSource::InstallDataset, + image_source, }; self.sled_add_zone(sled_id, zone) } @@ -1567,13 +1578,14 @@ impl<'a> BlueprintBuilder<'a> { }, ); let filesystem_pool = pool_name; + let image_source = self.zone_image_source(zone_type.kind()); let zone = BlueprintZoneConfig { disposition: BlueprintZoneDisposition::InService, id: zone_id, filesystem_pool, zone_type, - image_source: BlueprintZoneImageSource::InstallDataset, + image_source, }; self.sled_add_zone(sled_id, zone) } @@ -1693,6 +1705,7 @@ impl<'a> BlueprintBuilder<'a> { }); let filesystem_pool = self.sled_select_zpool(sled_id, zone_type.kind())?; + let image_source = self.zone_image_source(zone_type.kind()); self.sled_add_zone( sled_id, @@ -1701,7 +1714,7 @@ impl<'a> BlueprintBuilder<'a> { id: new_zone_id, filesystem_pool, zone_type, - image_source: BlueprintZoneImageSource::InstallDataset, + image_source, }, ) } @@ -1889,6 +1902,36 @@ impl<'a> BlueprintBuilder<'a> { ) { self.pending_mgs_updates.remove(baseboard_id); } + + /// Try to find an artifact in the release repo that contains an image + /// for a zone of the given kind. Defaults to the install dataset. 
+ pub(crate) fn zone_image_source( + &self, + zone_kind: ZoneKind, + ) -> BlueprintZoneImageSource { + self.input + .tuf_repo() + .and_then(|repo| { + repo.artifacts + .iter() + .find(|artifact| { + artifact + .id + .kind + .to_known() + .map(|kind| matches!(kind, KnownArtifactKind::Zone)) + .unwrap_or(false) + && artifact.id.name == zone_kind.artifact_name() + }) + .map(|artifact| BlueprintZoneImageSource::Artifact { + version: BlueprintZoneImageVersion::Available { + version: artifact.id.version.clone(), + }, + hash: artifact.hash, + }) + }) + .unwrap_or(BlueprintZoneImageSource::InstallDataset) + } } // Helper to validate that the system hasn't gone off the rails. There should diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 1ee0f8ddda0..58b7d8c8857 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -18,6 +18,7 @@ use nexus_sled_agent_shared::inventory::OmicronZoneType; use nexus_sled_agent_shared::inventory::ZoneKind; use nexus_types::deployment::Blueprint; use nexus_types::deployment::BlueprintPhysicalDiskDisposition; +use nexus_types::deployment::BlueprintZoneConfig; use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::CockroachDbClusterVersion; use nexus_types::deployment::CockroachDbPreserveDowngrade; @@ -36,6 +37,8 @@ use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::SledUuid; use slog::error; use slog::{Logger, info, warn}; +use std::cmp::Ordering; +use std::collections::BTreeMap; use std::collections::BTreeSet; use std::str::FromStr; @@ -110,14 +113,11 @@ impl<'a> Planner<'a> { } fn do_plan(&mut self) -> Result<(), Error> { - // We perform planning in two loops: the first one turns expunged sleds - // into expunged zones, and the second one adds services. - self.do_plan_expunge()?; self.do_plan_add()?; self.do_plan_decommission()?; + self.do_plan_zone_image_source()?; self.do_plan_cockroachdb_settings(); - Ok(()) } @@ -877,6 +877,185 @@ impl<'a> Planner<'a> { Ok(()) } + /// Update at most one existing zone to use a new image source. + fn do_plan_zone_image_source(&mut self) -> Result<(), Error> { + // We are only interested in non-decommissioned sleds. + let sleds = self + .input + .all_sleds(SledFilter::Commissioned) + .map(|(id, _details)| id) + .collect::>(); + + // Wait for all current zones to become up-to-date. + let inventory_zones = self + .inventory + .all_omicron_zones() + .map(|z| (z.id, z)) + .collect::>(); + for sled_id in sleds.iter().cloned() { + if self + .blueprint + .current_sled_zones( + sled_id, + BlueprintZoneDisposition::is_in_service, + ) + .any(|zone| { + inventory_zones + .get(&zone.id) + .map(|z| { + z.image_source != zone.image_source.clone().into() + }) + .unwrap_or(false) + }) + { + info!( + self.log, "zones not yet up-to-date"; + "sled_id" => %sled_id, + ); + return Ok(()); + } + } + + // Choose among the out-of-date zones one with the fewest dependencies. 
+ let mut out_of_date_zones = sleds + .into_iter() + .flat_map(|sled_id| { + self.blueprint + .current_sled_zones( + sled_id, + BlueprintZoneDisposition::is_in_service, + ) + .filter_map(|zone| { + (zone.image_source + != self + .blueprint + .zone_image_source(zone.zone_type.kind())) + .then(|| (sled_id, zone.clone())) + }) + .collect::>() + }) + .collect::>(); + + sort_zones_by_deps(&mut out_of_date_zones); + if let Some((sled_id, zone)) = out_of_date_zones.first() { + return self.update_zone_image_source(*sled_id, zone); + } + + info!(self.log, "all zones up-to-date"); + Ok(()) + } + + /// Update a zone to use a new image source, either in-place or by + /// expunging & replacing it (depending on the kind of zone). + fn update_zone_image_source( + &mut self, + sled_id: SledUuid, + zone: &BlueprintZoneConfig, + ) -> Result<(), Error> { + let zone_kind = zone.zone_type.kind(); + let image_source = self.blueprint.zone_image_source(zone_kind); + match zone_kind { + ZoneKind::Crucible + | ZoneKind::CockroachDb + | ZoneKind::Clickhouse => { + info!( + self.log, "updating zone image source in-place"; + "sled_id" => %sled_id, + "zone_id" => %zone.id, + "image_source" => %image_source, + ); + self.blueprint.sled_set_zone_source( + sled_id, + zone.id, + image_source, + )?; + } + ZoneKind::BoundaryNtp => { + info!( + self.log, "replacing out-of-date BoundaryNtp zone"; + "sled_id" => %sled_id, + "zone_id" => %zone.id, + ); + self.blueprint.sled_expunge_zone(sled_id, zone.id)?; + self.blueprint + .sled_promote_internal_ntp_to_boundary_ntp(sled_id)?; + } + ZoneKind::ClickhouseKeeper => { + info!( + self.log, "replacing out-of-date ClickhouseKeeper zone"; + "sled_id" => %sled_id, + "zone_id" => %zone.id, + ); + self.blueprint.sled_expunge_zone(sled_id, zone.id)?; + self.blueprint.sled_add_zone_clickhouse_keeper(sled_id)?; + } + ZoneKind::ClickhouseServer => { + info!( + self.log, "replacing out-of-date ClickhouseServer zone"; + "sled_id" => %sled_id, + "zone_id" => %zone.id, + ); + self.blueprint.sled_expunge_zone(sled_id, zone.id)?; + self.blueprint.sled_add_zone_clickhouse_server(sled_id)?; + } + ZoneKind::CruciblePantry => { + info!( + self.log, "replacing out-of-date CruciblePantry zone"; + "sled_id" => %sled_id, + "zone_id" => %zone.id, + ); + self.blueprint.sled_expunge_zone(sled_id, zone.id)?; + self.blueprint.sled_add_zone_crucible_pantry(sled_id)?; + } + ZoneKind::ExternalDns => { + info!( + self.log, "replacing out-of-date ExternalDns zone"; + "sled_id" => %sled_id, + "zone_id" => %zone.id, + ); + self.blueprint.sled_expunge_zone(sled_id, zone.id)?; + self.blueprint.sled_add_zone_external_dns(sled_id)?; + } + ZoneKind::InternalDns => { + info!( + self.log, "replacing out-of-date InternalDns zone"; + "sled_id" => %sled_id, + "zone_id" => %zone.id, + ); + self.blueprint.sled_expunge_zone(sled_id, zone.id)?; + self.blueprint.sled_add_zone_internal_dns(sled_id)?; + } + ZoneKind::InternalNtp => { + info!( + self.log, "replacing out-of-date InternalNtp zone"; + "sled_id" => %sled_id, + "zone_id" => %zone.id, + ); + self.blueprint.sled_expunge_zone(sled_id, zone.id)?; + self.blueprint.sled_ensure_zone_ntp(sled_id)?; + } + ZoneKind::Nexus => { + info!( + self.log, "replacing out-of-date Nexus zone"; + "sled_id" => %sled_id, + "zone_id" => %zone.id, + ); + self.blueprint.sled_expunge_zone(sled_id, zone.id)?; + self.blueprint.sled_add_zone_nexus(sled_id)?; + } + ZoneKind::Oximeter => { + info!( + self.log, "replacing out-of-date Oximeter zone"; + "sled_id" => %sled_id, + "zone_id" => %zone.id, + ); + 
self.blueprint.sled_expunge_zone(sled_id, zone.id)?; + self.blueprint.sled_add_zone_oximeter(sled_id)?; + } + } + Ok(()) + } + fn do_plan_cockroachdb_settings(&mut self) { // Figure out what we should set the CockroachDB "preserve downgrade // option" setting to based on the planning input. @@ -972,6 +1151,20 @@ impl<'a> Planner<'a> { } } +/// Sort in place a vector of zone configurations by API dependencies +/// (RFD 565 §6). For now, we encode a simplified dependency graph manually; +/// down the road, we should programmatically import it from `ls-apis`. +fn sort_zones_by_deps(zones: &mut Vec<(SledUuid, BlueprintZoneConfig)>) { + zones.sort_by(|a, b| match (a.1.zone_type.kind(), b.1.zone_type.kind()) { + (s, t) if s == t => Ordering::Equal, + (ZoneKind::CruciblePantry, _) => Ordering::Less, + (_, ZoneKind::CruciblePantry) => Ordering::Greater, + (_, ZoneKind::Nexus) => Ordering::Less, + (ZoneKind::Nexus, _) => Ordering::Greater, + (_, _) => Ordering::Equal, + }) +} + /// The reason a sled's zones need to be expunged. /// /// This is used only for introspection and logging -- it's not part of the @@ -1000,6 +1193,8 @@ pub(crate) mod test { use nexus_types::deployment::BlueprintDiffSummary; use nexus_types::deployment::BlueprintPhysicalDiskDisposition; use nexus_types::deployment::BlueprintZoneDisposition; + use nexus_types::deployment::BlueprintZoneImageSource; + use nexus_types::deployment::BlueprintZoneImageVersion; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::ClickhouseMode; use nexus_types::deployment::ClickhousePolicy; @@ -1010,16 +1205,25 @@ pub(crate) mod test { use nexus_types::external_api::views::SledProvisionPolicy; use nexus_types::external_api::views::SledState; use omicron_common::api::external::Generation; + use omicron_common::api::external::TufArtifactMeta; + use omicron_common::api::external::TufRepoDescription; + use omicron_common::api::external::TufRepoMeta; use omicron_common::disk::DatasetKind; use omicron_common::disk::DiskIdentity; use omicron_common::policy::CRUCIBLE_PANTRY_REDUNDANCY; + use omicron_common::update::ArtifactId; use omicron_test_utils::dev::test_setup_log; use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::ZpoolUuid; + use semver::Version; use slog_error_chain::InlineErrorChain; use std::collections::BTreeMap; use std::collections::HashMap; use std::net::IpAddr; + use tufaceous_artifact::ArtifactHash; + use tufaceous_artifact::ArtifactKind; + use tufaceous_artifact::ArtifactVersion; + use tufaceous_artifact::KnownArtifactKind; use typed_rng::TypedUuidRng; // Generate a ClickhousePolicy ignoring fields we don't care about for @@ -4337,4 +4541,354 @@ pub(crate) mod test { logctx.cleanup_successful(); } + + #[test] + fn test_sort_zones_by_deps() { + static TEST_NAME: &str = "sort_zones_by_deps"; + let logctx = test_setup_log(TEST_NAME); + let log = logctx.log.clone(); + + // Collect zone configs from our example system. + let (_collection, _input, blueprint) = example(&log, TEST_NAME); + let mut zones = blueprint + .all_omicron_zones(BlueprintZoneDisposition::any) + .map(|(sled_id, z)| (sled_id, z.clone())) + .collect::>(); + + // Sort them and verify the ordering constraints. 
+ sort_zones_by_deps(&mut zones); + let mut pantry = false; + let mut nexus = false; + for kind in zones.iter().map(|(_, z)| z.zone_type.kind()) { + match kind { + ZoneKind::CruciblePantry => { + assert!(!nexus); + pantry = true; + } + ZoneKind::Nexus => { + assert!(pantry); + nexus = true; + } + _ => { + assert!(pantry); + assert!(!nexus); + } + } + } + + logctx.cleanup_successful(); + } + + #[test] + fn test_plan_tuf_repo() { + static TEST_NAME: &str = "plan_tuf_repo"; + let logctx = test_setup_log(TEST_NAME); + let log = logctx.log.clone(); + + // Use our example system. + let mut rng = SimRngState::from_seed(TEST_NAME); + let (mut example, blueprint1) = ExampleSystemBuilder::new_with_rng( + &logctx.log, + rng.next_system_rng(), + ) + .build(); + let input = example.input; + let collection = example.collection; + verify_blueprint(&blueprint1); + + // We'll be manually updating our inventory collection's zones + // from blueprints. + let mut update_collection_from_blueprint = + |blueprint: &Blueprint| -> Collection { + for (&sled_id, config) in blueprint.sleds.iter() { + example + .system + .sled_set_omicron_zones( + sled_id, + config + .clone() + .into_in_service_sled_config() + .zones_config, + ) + .expect("can't set omicron zones for sled"); + } + example.system.to_collection_builder().unwrap().build() + }; + + // We should start with no specified TUF repo and nothing to do. + assert!(input.tuf_repo().is_none()); + assert_planning_makes_no_changes( + &logctx.log, + &blueprint1, + &input, + &collection, + TEST_NAME, + ); + + // All zones should be sourced from the install dataset by default. + assert!( + blueprint1 + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .all(|(_, z)| matches!( + z.image_source, + BlueprintZoneImageSource::InstallDataset + )) + ); + + // Manually specify a trivial TUF repo. + let mut input_builder = input.into_builder(); + input_builder.policy_mut().tuf_repo = Some(TufRepoDescription { + repo: TufRepoMeta { + hash: ArtifactHash([0; 32]), + targets_role_version: 0, + valid_until: Utc::now(), + system_version: Version::new(0, 0, 0), + file_name: String::from(""), + }, + artifacts: vec![], + }); + let input = input_builder.build(); + let blueprint2 = Planner::new_based_on( + log.clone(), + &blueprint1, + &input, + "test_blueprint2", + &collection, + ) + .expect("can't create planner") + .with_rng(PlannerRng::from_seed((TEST_NAME, "bp2"))) + .plan() + .expect("plan for trivial TUF repo"); + + // All zones should still be sourced from the install dataset. + assert!( + blueprint1 + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .all(|(_, z)| matches!( + z.image_source, + BlueprintZoneImageSource::InstallDataset + )) + ); + + // Manually specify a TUF repo with some fake zone images. + // Only the name and kind of the artifacts matter. 
+ let mut input_builder = input.into_builder(); + let version = ArtifactVersion::new_static("1.0.0-freeform") + .expect("can't parse artifact version"); + let fake_hash = ArtifactHash([0; 32]); + let image_source = BlueprintZoneImageSource::Artifact { + version: BlueprintZoneImageVersion::Available { + version: version.clone(), + }, + hash: fake_hash, + }; + let artifacts = vec![ + TufArtifactMeta { + id: ArtifactId { + name: String::from("crucible-pantry-zone"), + version: version.clone(), + kind: ArtifactKind::from_known(KnownArtifactKind::Zone), + }, + hash: fake_hash, + size: 0, + }, + TufArtifactMeta { + id: ArtifactId { + name: String::from("nexus"), + version: version.clone(), + kind: ArtifactKind::from_known(KnownArtifactKind::Zone), + }, + hash: fake_hash, + size: 0, + }, + ]; + input_builder.policy_mut().tuf_repo = Some(TufRepoDescription { + repo: TufRepoMeta { + hash: fake_hash, + targets_role_version: 0, + valid_until: Utc::now(), + system_version: Version::new(1, 0, 0), + file_name: String::from(""), + }, + artifacts, + }); + + // Some helper predicates for the assertions below. + let is_up_to_date = |zone: &BlueprintZoneConfig| -> bool { + zone.image_source == image_source + }; + let is_up_to_date_nexus = |zone: &BlueprintZoneConfig| -> bool { + zone.zone_type.is_nexus() && is_up_to_date(zone) + }; + let is_up_to_date_pantry = |zone: &BlueprintZoneConfig| -> bool { + zone.zone_type.is_crucible_pantry() && is_up_to_date(zone) + }; + + // Add a new Nexus zone. + input_builder.policy_mut().target_nexus_zone_count = + input_builder.policy_mut().target_nexus_zone_count + 1; + + let input = input_builder.build(); + let blueprint3 = Planner::new_based_on( + log.clone(), + &blueprint1, + &input, + "test_blueprint3", + &collection, + ) + .expect("can't create planner") + .with_rng(PlannerRng::from_seed((TEST_NAME, "bp3"))) + .plan() + .expect("plan for non-trivial TUF repo"); + + // There should be new Nexus & Crucible Pantry zones that use the new artifact. + let summary = blueprint3.diff_since_blueprint(&blueprint2); + let mut added_count = 0; + for sled_cfg in summary.diff.sleds.modified_values_diff() { + added_count += sled_cfg.zones.added.len(); + for z in sled_cfg.zones.added.values() { + assert!(is_up_to_date_nexus(z) || is_up_to_date_pantry(z)); + } + } + assert_eq!(added_count, 2); + + // Mutate the inventory to include the updated Nexus & Pantry zones. + let collection = update_collection_from_blueprint(&blueprint3); + + // Replan with the new inventory. + let blueprint4 = Planner::new_based_on( + log.clone(), + &blueprint3, + &input, + "test_blueprint4", + &collection, + ) + .expect("can't create planner") + .with_rng(PlannerRng::from_seed((TEST_NAME, "bp4"))) + .plan() + .expect("replan with updated inventory"); + + // Now two Crucible Pantry zones should use the new artifact. + assert_eq!( + blueprint4 + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .filter(|(_, z)| is_up_to_date_pantry(z)) + .count(), + 2 + ); + + // One more cycle for the third Crucible Pantry to update. 
+ let collection = update_collection_from_blueprint(&blueprint4); + let blueprint5 = Planner::new_based_on( + log.clone(), + &blueprint4, + &input, + "test_blueprint5", + &collection, + ) + .expect("can't create planner") + .with_rng(PlannerRng::from_seed((TEST_NAME, "bp5"))) + .plan() + .expect("replan with updated inventory"); + + assert_eq!( + blueprint5 + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .filter(|(_, z)| is_up_to_date_pantry(z)) + .count(), + CRUCIBLE_PANTRY_REDUNDANCY + ); + + // The next few planning cycles should get us up-to-date Nexus zones. + let collection = update_collection_from_blueprint(&blueprint5); + let blueprint6 = Planner::new_based_on( + log.clone(), + &blueprint5, + &input, + "test_blueprint6", + &collection, + ) + .expect("can't create planner") + .with_rng(PlannerRng::from_seed((TEST_NAME, "bp6"))) + .plan() + .expect("replan with updated inventory"); + assert_eq!( + blueprint6 + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .filter(|(_, z)| is_up_to_date_nexus(z)) + .count(), + 2, + ); + + let collection = update_collection_from_blueprint(&blueprint6); + let blueprint7 = Planner::new_based_on( + log.clone(), + &blueprint6, + &input, + "test_blueprint7", + &collection, + ) + .expect("can't create planner") + .with_rng(PlannerRng::from_seed((TEST_NAME, "bp7"))) + .plan() + .expect("replan with updated inventory"); + assert_eq!( + blueprint7 + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .filter(|(_, z)| is_up_to_date_nexus(z)) + .count(), + 3, + ); + + let collection = update_collection_from_blueprint(&blueprint7); + let blueprint8 = Planner::new_based_on( + log.clone(), + &blueprint7, + &input, + "test_blueprint8", + &collection, + ) + .expect("can't create planner") + .with_rng(PlannerRng::from_seed((TEST_NAME, "bp8"))) + .plan() + .expect("replan with updated inventory"); + assert_eq!( + blueprint8 + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .filter(|(_, z)| is_up_to_date_nexus(z)) + .count(), + 4, + ); + + // And the last pass is just to expunge an old Nexus zone. + let collection = update_collection_from_blueprint(&blueprint8); + let blueprint9 = Planner::new_based_on( + log.clone(), + &blueprint8, + &input, + "test_blueprint9", + &collection, + ) + .expect("can't create planner") + .with_rng(PlannerRng::from_seed((TEST_NAME, "bp9"))) + .plan() + .expect("replan with updated inventory"); + + let summary = blueprint9.diff_since_blueprint(&blueprint8); + assert_eq!(summary.diff.sleds.added.len(), 0); + assert_eq!(summary.diff.sleds.removed.len(), 0); + assert_eq!(summary.diff.sleds.modified().count(), 1); + + // Everything's up-to-date in Kansas City! 
+ let collection = update_collection_from_blueprint(&blueprint9); + assert_planning_makes_no_changes( + &logctx.log, + &blueprint9, + &input, + &collection, + TEST_NAME, + ); + + logctx.cleanup_successful(); + } } From 85e94bb67cd146d2e5bf210a372da0c24a958f68 Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Wed, 23 Apr 2025 13:42:25 -0600 Subject: [PATCH 04/18] =?UTF-8?q?Plan=20according=20to=20RFD=20565=20?= =?UTF-8?q?=C2=A79?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- common/src/api/external/mod.rs | 4 + nexus-sled-agent-shared/src/inventory.rs | 20 +- .../src/db/datastore/target_release.rs | 23 +- nexus/reconfigurator/execution/src/dns.rs | 1 + .../planning/src/blueprint_builder/builder.rs | 71 ++- nexus/reconfigurator/planning/src/planner.rs | 523 +++++++++--------- .../planning/src/planner/update_sequence.rs | 108 ++++ nexus/reconfigurator/planning/src/system.rs | 3 + nexus/reconfigurator/preparation/src/lib.rs | 23 + nexus/types/src/deployment.rs | 12 + nexus/types/src/deployment/planning_input.rs | 13 +- 11 files changed, 509 insertions(+), 292 deletions(-) create mode 100644 nexus/reconfigurator/planning/src/planner/update_sequence.rs diff --git a/common/src/api/external/mod.rs b/common/src/api/external/mod.rs index 60325218f1c..cc4ac7fd11d 100644 --- a/common/src/api/external/mod.rs +++ b/common/src/api/external/mod.rs @@ -748,6 +748,10 @@ impl Generation { ); Generation(next_gen) } + + pub const fn prev(&self) -> Option { + if self.0 > 1 { Some(Generation(self.0 - 1)) } else { None } + } } impl<'de> Deserialize<'de> for Generation { diff --git a/nexus-sled-agent-shared/src/inventory.rs b/nexus-sled-agent-shared/src/inventory.rs index 29c5924ed10..a5f7bc3e438 100644 --- a/nexus-sled-agent-shared/src/inventory.rs +++ b/nexus-sled-agent-shared/src/inventory.rs @@ -16,6 +16,7 @@ use omicron_common::{ DatasetManagementStatus, DatasetsConfig, DiskManagementStatus, DiskVariant, OmicronPhysicalDisksConfig, }, + update::ArtifactId, zpool_name::ZpoolName, }; use omicron_uuid_kinds::{DatasetUuid, OmicronZoneUuid}; @@ -26,7 +27,7 @@ use serde::{Deserialize, Serialize}; // depend on sled-hardware-types. pub use sled_hardware_types::Baseboard; use strum::EnumIter; -use tufaceous_artifact::ArtifactHash; +use tufaceous_artifact::{ArtifactHash, KnownArtifactKind}; /// Identifies information about disks which may be attached to Sleds. #[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] @@ -492,13 +493,14 @@ impl OmicronZoneType { /// /// # String representations of this type /// -/// There are no fewer than four string representations for this type, all +/// There are no fewer than five string representations for this type, all /// slightly different from each other. /// /// 1. [`Self::zone_prefix`]: Used to construct zone names. /// 2. [`Self::service_prefix`]: Used to construct SMF service names. /// 3. [`Self::name_prefix`]: Used to construct `Name` instances. /// 4. [`Self::report_str`]: Used for reporting and testing. +/// 5. [`Self::artifact_name`]: Used to match TUF artifact names. /// /// There is no `Display` impl to ensure that users explicitly choose the /// representation they want. (Please play close attention to this! The @@ -636,6 +638,20 @@ impl ZoneKind { ZoneKind::Oximeter => "oximeter", } } + + /// Return true if an artifact represents a control plane zone image + /// of this kind. 
+ pub fn is_control_plane_zone_artifact( + self, + artifact_id: &ArtifactId, + ) -> bool { + artifact_id + .kind + .to_known() + .map(|kind| matches!(kind, KnownArtifactKind::Zone)) + .unwrap_or(false) + && artifact_id.name == self.artifact_name() + } } /// Where Sled Agent should get the image for a zone. diff --git a/nexus/db-queries/src/db/datastore/target_release.rs b/nexus/db-queries/src/db/datastore/target_release.rs index e98a5ce5209..e5886699ac7 100644 --- a/nexus/db-queries/src/db/datastore/target_release.rs +++ b/nexus/db-queries/src/db/datastore/target_release.rs @@ -7,7 +7,9 @@ use super::DataStore; use crate::authz; use crate::context::OpContext; -use crate::db::model::{SemverVersion, TargetRelease, TargetReleaseSource}; +use crate::db::model::{ + Generation, SemverVersion, TargetRelease, TargetReleaseSource, +}; use async_bb8_diesel::AsyncRunQueryDsl as _; use diesel::insert_into; use diesel::prelude::*; @@ -44,6 +46,25 @@ impl DataStore { Ok(current) } + /// Fetch a target release by generation number. + pub async fn target_release_get_generation( + &self, + opctx: &OpContext, + generation: Generation, + ) -> LookupResult> { + opctx + .authorize(authz::Action::Read, &authz::TARGET_RELEASE_CONFIG) + .await?; + let conn = self.pool_connection_authorized(opctx).await?; + dsl::target_release + .select(TargetRelease::as_select()) + .filter(dsl::generation.eq(generation)) + .first_async(&*conn) + .await + .optional() + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + /// Insert a new target release row and return it. It will only become /// the current target release if its generation is larger than any /// existing row. diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index 2c77e99145e..5e226828658 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -1425,6 +1425,7 @@ mod test { clickhouse_policy: None, oximeter_read_policy: OximeterReadPolicy::new(1), tuf_repo: None, + old_repo: None, log, } .build() diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs index 46989d3f75a..d3f7854d089 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs @@ -14,6 +14,7 @@ use crate::blueprint_editor::ExternalSnatNetworkingChoice; use crate::blueprint_editor::NoAvailableDnsSubnets; use crate::blueprint_editor::SledEditError; use crate::blueprint_editor::SledEditor; +use crate::planner::OrderedComponent; use crate::planner::ZoneExpungeReason; use crate::planner::rng::PlannerRng; use anyhow::Context as _; @@ -33,7 +34,6 @@ use nexus_types::deployment::BlueprintSledConfig; use nexus_types::deployment::BlueprintZoneConfig; use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneImageSource; -use nexus_types::deployment::BlueprintZoneImageVersion; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::ClickhouseClusterConfig; use nexus_types::deployment::CockroachDbPreserveDowngrade; @@ -57,6 +57,7 @@ use omicron_common::address::DNS_PORT; use omicron_common::address::NTP_PORT; use omicron_common::address::ReservedRackSubnet; use omicron_common::api::external::Generation; +use omicron_common::api::external::TufRepoDescription; use omicron_common::api::external::Vni; use omicron_common::api::internal::shared::NetworkInterface; use 
omicron_common::api::internal::shared::NetworkInterfaceKind; @@ -83,7 +84,6 @@ use std::net::Ipv6Addr; use std::net::SocketAddr; use std::net::SocketAddrV6; use thiserror::Error; -use tufaceous_artifact::KnownArtifactKind; use super::ClickhouseZonesThatShouldBeRunning; use super::clickhouse::ClickhouseAllocator; @@ -1903,34 +1903,55 @@ impl<'a> BlueprintBuilder<'a> { self.pending_mgs_updates.remove(baseboard_id); } - /// Try to find an artifact in the release repo that contains an image - /// for a zone of the given kind. Defaults to the install dataset. + fn zone_image_artifact( + repo: Option<&TufRepoDescription>, + zone_kind: ZoneKind, + ) -> BlueprintZoneImageSource { + repo.and_then(|repo| { + repo.artifacts + .iter() + .find(|artifact| { + zone_kind.is_control_plane_zone_artifact(&artifact.id) + }) + .map(BlueprintZoneImageSource::from_available_artifact) + }) + .unwrap_or(BlueprintZoneImageSource::InstallDataset) + } + + /// Try to find an artifact in either the current or previous release repo + /// that contains an image for a zone of the given kind; see RFD 565 §9. + /// Defaults to the install dataset. pub(crate) fn zone_image_source( &self, zone_kind: ZoneKind, ) -> BlueprintZoneImageSource { - self.input - .tuf_repo() - .and_then(|repo| { - repo.artifacts - .iter() - .find(|artifact| { - artifact - .id - .kind - .to_known() - .map(|kind| matches!(kind, KnownArtifactKind::Zone)) - .unwrap_or(false) - && artifact.id.name == zone_kind.artifact_name() - }) - .map(|artifact| BlueprintZoneImageSource::Artifact { - version: BlueprintZoneImageVersion::Available { - version: artifact.id.version.clone(), - }, - hash: artifact.hash, + let new_repo = self.input.tuf_repo(); + let old_repo = self.input.old_repo(); + let new_artifact = Self::zone_image_artifact(new_repo, zone_kind); + let old_artifact = Self::zone_image_artifact(old_repo, zone_kind); + if let Some(prev) = OrderedComponent::from(zone_kind).prev() { + if prev >= OrderedComponent::ControlPlaneZone + && self.sled_ids_with_zones().any(|sled_id| { + self.current_sled_zones( + sled_id, + BlueprintZoneDisposition::is_in_service, + ) + .any(|z| { + let kind = z.zone_type.kind(); + let old_artifact = + Self::zone_image_artifact(old_repo, kind); + OrderedComponent::from(kind) == prev + && z.image_source == old_artifact }) - }) - .unwrap_or(BlueprintZoneImageSource::InstallDataset) + }) + { + old_artifact + } else { + new_artifact + } + } else { + new_artifact + } } } diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 58b7d8c8857..dd39b53fb7c 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -37,7 +37,6 @@ use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::SledUuid; use slog::error; use slog::{Logger, info, warn}; -use std::cmp::Ordering; use std::collections::BTreeMap; use std::collections::BTreeSet; use std::str::FromStr; @@ -47,9 +46,11 @@ use self::omicron_zone_placement::OmicronZonePlacement; use self::omicron_zone_placement::OmicronZonePlacementSledState; pub use self::rng::PlannerRng; pub use self::rng::SledPlannerRng; +pub(crate) use self::update_sequence::OrderedComponent; mod omicron_zone_placement; pub(crate) mod rng; +mod update_sequence; pub struct Planner<'a> { log: Logger, @@ -116,7 +117,7 @@ impl<'a> Planner<'a> { self.do_plan_expunge()?; self.do_plan_add()?; self.do_plan_decommission()?; - self.do_plan_zone_image_source()?; + self.do_plan_zone_updates()?; 
self.do_plan_cockroachdb_settings(); Ok(()) } @@ -878,7 +879,7 @@ impl<'a> Planner<'a> { } /// Update at most one existing zone to use a new image source. - fn do_plan_zone_image_source(&mut self) -> Result<(), Error> { + fn do_plan_zone_updates(&mut self) -> Result<(), Error> { // We are only interested in non-decommissioned sleds. let sleds = self .input @@ -938,7 +939,7 @@ impl<'a> Planner<'a> { sort_zones_by_deps(&mut out_of_date_zones); if let Some((sled_id, zone)) = out_of_date_zones.first() { - return self.update_zone_image_source(*sled_id, zone); + return self.update_or_expunge_zone(*sled_id, zone); } info!(self.log, "all zones up-to-date"); @@ -946,8 +947,8 @@ impl<'a> Planner<'a> { } /// Update a zone to use a new image source, either in-place or by - /// expunging & replacing it (depending on the kind of zone). - fn update_zone_image_source( + /// expunging it and letting it be replaced in a future iteration. + fn update_or_expunge_zone( &mut self, sled_id: SledUuid, zone: &BlueprintZoneConfig, @@ -962,6 +963,7 @@ impl<'a> Planner<'a> { self.log, "updating zone image source in-place"; "sled_id" => %sled_id, "zone_id" => %zone.id, + "kind" => ?zone.zone_type.kind(), "image_source" => %image_source, ); self.blueprint.sled_set_zone_source( @@ -970,87 +972,14 @@ impl<'a> Planner<'a> { image_source, )?; } - ZoneKind::BoundaryNtp => { + _ => { info!( - self.log, "replacing out-of-date BoundaryNtp zone"; + self.log, "expunging out-of-date zone"; "sled_id" => %sled_id, "zone_id" => %zone.id, + "kind" => ?zone.zone_type.kind(), ); self.blueprint.sled_expunge_zone(sled_id, zone.id)?; - self.blueprint - .sled_promote_internal_ntp_to_boundary_ntp(sled_id)?; - } - ZoneKind::ClickhouseKeeper => { - info!( - self.log, "replacing out-of-date ClickhouseKeeper zone"; - "sled_id" => %sled_id, - "zone_id" => %zone.id, - ); - self.blueprint.sled_expunge_zone(sled_id, zone.id)?; - self.blueprint.sled_add_zone_clickhouse_keeper(sled_id)?; - } - ZoneKind::ClickhouseServer => { - info!( - self.log, "replacing out-of-date ClickhouseServer zone"; - "sled_id" => %sled_id, - "zone_id" => %zone.id, - ); - self.blueprint.sled_expunge_zone(sled_id, zone.id)?; - self.blueprint.sled_add_zone_clickhouse_server(sled_id)?; - } - ZoneKind::CruciblePantry => { - info!( - self.log, "replacing out-of-date CruciblePantry zone"; - "sled_id" => %sled_id, - "zone_id" => %zone.id, - ); - self.blueprint.sled_expunge_zone(sled_id, zone.id)?; - self.blueprint.sled_add_zone_crucible_pantry(sled_id)?; - } - ZoneKind::ExternalDns => { - info!( - self.log, "replacing out-of-date ExternalDns zone"; - "sled_id" => %sled_id, - "zone_id" => %zone.id, - ); - self.blueprint.sled_expunge_zone(sled_id, zone.id)?; - self.blueprint.sled_add_zone_external_dns(sled_id)?; - } - ZoneKind::InternalDns => { - info!( - self.log, "replacing out-of-date InternalDns zone"; - "sled_id" => %sled_id, - "zone_id" => %zone.id, - ); - self.blueprint.sled_expunge_zone(sled_id, zone.id)?; - self.blueprint.sled_add_zone_internal_dns(sled_id)?; - } - ZoneKind::InternalNtp => { - info!( - self.log, "replacing out-of-date InternalNtp zone"; - "sled_id" => %sled_id, - "zone_id" => %zone.id, - ); - self.blueprint.sled_expunge_zone(sled_id, zone.id)?; - self.blueprint.sled_ensure_zone_ntp(sled_id)?; - } - ZoneKind::Nexus => { - info!( - self.log, "replacing out-of-date Nexus zone"; - "sled_id" => %sled_id, - "zone_id" => %zone.id, - ); - self.blueprint.sled_expunge_zone(sled_id, zone.id)?; - self.blueprint.sled_add_zone_nexus(sled_id)?; - } - ZoneKind::Oximeter => 
{ - info!( - self.log, "replacing out-of-date Oximeter zone"; - "sled_id" => %sled_id, - "zone_id" => %zone.id, - ); - self.blueprint.sled_expunge_zone(sled_id, zone.id)?; - self.blueprint.sled_add_zone_oximeter(sled_id)?; } } Ok(()) @@ -1151,17 +1080,12 @@ impl<'a> Planner<'a> { } } -/// Sort in place a vector of zone configurations by API dependencies -/// (RFD 565 §6). For now, we encode a simplified dependency graph manually; -/// down the road, we should programmatically import it from `ls-apis`. +/// Sort in place a vector of sled-specific zone configurations by API +/// dependencies (RFD 565 §6). fn sort_zones_by_deps(zones: &mut Vec<(SledUuid, BlueprintZoneConfig)>) { - zones.sort_by(|a, b| match (a.1.zone_type.kind(), b.1.zone_type.kind()) { - (s, t) if s == t => Ordering::Equal, - (ZoneKind::CruciblePantry, _) => Ordering::Less, - (_, ZoneKind::CruciblePantry) => Ordering::Greater, - (_, ZoneKind::Nexus) => Ordering::Less, - (ZoneKind::Nexus, _) => Ordering::Greater, - (_, _) => Ordering::Equal, + zones.sort_by(|a, b| { + OrderedComponent::from(a.1.zone_type.kind()) + .cmp(&OrderedComponent::from(b.1.zone_type.kind())) }) } @@ -1179,6 +1103,7 @@ pub(crate) enum ZoneExpungeReason { pub(crate) mod test { use super::*; use crate::blueprint_builder::test::verify_blueprint; + use crate::example::ExampleSystem; use crate::example::ExampleSystemBuilder; use crate::example::SimRngState; use crate::example::example; @@ -1211,6 +1136,7 @@ pub(crate) mod test { use omicron_common::disk::DatasetKind; use omicron_common::disk::DiskIdentity; use omicron_common::policy::CRUCIBLE_PANTRY_REDUNDANCY; + use omicron_common::policy::NEXUS_REDUNDANCY; use omicron_common::update::ArtifactId; use omicron_test_utils::dev::test_setup_log; use omicron_uuid_kinds::PhysicalDiskUuid; @@ -4557,31 +4483,44 @@ pub(crate) mod test { // Sort them and verify the ordering constraints. sort_zones_by_deps(&mut zones); - let mut pantry = false; let mut nexus = false; for kind in zones.iter().map(|(_, z)| z.zone_type.kind()) { match kind { - ZoneKind::CruciblePantry => { - assert!(!nexus); - pantry = true; - } ZoneKind::Nexus => { - assert!(pantry); nexus = true; } _ => { - assert!(pantry); assert!(!nexus); } } } + assert!(nexus); logctx.cleanup_successful(); } + /// Manually update the example system's inventory collection's zones + /// from a blueprint. + fn update_collection_from_blueprint( + example: &mut ExampleSystem, + blueprint: &Blueprint, + ) { + for (&sled_id, config) in blueprint.sleds.iter() { + example + .system + .sled_set_omicron_zones( + sled_id, + config.clone().into_in_service_sled_config().zones_config, + ) + .expect("can't set omicron zones for sled"); + } + example.collection = + example.system.to_collection_builder().unwrap().build(); + } + #[test] - fn test_plan_tuf_repo() { - static TEST_NAME: &str = "plan_tuf_repo"; + fn test_update_crucible_pantry() { + static TEST_NAME: &str = "update_crucible_pantry"; let logctx = test_setup_log(TEST_NAME); let log = logctx.log.clone(); @@ -4592,36 +4531,15 @@ pub(crate) mod test { rng.next_system_rng(), ) .build(); - let input = example.input; - let collection = example.collection; verify_blueprint(&blueprint1); - // We'll be manually updating our inventory collection's zones - // from blueprints. 
- let mut update_collection_from_blueprint = - |blueprint: &Blueprint| -> Collection { - for (&sled_id, config) in blueprint.sleds.iter() { - example - .system - .sled_set_omicron_zones( - sled_id, - config - .clone() - .into_in_service_sled_config() - .zones_config, - ) - .expect("can't set omicron zones for sled"); - } - example.system.to_collection_builder().unwrap().build() - }; - // We should start with no specified TUF repo and nothing to do. - assert!(input.tuf_repo().is_none()); + assert!(example.input.tuf_repo().is_none()); assert_planning_makes_no_changes( &logctx.log, &blueprint1, - &input, - &collection, + &example.input, + &example.collection, TEST_NAME, ); @@ -4636,7 +4554,7 @@ pub(crate) mod test { ); // Manually specify a trivial TUF repo. - let mut input_builder = input.into_builder(); + let mut input_builder = example.input.clone().into_builder(); input_builder.policy_mut().tuf_repo = Some(TufRepoDescription { repo: TufRepoMeta { hash: ArtifactHash([0; 32]), @@ -4653,7 +4571,7 @@ pub(crate) mod test { &blueprint1, &input, "test_blueprint2", - &collection, + &example.collection, ) .expect("can't create planner") .with_rng(PlannerRng::from_seed((TEST_NAME, "bp2"))) @@ -4662,7 +4580,7 @@ pub(crate) mod test { // All zones should still be sourced from the install dataset. assert!( - blueprint1 + blueprint2 .all_omicron_zones(BlueprintZoneDisposition::is_in_service) .all(|(_, z)| matches!( z.image_source, @@ -4670,8 +4588,9 @@ pub(crate) mod test { )) ); - // Manually specify a TUF repo with some fake zone images. - // Only the name and kind of the artifacts matter. + // Manually specify a TUF repo with fake zone images for Crucible Pantry + // and Nexus. Only the name and kind of the artifacts matter. The Nexus + // artifact is only there to make sure the planner *doesn't* use it. let mut input_builder = input.into_builder(); let version = ArtifactVersion::new_static("1.0.0-freeform") .expect("can't parse artifact version"); @@ -4714,181 +4633,259 @@ pub(crate) mod test { }); // Some helper predicates for the assertions below. - let is_up_to_date = |zone: &BlueprintZoneConfig| -> bool { - zone.image_source == image_source + let is_old_nexus = |zone: &BlueprintZoneConfig| -> bool { + zone.zone_type.is_nexus() + && matches!( + zone.image_source, + BlueprintZoneImageSource::InstallDataset + ) }; - let is_up_to_date_nexus = |zone: &BlueprintZoneConfig| -> bool { - zone.zone_type.is_nexus() && is_up_to_date(zone) + let is_old_pantry = |zone: &BlueprintZoneConfig| -> bool { + zone.zone_type.is_crucible_pantry() + && matches!( + zone.image_source, + BlueprintZoneImageSource::InstallDataset + ) }; let is_up_to_date_pantry = |zone: &BlueprintZoneConfig| -> bool { - zone.zone_type.is_crucible_pantry() && is_up_to_date(zone) + zone.zone_type.is_crucible_pantry() + && zone.image_source == image_source }; - // Add a new Nexus zone. + // Request another Nexus zone. This should *not* use the new artifact, + // since not all of its dependencies can be updated. input_builder.policy_mut().target_nexus_zone_count = input_builder.policy_mut().target_nexus_zone_count + 1; - let input = input_builder.build(); - let blueprint3 = Planner::new_based_on( - log.clone(), - &blueprint1, - &input, - "test_blueprint3", - &collection, - ) - .expect("can't create planner") - .with_rng(PlannerRng::from_seed((TEST_NAME, "bp3"))) - .plan() - .expect("plan for non-trivial TUF repo"); - // There should be new Nexus & Crucible Pantry zones that use the new artifact. 
- let summary = blueprint3.diff_since_blueprint(&blueprint2); - let mut added_count = 0; - for sled_cfg in summary.diff.sleds.modified_values_diff() { - added_count += sled_cfg.zones.added.len(); - for z in sled_cfg.zones.added.values() { - assert!(is_up_to_date_nexus(z) || is_up_to_date_pantry(z)); - } + // We should now have three sets of expunge/add iterations for the + // Crucible Pantry zones. + let mut parent = blueprint2; + for i in 3..=8 { + let blueprint_name = format!("blueprint_{i}"); + update_collection_from_blueprint(&mut example, &parent); + let blueprint = Planner::new_based_on( + log.clone(), + &parent, + &input, + &blueprint_name, + &example.collection, + ) + .expect("can't create planner") + .with_rng(PlannerRng::from_seed((TEST_NAME, &blueprint_name))) + .plan() + .unwrap_or_else(|_| panic!("can't re-plan after {i} iterations")); + parent = blueprint; } - assert_eq!(added_count, 2); - - // Mutate the inventory to include the updated Nexus & Pantry zones. - let collection = update_collection_from_blueprint(&blueprint3); - // Replan with the new inventory. - let blueprint4 = Planner::new_based_on( - log.clone(), - &blueprint3, - &input, - "test_blueprint4", - &collection, - ) - .expect("can't create planner") - .with_rng(PlannerRng::from_seed((TEST_NAME, "bp4"))) - .plan() - .expect("replan with updated inventory"); - - // Now two Crucible Pantry zones should use the new artifact. + // All Crucible Pantries should now be updated. assert_eq!( - blueprint4 + parent .all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter(|(_, z)| is_up_to_date_pantry(z)) .count(), - 2 + CRUCIBLE_PANTRY_REDUNDANCY ); - // One more cycle for the third Crucible Pantry to update. - let collection = update_collection_from_blueprint(&blueprint4); - let blueprint5 = Planner::new_based_on( + // One more iteration for the last old zone to be expunged. + update_collection_from_blueprint(&mut example, &parent); + let blueprint = Planner::new_based_on( log.clone(), - &blueprint4, + &parent, &input, - "test_blueprint5", - &collection, + "last_blueprint", + &example.collection, ) .expect("can't create planner") - .with_rng(PlannerRng::from_seed((TEST_NAME, "bp5"))) + .with_rng(PlannerRng::from_seed((TEST_NAME, "last_bp"))) .plan() - .expect("replan with updated inventory"); + .expect("replan for last blueprint"); assert_eq!( - blueprint5 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - .filter(|(_, z)| is_up_to_date_pantry(z)) + blueprint + .all_omicron_zones(BlueprintZoneDisposition::is_expunged) + .filter(|(_, z)| is_old_pantry(z)) .count(), CRUCIBLE_PANTRY_REDUNDANCY ); - // The next few planning cycles should get us up-to-date Nexus zones. - let collection = update_collection_from_blueprint(&blueprint5); - let blueprint6 = Planner::new_based_on( - log.clone(), - &blueprint5, - &input, - "test_blueprint6", - &collection, - ) - .expect("can't create planner") - .with_rng(PlannerRng::from_seed((TEST_NAME, "bp6"))) - .plan() - .expect("replan with updated inventory"); + // We won't be able to update Nexus until *all* of its dependent zones + // are updated. Stay tuned for the next test! assert_eq!( - blueprint6 + parent .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - .filter(|(_, z)| is_up_to_date_nexus(z)) + .filter(|(_, z)| is_old_nexus(z)) .count(), - 2, + NEXUS_REDUNDANCY + 1, ); - let collection = update_collection_from_blueprint(&blueprint6); - let blueprint7 = Planner::new_based_on( - log.clone(), - &blueprint6, + // Everything's up-to-date in Kansas City! 
+ update_collection_from_blueprint(&mut example, &blueprint); + assert_planning_makes_no_changes( + &logctx.log, + &blueprint, &input, - "test_blueprint7", - &collection, - ) - .expect("can't create planner") - .with_rng(PlannerRng::from_seed((TEST_NAME, "bp7"))) - .plan() - .expect("replan with updated inventory"); - assert_eq!( - blueprint7 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - .filter(|(_, z)| is_up_to_date_nexus(z)) - .count(), - 3, + &example.collection, + TEST_NAME, ); - let collection = update_collection_from_blueprint(&blueprint7); - let blueprint8 = Planner::new_based_on( - log.clone(), - &blueprint7, - &input, - "test_blueprint8", - &collection, + logctx.cleanup_successful(); + } + + #[test] + fn test_update_all_zones() { + static TEST_NAME: &str = "update_all_zones"; + let logctx = test_setup_log(TEST_NAME); + let log = logctx.log.clone(); + + // Use our example system. + let mut rng = SimRngState::from_seed(TEST_NAME); + let (mut example, blueprint1) = ExampleSystemBuilder::new_with_rng( + &logctx.log, + rng.next_system_rng(), ) - .expect("can't create planner") - .with_rng(PlannerRng::from_seed((TEST_NAME, "bp8"))) - .plan() - .expect("replan with updated inventory"); - assert_eq!( - blueprint8 + .build(); + verify_blueprint(&blueprint1); + + // All zones should be sourced from the install dataset by default. + assert!( + blueprint1 .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - .filter(|(_, z)| is_up_to_date_nexus(z)) - .count(), - 4, + .all(|(_, z)| matches!( + z.image_source, + BlueprintZoneImageSource::InstallDataset + )) ); - // And the last pass is just to expunge an old Nexus zone. - let collection = update_collection_from_blueprint(&blueprint8); - let blueprint9 = Planner::new_based_on( - log.clone(), - &blueprint8, - &input, - "test_blueprint9", - &collection, - ) - .expect("can't create planner") - .with_rng(PlannerRng::from_seed((TEST_NAME, "bp9"))) - .plan() - .expect("replan with updated inventory"); + // Manually specify a TUF repo with fake images for all zones. + // Only the name and kind of the artifacts matter. + let mut input_builder = example.input.clone().into_builder(); + let version = ArtifactVersion::new_static("2.0.0-freeform") + .expect("can't parse artifact version"); + let fake_hash = ArtifactHash([0; 32]); + let image_source = BlueprintZoneImageSource::Artifact { + version: BlueprintZoneImageVersion::Available { + version: version.clone(), + }, + hash: fake_hash, + }; + macro_rules! 
zone_artifact { + ($name:expr) => { + TufArtifactMeta { + id: ArtifactId { + name: String::from($name), + version: version.clone(), + kind: ArtifactKind::from_known(KnownArtifactKind::Zone), + }, + hash: fake_hash, + size: 0, + } + }; + } + let tuf_repo = TufRepoDescription { + repo: TufRepoMeta { + hash: fake_hash, + targets_role_version: 0, + valid_until: Utc::now(), + system_version: Version::new(1, 0, 0), + file_name: String::from(""), + }, + artifacts: vec![ + zone_artifact!("ntp"), + zone_artifact!("clickhouse"), + zone_artifact!("clickhouse_keeper"), + zone_artifact!("cockroachdb"), + zone_artifact!("crucible-zone"), + zone_artifact!("crucible-pantry-zone"), + zone_artifact!("external-dns"), + zone_artifact!("internal-dns"), + zone_artifact!("ntp"), + zone_artifact!("nexus"), + zone_artifact!("oximeter"), + ], + }; + input_builder.policy_mut().tuf_repo = Some(tuf_repo); + let input = input_builder.build(); - let summary = blueprint9.diff_since_blueprint(&blueprint8); - assert_eq!(summary.diff.sleds.added.len(), 0); - assert_eq!(summary.diff.sleds.removed.len(), 0); - assert_eq!(summary.diff.sleds.modified().count(), 1); + let mut comp = OrderedComponent::ControlPlaneZone; + let mut parent = blueprint1; + const MAX_PLANNING_ITERATIONS: usize = 100; + for i in 2..=MAX_PLANNING_ITERATIONS { + let blueprint_name = format!("blueprint_{i}"); + update_collection_from_blueprint(&mut example, &parent); + let blueprint = Planner::new_based_on( + log.clone(), + &parent, + &input, + &blueprint_name, + &example.collection, + ) + .expect("can't create planner") + .with_rng(PlannerRng::from_seed((TEST_NAME, &blueprint_name))) + .plan() + .unwrap_or_else(|_| panic!("can't re-plan after {i} iterations")); - // Everything's up-to-date in Kansas City! - let collection = update_collection_from_blueprint(&blueprint9); - assert_planning_makes_no_changes( - &logctx.log, - &blueprint9, - &input, - &collection, - TEST_NAME, - ); + let summary = blueprint.diff_since_blueprint(&parent); + if summary.total_zones_added() == 0 + && summary.total_zones_removed() == 0 + && summary.total_zones_modified() == 0 + { + println!("planning converged after {i} iterations"); + assert!( + blueprint + .all_omicron_zones( + BlueprintZoneDisposition::is_in_service + ) + .all(|(_, zone)| zone.image_source == image_source) + ); + logctx.cleanup_successful(); + return; + } - logctx.cleanup_successful(); + match comp { + OrderedComponent::HostOs | OrderedComponent::SpRot => { + unreachable!("can only update zones") + } + OrderedComponent::ControlPlaneZone => { + assert!( + blueprint + .all_omicron_zones( + BlueprintZoneDisposition::is_in_service + ) + .all(|(_, zone)| match zone.zone_type.kind() { + ZoneKind::Nexus => { + if zone.image_source == image_source { + comp = comp.next().unwrap(); + true + } else { + zone.image_source + == BlueprintZoneImageSource::InstallDataset + } + } + _ => zone.image_source + == BlueprintZoneImageSource::InstallDataset + || zone.image_source == image_source, + }) + ); + } + OrderedComponent::NexusZone => { + assert!( + blueprint + .all_omicron_zones( + BlueprintZoneDisposition::is_in_service + ) + .all(|(_, zone)| match zone.zone_type.kind() { + ZoneKind::Nexus => zone.image_source + == BlueprintZoneImageSource::InstallDataset + || zone.image_source == image_source, + _ => zone.image_source == image_source, + }) + ); + } + } + + parent = blueprint; + } + panic!("did not converge after {MAX_PLANNING_ITERATIONS} iterations"); } } diff --git 
a/nexus/reconfigurator/planning/src/planner/update_sequence.rs b/nexus/reconfigurator/planning/src/planner/update_sequence.rs
new file mode 100644
index 00000000000..b675dd3dec8
--- /dev/null
+++ b/nexus/reconfigurator/planning/src/planner/update_sequence.rs
@@ -0,0 +1,108 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Updatable components ordered by dependencies (RFD 565).
+
+use nexus_sled_agent_shared::inventory::ZoneKind;
+use strum::{EnumIter, IntoEnumIterator as _};
+
+/// Update sequence as defined by RFD 565 §6.
+#[derive(
+    Debug, Clone, Copy, PartialEq, Eq, Hash, EnumIter, PartialOrd, Ord,
+)]
+pub enum OrderedComponent {
+    HostOs,
+    SpRot,
+    ControlPlaneZone,
+    NexusZone,
+}
+
+/// To implement the checks in RFD 565 §9, we need to go backwards
+/// (and maybe forwards) through the component ordering. The simple
+/// linear scans here are fine as long as `OrderedComponent` has only
+/// a few variants; if it starts getting large (e.g., programmatically
+/// generated from `ls-apis`), we can switch to something faster.
+impl OrderedComponent {
+    #[allow(dead_code)]
+    pub fn next(&self) -> Option<Self> {
+        let mut found = false;
+        for comp in OrderedComponent::iter() {
+            if found {
+                return Some(comp);
+            } else if comp == *self {
+                found = true;
+            }
+        }
+        None
+    }
+
+    pub fn prev(&self) -> Option<Self> {
+        let mut prev = None;
+        for comp in OrderedComponent::iter() {
+            if comp == *self {
+                return prev;
+            }
+            prev = Some(comp);
+        }
+        prev
+    }
+}
+
+impl From<ZoneKind> for OrderedComponent {
+    fn from(zone_kind: ZoneKind) -> Self {
+        match zone_kind {
+            ZoneKind::Nexus => Self::NexusZone,
+            _ => Self::ControlPlaneZone,
+        }
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn ordered_component_next_prev() {
+        // Exhaustive checks ok for a few variants, revisit if it grows.
+ assert_eq!(OrderedComponent::HostOs.prev(), None); + assert_eq!( + OrderedComponent::HostOs.next(), + Some(OrderedComponent::SpRot) + ); + assert_eq!( + OrderedComponent::SpRot.prev(), + Some(OrderedComponent::HostOs) + ); + assert_eq!( + OrderedComponent::SpRot.next(), + Some(OrderedComponent::ControlPlaneZone) + ); + assert_eq!( + OrderedComponent::ControlPlaneZone.prev(), + Some(OrderedComponent::SpRot) + ); + assert_eq!( + OrderedComponent::ControlPlaneZone.next(), + Some(OrderedComponent::NexusZone) + ); + assert_eq!( + OrderedComponent::NexusZone.prev(), + Some(OrderedComponent::ControlPlaneZone) + ); + assert_eq!(OrderedComponent::NexusZone.next(), None); + assert!(OrderedComponent::HostOs < OrderedComponent::NexusZone); + } + + #[test] + fn ordered_component_from_zone_kind() { + assert!(matches!( + OrderedComponent::from(ZoneKind::CruciblePantry), + OrderedComponent::ControlPlaneZone + )); + assert!(matches!( + OrderedComponent::from(ZoneKind::Nexus), + OrderedComponent::NexusZone + )); + } +} diff --git a/nexus/reconfigurator/planning/src/system.rs b/nexus/reconfigurator/planning/src/system.rs index f4b947ecc59..de3e93e821d 100644 --- a/nexus/reconfigurator/planning/src/system.rs +++ b/nexus/reconfigurator/planning/src/system.rs @@ -98,6 +98,7 @@ pub struct SystemDescription { clickhouse_policy: Option, oximeter_read_policy: OximeterReadPolicy, tuf_repo: Option, + old_repo: Option, } impl SystemDescription { @@ -178,6 +179,7 @@ impl SystemDescription { clickhouse_policy: None, oximeter_read_policy: OximeterReadPolicy::new(1), tuf_repo: None, + old_repo: None, } } @@ -431,6 +433,7 @@ impl SystemDescription { clickhouse_policy: self.clickhouse_policy.clone(), oximeter_read_policy: self.oximeter_read_policy.clone(), tuf_repo: self.tuf_repo.clone(), + old_repo: self.old_repo.clone(), }; let mut builder = PlanningInputBuilder::new( policy, diff --git a/nexus/reconfigurator/preparation/src/lib.rs b/nexus/reconfigurator/preparation/src/lib.rs index 48da2704e69..dea9e9b7791 100644 --- a/nexus/reconfigurator/preparation/src/lib.rs +++ b/nexus/reconfigurator/preparation/src/lib.rs @@ -7,6 +7,7 @@ use anyhow::Context; use futures::StreamExt; use nexus_db_model::DnsGroup; +use nexus_db_model::Generation; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_db_queries::db::datastore::DataStoreDnsTest; @@ -80,6 +81,7 @@ pub struct PlanningInputFromDb<'a> { pub clickhouse_policy: Option, pub oximeter_read_policy: OximeterReadPolicy, pub tuf_repo: Option, + pub old_repo: Option, pub log: &'a Logger, } @@ -160,6 +162,25 @@ impl PlanningInputFromDb<'_> { .into_external(), ), }; + let prev_release = if let Some(prev) = target_release.generation.prev() + { + datastore + .target_release_get_generation(opctx, Generation(prev)) + .await + .internal_context("fetching current target release")? + } else { + None + }; + let old_repo = match prev_release.and_then(|r| r.tuf_repo_id) { + None => None, + Some(repo_id) => Some( + datastore + .update_tuf_repo_get_by_id(opctx, repo_id.into()) + .await + .internal_context("fetching target release repo")? 
+ .into_external(), + ), + }; let oximeter_read_policy = datastore .oximeter_read_policy_get_latest(opctx) @@ -187,6 +208,7 @@ impl PlanningInputFromDb<'_> { clickhouse_policy, oximeter_read_policy, tuf_repo, + old_repo, } .build() .internal_context("assembling planning_input")?; @@ -211,6 +233,7 @@ impl PlanningInputFromDb<'_> { clickhouse_policy: self.clickhouse_policy.clone(), oximeter_read_policy: self.oximeter_read_policy.clone(), tuf_repo: self.tuf_repo.clone(), + old_repo: self.old_repo.clone(), }; let mut builder = PlanningInputBuilder::new( policy, diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 16502409892..8eadfa9f01b 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -27,6 +27,7 @@ use nexus_sled_agent_shared::inventory::OmicronZonesConfig; use nexus_sled_agent_shared::inventory::ZoneKind; use omicron_common::api::external::ByteCount; use omicron_common::api::external::Generation; +use omicron_common::api::external::TufArtifactMeta; use omicron_common::api::internal::shared::DatasetKind; use omicron_common::disk::CompressionAlgorithm; use omicron_common::disk::DatasetConfig; @@ -999,6 +1000,17 @@ pub enum BlueprintZoneImageSource { Artifact { version: BlueprintZoneImageVersion, hash: ArtifactHash }, } +impl BlueprintZoneImageSource { + pub fn from_available_artifact(artifact: &TufArtifactMeta) -> Self { + BlueprintZoneImageSource::Artifact { + version: BlueprintZoneImageVersion::Available { + version: artifact.id.version.clone(), + }, + hash: artifact.hash, + } + } +} + impl From for OmicronZoneImageSource { fn from(source: BlueprintZoneImageSource) -> Self { match source { diff --git a/nexus/types/src/deployment/planning_input.rs b/nexus/types/src/deployment/planning_input.rs index e4b300b39bb..86e9fb02d94 100644 --- a/nexus/types/src/deployment/planning_input.rs +++ b/nexus/types/src/deployment/planning_input.rs @@ -157,6 +157,10 @@ impl PlanningInput { self.policy.tuf_repo.as_ref() } + pub fn old_repo(&self) -> Option<&TufRepoDescription> { + self.policy.old_repo.as_ref() + } + pub fn service_ip_pool_ranges(&self) -> &[IpRange] { &self.policy.service_ip_pool_ranges } @@ -926,10 +930,16 @@ pub struct Policy { /// Desired system software release repository. /// - /// New zones will use artifacts in this repo as their image sources, + /// New zones may use artifacts in this repo as their image sources, /// and at most one extant zone may be modified to use it or replaced /// with one that does. pub tuf_repo: Option, + + /// Previous system software release repository. + /// + /// New zones deployed mid-update may use artifacts in this repo as + /// their image sources. See RFD 565 §9. 
+ pub old_repo: Option, } #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] @@ -1100,6 +1110,7 @@ impl PlanningInputBuilder { clickhouse_policy: None, oximeter_read_policy: OximeterReadPolicy::new(1), tuf_repo: None, + old_repo: None, }, internal_dns_version: Generation::new(), external_dns_version: Generation::new(), From 194a8a3d3631acec4a52080333ed94694294adb3 Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Mon, 28 Apr 2025 10:01:51 -0600 Subject: [PATCH 05/18] Don't update an already up-to-date zone --- nexus/reconfigurator/planning/src/planner.rs | 61 ++++++++++++-------- 1 file changed, 36 insertions(+), 25 deletions(-) diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index dd39b53fb7c..98d1c1b7f98 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -955,31 +955,42 @@ impl<'a> Planner<'a> { ) -> Result<(), Error> { let zone_kind = zone.zone_type.kind(); let image_source = self.blueprint.zone_image_source(zone_kind); - match zone_kind { - ZoneKind::Crucible - | ZoneKind::CockroachDb - | ZoneKind::Clickhouse => { - info!( - self.log, "updating zone image source in-place"; - "sled_id" => %sled_id, - "zone_id" => %zone.id, - "kind" => ?zone.zone_type.kind(), - "image_source" => %image_source, - ); - self.blueprint.sled_set_zone_source( - sled_id, - zone.id, - image_source, - )?; - } - _ => { - info!( - self.log, "expunging out-of-date zone"; - "sled_id" => %sled_id, - "zone_id" => %zone.id, - "kind" => ?zone.zone_type.kind(), - ); - self.blueprint.sled_expunge_zone(sled_id, zone.id)?; + if zone.image_source == image_source { + // This should only happen in the event of a planning error above. + warn!( + self.log, "zone is already up-to-date"; + "sled_id" => %sled_id, + "zone_id" => %zone.id, + "kind" => ?zone.zone_type.kind(), + "image_source" => %image_source, + ); + } else { + match zone_kind { + ZoneKind::Crucible + | ZoneKind::CockroachDb + | ZoneKind::Clickhouse => { + info!( + self.log, "updating zone image source in-place"; + "sled_id" => %sled_id, + "zone_id" => %zone.id, + "kind" => ?zone.zone_type.kind(), + "image_source" => %image_source, + ); + self.blueprint.sled_set_zone_source( + sled_id, + zone.id, + image_source, + )?; + } + _ => { + info!( + self.log, "expunging out-of-date zone"; + "sled_id" => %sled_id, + "zone_id" => %zone.id, + "kind" => ?zone.zone_type.kind(), + ); + self.blueprint.sled_expunge_zone(sled_id, zone.id)?; + } } } Ok(()) From 50c116974728de6371106fd532d9884dfb8039a6 Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Thu, 1 May 2025 20:33:37 -0600 Subject: [PATCH 06/18] Don't trust inventory zones' image_source --- nexus/db-model/src/inventory.rs | 1 + nexus/reconfigurator/planning/src/planner.rs | 16 ++++++---------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index 71ebe4e145d..e76bd4be617 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -1632,6 +1632,7 @@ impl InvOmicronZone { .filesystem_pool .map(|id| ZpoolName::new_external(id.into())), zone_type, + // FIXME image_source: OmicronZoneImageSource::InstallDataset, }) } diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 98d1c1b7f98..b50db73eca3 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -887,27 +887,23 @@ impl<'a> Planner<'a> { 
.map(|(id, _details)| id) .collect::>(); - // Wait for all current zones to become up-to-date. + // Wait for all current zones to appear in the inventory. + // It would be nice if we could check their image source, + // but the inventory doesn't report that correctly: see + // . let inventory_zones = self .inventory .all_omicron_zones() .map(|z| (z.id, z)) .collect::>(); for sled_id in sleds.iter().cloned() { - if self + if !self .blueprint .current_sled_zones( sled_id, BlueprintZoneDisposition::is_in_service, ) - .any(|zone| { - inventory_zones - .get(&zone.id) - .map(|z| { - z.image_source != zone.image_source.clone().into() - }) - .unwrap_or(false) - }) + .all(|zone| inventory_zones.contains_key(&zone.id)) { info!( self.log, "zones not yet up-to-date"; From ad7103e76316cb5c5507afd03b58feec0d3d536d Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Sat, 3 May 2025 02:55:30 +0000 Subject: [PATCH 07/18] Fix failing tests --- .../output/cmd-expunge-newly-added-stdout | 2 +- nexus/reconfigurator/planning/src/planner.rs | 84 ++++++++++++++++--- 2 files changed, 74 insertions(+), 12 deletions(-) diff --git a/dev-tools/reconfigurator-cli/tests/output/cmd-expunge-newly-added-stdout b/dev-tools/reconfigurator-cli/tests/output/cmd-expunge-newly-added-stdout index c1c471f0476..1e1b1fe879d 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmd-expunge-newly-added-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmd-expunge-newly-added-stdout @@ -628,7 +628,7 @@ INFO sufficient InternalDns zones exist in plan, desired_count: 3, current_count INFO added zone to sled, sled_id: a88790de-5962-4871-8686-61c1fd5b7094, kind: ExternalDns INFO sufficient Nexus zones exist in plan, desired_count: 3, current_count: 3 INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0 -INFO all zones up-to-date +INFO zones not yet up-to-date, sled_id: a88790de-5962-4871-8686-61c1fd5b7094 INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify generated blueprint 9c998c1d-1a7b-440a-ae0c-40f781dea6e2 based on parent blueprint 366b0b68-d80e-4bc1-abd3-dc69837847e0 diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index b50db73eca3..8ca2810b0f3 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -4542,13 +4542,13 @@ pub(crate) mod test { // We should start with no specified TUF repo and nothing to do. assert!(example.input.tuf_repo().is_none()); - assert_planning_makes_no_changes( - &logctx.log, - &blueprint1, - &example.input, - &example.collection, - TEST_NAME, - ); + // assert_planning_makes_no_changes( + // &logctx.log, + // &blueprint1, + // &example.input, + // &example.collection, + // TEST_NAME, + // ); // All zones should be sourced from the install dataset by default. assert!( @@ -4659,16 +4659,46 @@ pub(crate) mod test { && zone.image_source == image_source }; - // Request another Nexus zone. This should *not* use the new artifact, - // since not all of its dependencies can be updated. + // Request another Nexus zone. input_builder.policy_mut().target_nexus_zone_count = input_builder.policy_mut().target_nexus_zone_count + 1; let input = input_builder.build(); + // Check that there is a new nexus zone that does *not* use the new + // artifact (since not all of its dependencies are updated yet). 
+ update_collection_from_blueprint(&mut example, &blueprint2); + let blueprint3 = Planner::new_based_on( + log.clone(), + &blueprint2, + &input, + "test_blueprint3", + &example.collection, + ) + .expect("can't create planner") + .with_rng(PlannerRng::from_seed((TEST_NAME, "bp3"))) + .plan() + .expect("can't re-plan for new Nexus zone"); + { + let summary = blueprint3.diff_since_blueprint(&blueprint2); + for sled in summary.diff.sleds.modified_values_diff() { + assert!(sled.zones.removed.is_empty()); + assert_eq!(sled.zones.added.len(), 1); + let added = sled.zones.added.values().next().unwrap(); + assert!(matches!( + &added.zone_type, + BlueprintZoneType::Nexus(_) + )); + assert!(matches!( + &added.image_source, + BlueprintZoneImageSource::InstallDataset + )); + } + } + // We should now have three sets of expunge/add iterations for the // Crucible Pantry zones. - let mut parent = blueprint2; - for i in 3..=8 { + let mut parent = blueprint3; + for i in 4..=9 { let blueprint_name = format!("blueprint_{i}"); update_collection_from_blueprint(&mut example, &parent); let blueprint = Planner::new_based_on( @@ -4682,6 +4712,38 @@ pub(crate) mod test { .with_rng(PlannerRng::from_seed((TEST_NAME, &blueprint_name))) .plan() .unwrap_or_else(|_| panic!("can't re-plan after {i} iterations")); + + let summary = blueprint.diff_since_blueprint(&parent); + for sled in summary.diff.sleds.modified_values_diff() { + if i % 2 == 0 { + assert!(sled.zones.added.is_empty()); + assert!(sled.zones.removed.is_empty()); + assert_eq!( + sled.zones + .common + .iter() + .filter(|(_, z)| matches!( + z.after.zone_type, + BlueprintZoneType::CruciblePantry(_) + ) && matches!( + z.after.disposition, + BlueprintZoneDisposition::Expunged { .. } + )) + .count(), + 1 + ); + } else { + assert!(sled.zones.removed.is_empty()); + assert_eq!(sled.zones.added.len(), 1); + let added = sled.zones.added.values().next().unwrap(); + assert!(matches!( + &added.zone_type, + BlueprintZoneType::CruciblePantry(_) + )); + assert_eq!(added.image_source, image_source); + } + } + parent = blueprint; } From 55b8e0ef597676861f4edf1b6410066a481e528f Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Sat, 10 May 2025 11:00:11 -0600 Subject: [PATCH 08/18] =?UTF-8?q?Rename=20datastore=20methods:=20update=5F?= =?UTF-8?q?tuf=5F*=20=E2=86=92=20tuf=5F*?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- nexus/db-queries/src/db/datastore/deployment.rs | 2 +- nexus/db-queries/src/db/datastore/target_release.rs | 2 +- nexus/db-queries/src/db/datastore/update.rs | 10 +++++----- nexus/reconfigurator/preparation/src/lib.rs | 4 ++-- .../app/background/tasks/tuf_artifact_replication.rs | 9 ++------- nexus/src/app/update.rs | 4 ++-- nexus/src/external_api/http_entrypoints.rs | 2 +- nexus/tests/integration_tests/updates.rs | 8 ++++---- 8 files changed, 18 insertions(+), 23 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/deployment.rs b/nexus/db-queries/src/db/datastore/deployment.rs index f4af7a58a10..3349c9a20c7 100644 --- a/nexus/db-queries/src/db/datastore/deployment.rs +++ b/nexus/db-queries/src/db/datastore/deployment.rs @@ -2243,7 +2243,7 @@ mod tests { const SYSTEM_HASH: ArtifactHash = ArtifactHash([3; 32]); datastore - .update_tuf_repo_insert( + .tuf_repo_insert( opctx, &TufRepoDescription { repo: TufRepoMeta { diff --git a/nexus/db-queries/src/db/datastore/target_release.rs b/nexus/db-queries/src/db/datastore/target_release.rs index e5886699ac7..a6f82edff99 100644 --- 
a/nexus/db-queries/src/db/datastore/target_release.rs +++ b/nexus/db-queries/src/db/datastore/target_release.rs @@ -210,7 +210,7 @@ mod test { .parse() .expect("SHA256('')"); let repo = datastore - .update_tuf_repo_insert( + .tuf_repo_insert( opctx, &TufRepoDescription { repo: TufRepoMeta { diff --git a/nexus/db-queries/src/db/datastore/update.rs b/nexus/db-queries/src/db/datastore/update.rs index 6a9b62f2ef0..c27d44d797a 100644 --- a/nexus/db-queries/src/db/datastore/update.rs +++ b/nexus/db-queries/src/db/datastore/update.rs @@ -76,7 +76,7 @@ impl DataStore { /// `TufRepoDescription` if one was already found. (This is not an upsert, /// because if we know about an existing repo but with different contents, /// we reject that.) - pub async fn update_tuf_repo_insert( + pub async fn tuf_repo_insert( &self, opctx: &OpContext, description: &external::TufRepoDescription, @@ -108,7 +108,7 @@ impl DataStore { } /// Returns a TUF repo description. - pub async fn update_tuf_repo_get_by_id( + pub async fn tuf_repo_get_by_id( &self, opctx: &OpContext, repo_id: TypedUuid, @@ -141,7 +141,7 @@ impl DataStore { } /// Returns the TUF repo description corresponding to this system version. - pub async fn update_tuf_repo_get_by_version( + pub async fn tuf_repo_get_by_version( &self, opctx: &OpContext, system_version: SemverVersion, @@ -174,7 +174,7 @@ impl DataStore { } /// Returns the list of all TUF repo artifacts known to the system. - pub async fn update_tuf_artifact_list( + pub async fn tuf_list_repos( &self, opctx: &OpContext, generation: Generation, @@ -194,7 +194,7 @@ impl DataStore { } /// Returns the current TUF repo generation number. - pub async fn update_tuf_generation_get( + pub async fn tuf_get_generation( &self, opctx: &OpContext, ) -> LookupResult { diff --git a/nexus/reconfigurator/preparation/src/lib.rs b/nexus/reconfigurator/preparation/src/lib.rs index dea9e9b7791..59e6a8ab6aa 100644 --- a/nexus/reconfigurator/preparation/src/lib.rs +++ b/nexus/reconfigurator/preparation/src/lib.rs @@ -156,7 +156,7 @@ impl PlanningInputFromDb<'_> { None => None, Some(repo_id) => Some( datastore - .update_tuf_repo_get_by_id(opctx, repo_id.into()) + .tuf_repo_get_by_id(opctx, repo_id.into()) .await .internal_context("fetching target release repo")? .into_external(), @@ -175,7 +175,7 @@ impl PlanningInputFromDb<'_> { None => None, Some(repo_id) => Some( datastore - .update_tuf_repo_get_by_id(opctx, repo_id.into()) + .tuf_repo_get_by_id(opctx, repo_id.into()) .await .internal_context("fetching target release repo")? 
.into_external(), diff --git a/nexus/src/app/background/tasks/tuf_artifact_replication.rs b/nexus/src/app/background/tasks/tuf_artifact_replication.rs index 0ede39b6bdc..4f7fd5dd70d 100644 --- a/nexus/src/app/background/tasks/tuf_artifact_replication.rs +++ b/nexus/src/app/background/tasks/tuf_artifact_replication.rs @@ -590,18 +590,13 @@ impl ArtifactReplication { &self, opctx: &OpContext, ) -> Result<(ArtifactConfig, Inventory)> { - let generation = - self.datastore.update_tuf_generation_get(opctx).await?; + let generation = self.datastore.tuf_get_generation(opctx).await?; let mut inventory = Inventory::default(); let mut paginator = Paginator::new(SQL_BATCH_SIZE); while let Some(p) = paginator.next() { let batch = self .datastore - .update_tuf_artifact_list( - opctx, - generation, - &p.current_pagparams(), - ) + .tuf_list_repos(opctx, generation, &p.current_pagparams()) .await?; paginator = p.found_batch(&batch, &|a| a.id.into_untyped_uuid()); for artifact in batch { diff --git a/nexus/src/app/update.rs b/nexus/src/app/update.rs index 5ba1b9e0a40..a843fb072b6 100644 --- a/nexus/src/app/update.rs +++ b/nexus/src/app/update.rs @@ -43,7 +43,7 @@ impl super::Nexus { // Now store the artifacts in the database. let response = self .db_datastore - .update_tuf_repo_insert(opctx, artifacts_with_plan.description()) + .tuf_repo_insert(opctx, artifacts_with_plan.description()) .await .map_err(HttpError::from)?; @@ -88,7 +88,7 @@ impl super::Nexus { let tuf_repo_description = self .db_datastore - .update_tuf_repo_get_by_version(opctx, system_version.into()) + .tuf_repo_get_by_version(opctx, system_version.into()) .await .map_err(HttpError::from)?; diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index ed0089f4c8d..adafe160f61 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -6575,7 +6575,7 @@ impl NexusExternalApi for NexusExternalApiImpl { // Fetch the TUF repo metadata and update the target release. let tuf_repo_id = nexus .datastore() - .update_tuf_repo_get_by_version(&opctx, system_version.into()) + .tuf_repo_get_by_version(&opctx, system_version.into()) .await? .repo .id; diff --git a/nexus/tests/integration_tests/updates.rs b/nexus/tests/integration_tests/updates.rs index 9e2a6ed95a2..e030a6342b1 100644 --- a/nexus/tests/integration_tests/updates.rs +++ b/nexus/tests/integration_tests/updates.rs @@ -114,7 +114,7 @@ async fn test_repo_upload() -> Result<()> { let opctx = OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); assert_eq!( - datastore.update_tuf_generation_get(&opctx).await.unwrap(), + datastore.tuf_get_generation(&opctx).await.unwrap(), 1u32.into() ); @@ -162,7 +162,7 @@ async fn test_repo_upload() -> Result<()> { })); // The generation number should now be 2. assert_eq!( - datastore.update_tuf_generation_get(&opctx).await.unwrap(), + datastore.tuf_get_generation(&opctx).await.unwrap(), 2u32.into() ); @@ -215,7 +215,7 @@ async fn test_repo_upload() -> Result<()> { // We didn't insert a new repo, so the generation number should still be 2. assert_eq!( - datastore.update_tuf_generation_get(&opctx).await.unwrap(), + datastore.tuf_get_generation(&opctx).await.unwrap(), 2u32.into() ); @@ -361,7 +361,7 @@ async fn test_repo_upload() -> Result<()> { } // No artifacts changed, so the generation number should still be 2... 
assert_eq!( - datastore.update_tuf_generation_get(&opctx).await.unwrap(), + datastore.tuf_get_generation(&opctx).await.unwrap(), 2u32.into() ); // ... and the task should have nothing to do and should immediately From a7c1e1752972e95bf0f61774a6febf03df394130 Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Sat, 10 May 2025 11:15:57 -0600 Subject: [PATCH 09/18] Fix typed UUID --- nexus/db-queries/src/db/datastore/update.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/update.rs b/nexus/db-queries/src/db/datastore/update.rs index c27d44d797a..f38aa73d5d7 100644 --- a/nexus/db-queries/src/db/datastore/update.rs +++ b/nexus/db-queries/src/db/datastore/update.rs @@ -17,7 +17,9 @@ use diesel::result::Error as DieselError; use nexus_db_errors::OptionalError; use nexus_db_errors::{ErrorHandler, public_error_from_diesel}; use nexus_db_lookup::DbConnection; -use nexus_db_model::{ArtifactHash, TufArtifact, TufRepo, TufRepoDescription}; +use nexus_db_model::{ + ArtifactHash, TufArtifact, TufRepo, TufRepoDescription, to_db_typed_uuid, +}; use omicron_common::api::external::{ self, CreateResult, DataPageParams, Generation, ListResultVec, LookupResult, LookupType, ResourceType, TufRepoInsertStatus, @@ -118,9 +120,8 @@ impl DataStore { use nexus_db_schema::schema::tuf_repo::dsl; let conn = self.pool_connection_authorized(opctx).await?; - let repo_id = repo_id.into_untyped_uuid(); let repo = dsl::tuf_repo - .filter(dsl::id.eq(repo_id)) + .filter(dsl::id.eq(to_db_typed_uuid(repo_id))) .select(TufRepo::as_select()) .first_async::(&*conn) .await @@ -129,7 +130,7 @@ impl DataStore { e, ErrorHandler::NotFoundByLookup( ResourceType::TufRepo, - LookupType::ById(repo_id), + LookupType::ById(repo_id.into_untyped_uuid()), ), ) })?; From 57f7154f60dc1fc0bfcbaf0b9df725137cc0777b Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Sat, 10 May 2025 11:31:09 -0600 Subject: [PATCH 10/18] =?UTF-8?q?Rename=20ControlPlaneZone=20=E2=86=92=20N?= =?UTF-8?q?onNexusOmicronZone?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../planning/src/blueprint_builder/builder.rs | 2 +- nexus/reconfigurator/planning/src/planner.rs | 4 ++-- .../planning/src/planner/update_sequence.rs | 14 +++++++------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs index d3f7854d089..6b33dcd6633 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs @@ -1930,7 +1930,7 @@ impl<'a> BlueprintBuilder<'a> { let new_artifact = Self::zone_image_artifact(new_repo, zone_kind); let old_artifact = Self::zone_image_artifact(old_repo, zone_kind); if let Some(prev) = OrderedComponent::from(zone_kind).prev() { - if prev >= OrderedComponent::ControlPlaneZone + if prev >= OrderedComponent::NonNexusOmicronZone && self.sled_ids_with_zones().any(|sled_id| { self.current_sled_zones( sled_id, diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 8ca2810b0f3..e94bffd6093 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -4876,7 +4876,7 @@ pub(crate) mod test { input_builder.policy_mut().tuf_repo = Some(tuf_repo); let input = input_builder.build(); - let mut comp = OrderedComponent::ControlPlaneZone; + let mut comp = 
OrderedComponent::NonNexusOmicronZone; let mut parent = blueprint1; const MAX_PLANNING_ITERATIONS: usize = 100; for i in 2..=MAX_PLANNING_ITERATIONS { @@ -4915,7 +4915,7 @@ pub(crate) mod test { OrderedComponent::HostOs | OrderedComponent::SpRot => { unreachable!("can only update zones") } - OrderedComponent::ControlPlaneZone => { + OrderedComponent::NonNexusOmicronZone => { assert!( blueprint .all_omicron_zones( diff --git a/nexus/reconfigurator/planning/src/planner/update_sequence.rs b/nexus/reconfigurator/planning/src/planner/update_sequence.rs index b675dd3dec8..873ec520751 100644 --- a/nexus/reconfigurator/planning/src/planner/update_sequence.rs +++ b/nexus/reconfigurator/planning/src/planner/update_sequence.rs @@ -14,7 +14,7 @@ use strum::{EnumIter, IntoEnumIterator as _}; pub enum OrderedComponent { HostOs, SpRot, - ControlPlaneZone, + NonNexusOmicronZone, NexusZone, } @@ -53,7 +53,7 @@ impl From for OrderedComponent { fn from(zone_kind: ZoneKind) -> Self { match zone_kind { ZoneKind::Nexus => Self::NexusZone, - _ => Self::ControlPlaneZone, + _ => Self::NonNexusOmicronZone, } } } @@ -76,19 +76,19 @@ mod test { ); assert_eq!( OrderedComponent::SpRot.next(), - Some(OrderedComponent::ControlPlaneZone) + Some(OrderedComponent::NonNexusOmicronZone) ); assert_eq!( - OrderedComponent::ControlPlaneZone.prev(), + OrderedComponent::NonNexusOmicronZone.prev(), Some(OrderedComponent::SpRot) ); assert_eq!( - OrderedComponent::ControlPlaneZone.next(), + OrderedComponent::NonNexusOmicronZone.next(), Some(OrderedComponent::NexusZone) ); assert_eq!( OrderedComponent::NexusZone.prev(), - Some(OrderedComponent::ControlPlaneZone) + Some(OrderedComponent::NonNexusOmicronZone) ); assert_eq!(OrderedComponent::NexusZone.next(), None); assert!(OrderedComponent::HostOs < OrderedComponent::NexusZone); @@ -98,7 +98,7 @@ mod test { fn ordered_component_from_zone_kind() { assert!(matches!( OrderedComponent::from(ZoneKind::CruciblePantry), - OrderedComponent::ControlPlaneZone + OrderedComponent::NonNexusOmicronZone )); assert!(matches!( OrderedComponent::from(ZoneKind::Nexus), From 77b21563f48fd868565136fa212a4428fa56a683 Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Sat, 10 May 2025 12:14:21 -0600 Subject: [PATCH 11/18] Simplify OrderedComponent logic --- .../planning/src/blueprint_builder/builder.rs | 36 ++++---- nexus/reconfigurator/planning/src/planner.rs | 2 +- .../planning/src/planner/update_sequence.rs | 86 +------------------ 3 files changed, 23 insertions(+), 101 deletions(-) diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs index 6b33dcd6633..c64fe5bc6bd 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs @@ -1929,28 +1929,32 @@ impl<'a> BlueprintBuilder<'a> { let old_repo = self.input.old_repo(); let new_artifact = Self::zone_image_artifact(new_repo, zone_kind); let old_artifact = Self::zone_image_artifact(old_repo, zone_kind); - if let Some(prev) = OrderedComponent::from(zone_kind).prev() { - if prev >= OrderedComponent::NonNexusOmicronZone - && self.sled_ids_with_zones().any(|sled_id| { + match OrderedComponent::from(zone_kind) { + // Nexus can only be updated if all non-Nexus zones have been updated. 
+ OrderedComponent::NexusZone => { + if self.sled_ids_with_zones().any(|sled_id| { self.current_sled_zones( sled_id, BlueprintZoneDisposition::is_in_service, ) - .any(|z| { - let kind = z.zone_type.kind(); - let old_artifact = - Self::zone_image_artifact(old_repo, kind); - OrderedComponent::from(kind) == prev - && z.image_source == old_artifact + .filter(|z| { + OrderedComponent::from(z.zone_type.kind()) + == OrderedComponent::NonNexusOmicronZone }) - }) - { - old_artifact - } else { - new_artifact + .any(|z| z.image_source != new_artifact) + }) { + // Some dependent zone is not up-to-date. + old_artifact + } else { + // All dependent zones are up-to-date. + new_artifact + } + } + // It's always safe to use the new artifact for non-Nexus zones. + OrderedComponent::NonNexusOmicronZone => new_artifact, + OrderedComponent::HostOs | OrderedComponent::SpRot => { + unreachable!("can't get an OS or SP/RoT image from a zone") } - } else { - new_artifact } } } diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index e94bffd6093..5ea6497e81d 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -4924,7 +4924,7 @@ pub(crate) mod test { .all(|(_, zone)| match zone.zone_type.kind() { ZoneKind::Nexus => { if zone.image_source == image_source { - comp = comp.next().unwrap(); + comp = OrderedComponent::NexusZone; true } else { zone.image_source diff --git a/nexus/reconfigurator/planning/src/planner/update_sequence.rs b/nexus/reconfigurator/planning/src/planner/update_sequence.rs index 873ec520751..066bc18d9b6 100644 --- a/nexus/reconfigurator/planning/src/planner/update_sequence.rs +++ b/nexus/reconfigurator/planning/src/planner/update_sequence.rs @@ -5,12 +5,10 @@ //! Updatable components ordered by dependencies (RFD 565). use nexus_sled_agent_shared::inventory::ZoneKind; -use strum::{EnumIter, IntoEnumIterator as _}; /// Update sequence as defined by RFD 565 §6. -#[derive( - Debug, Clone, Copy, PartialEq, Eq, Hash, EnumIter, PartialOrd, Ord, -)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[allow(dead_code)] pub enum OrderedComponent { HostOs, SpRot, @@ -18,37 +16,6 @@ pub enum OrderedComponent { NexusZone, } -/// To implement the checks in RFD 565 §9, we need to go backwards -/// (and mabye forwards) through the component ordering. The simple -/// linear scans here are fine as long as `OrderedComponent` has only -/// a few variants; if it starts getting large (e.g., programatically -/// generated from `ls-apis`), we can switch to something faster. -impl OrderedComponent { - #[allow(dead_code)] - pub fn next(&self) -> Option { - let mut found = false; - for comp in OrderedComponent::iter() { - if found { - return Some(comp); - } else if comp == *self { - found = true; - } - } - None - } - - pub fn prev(&self) -> Option { - let mut prev = None; - for comp in OrderedComponent::iter() { - if comp == *self { - return prev; - } - prev = Some(comp); - } - prev - } -} - impl From for OrderedComponent { fn from(zone_kind: ZoneKind) -> Self { match zone_kind { @@ -57,52 +24,3 @@ impl From for OrderedComponent { } } } - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn ordered_component_next_prev() { - // Exhaustive checks ok for a few variants, revisit if it grows. 
- assert_eq!(OrderedComponent::HostOs.prev(), None); - assert_eq!( - OrderedComponent::HostOs.next(), - Some(OrderedComponent::SpRot) - ); - assert_eq!( - OrderedComponent::SpRot.prev(), - Some(OrderedComponent::HostOs) - ); - assert_eq!( - OrderedComponent::SpRot.next(), - Some(OrderedComponent::NonNexusOmicronZone) - ); - assert_eq!( - OrderedComponent::NonNexusOmicronZone.prev(), - Some(OrderedComponent::SpRot) - ); - assert_eq!( - OrderedComponent::NonNexusOmicronZone.next(), - Some(OrderedComponent::NexusZone) - ); - assert_eq!( - OrderedComponent::NexusZone.prev(), - Some(OrderedComponent::NonNexusOmicronZone) - ); - assert_eq!(OrderedComponent::NexusZone.next(), None); - assert!(OrderedComponent::HostOs < OrderedComponent::NexusZone); - } - - #[test] - fn ordered_component_from_zone_kind() { - assert!(matches!( - OrderedComponent::from(ZoneKind::CruciblePantry), - OrderedComponent::NonNexusOmicronZone - )); - assert!(matches!( - OrderedComponent::from(ZoneKind::Nexus), - OrderedComponent::NexusZone - )); - } -} From e31fc60985db98643ba702fbf2e9c8279915e62e Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Sat, 10 May 2025 12:41:49 -0600 Subject: [PATCH 12/18] Simplify out-of-date zone collection --- nexus/reconfigurator/planning/src/planner.rs | 22 +++++++++----------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 5ea6497e81d..384a442ebb9 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -37,7 +37,6 @@ use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::SledUuid; use slog::error; use slog::{Logger, info, warn}; -use std::collections::BTreeMap; use std::collections::BTreeSet; use std::str::FromStr; @@ -888,22 +887,22 @@ impl<'a> Planner<'a> { .collect::>(); // Wait for all current zones to appear in the inventory. - // It would be nice if we could check their image source, - // but the inventory doesn't report that correctly: see + // TODO-correctness: We should check their image source, + // but the inventory doesn't report that yet; see // . 
let inventory_zones = self .inventory .all_omicron_zones() - .map(|z| (z.id, z)) - .collect::>(); - for sled_id in sleds.iter().cloned() { + .map(|z| z.id) + .collect::>(); + for &sled_id in &sleds { if !self .blueprint .current_sled_zones( sled_id, BlueprintZoneDisposition::is_in_service, ) - .all(|zone| inventory_zones.contains_key(&zone.id)) + .all(|zone| inventory_zones.contains(&zone.id)) { info!( self.log, "zones not yet up-to-date"; @@ -917,19 +916,18 @@ impl<'a> Planner<'a> { let mut out_of_date_zones = sleds .into_iter() .flat_map(|sled_id| { - self.blueprint + let blueprint = &self.blueprint; + blueprint .current_sled_zones( sled_id, BlueprintZoneDisposition::is_in_service, ) - .filter_map(|zone| { + .filter_map(move |zone| { (zone.image_source - != self - .blueprint + != blueprint .zone_image_source(zone.zone_type.kind())) .then(|| (sled_id, zone.clone())) }) - .collect::>() }) .collect::>(); From 7f1f52244c6a4d9eac2ad677323a8903f32065f5 Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Sat, 10 May 2025 12:43:56 -0600 Subject: [PATCH 13/18] Make planning error an error --- nexus/reconfigurator/planning/src/blueprint_builder/builder.rs | 2 ++ nexus/reconfigurator/planning/src/planner.rs | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs index c64fe5bc6bd..50686c096d0 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs @@ -125,6 +125,8 @@ pub enum Error { AllocateExternalNetworking(#[from] ExternalNetworkingError), #[error("can only have {INTERNAL_DNS_REDUNDANCY} internal DNS servers")] PolicySpecifiesTooManyInternalDnsServers, + #[error("zone is already up-to-date and should not be updated")] + ZoneAlreadyUpToDate, } /// Describes the result of an idempotent "ensure" operation diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 384a442ebb9..02f8b5abe1f 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -951,13 +951,14 @@ impl<'a> Planner<'a> { let image_source = self.blueprint.zone_image_source(zone_kind); if zone.image_source == image_source { // This should only happen in the event of a planning error above. 
- warn!( + error!( self.log, "zone is already up-to-date"; "sled_id" => %sled_id, "zone_id" => %zone.id, "kind" => ?zone.zone_type.kind(), "image_source" => %image_source, ); + return Err(Error::ZoneAlreadyUpToDate); } else { match zone_kind { ZoneKind::Crucible From 050933ef758a22e473be7f0b18091e3565d20e74 Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Sat, 10 May 2025 12:47:26 -0600 Subject: [PATCH 14/18] Explicitly list which zone kinds get in-place updates --- nexus/reconfigurator/planning/src/planner.rs | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 02f8b5abe1f..21d2d5b7605 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -962,8 +962,10 @@ impl<'a> Planner<'a> { } else { match zone_kind { ZoneKind::Crucible - | ZoneKind::CockroachDb - | ZoneKind::Clickhouse => { + | ZoneKind::Clickhouse + | ZoneKind::ClickhouseKeeper + | ZoneKind::ClickhouseServer + | ZoneKind::CockroachDb => { info!( self.log, "updating zone image source in-place"; "sled_id" => %sled_id, @@ -977,7 +979,13 @@ impl<'a> Planner<'a> { image_source, )?; } - _ => { + ZoneKind::BoundaryNtp + | ZoneKind::CruciblePantry + | ZoneKind::ExternalDns + | ZoneKind::InternalDns + | ZoneKind::InternalNtp + | ZoneKind::Nexus + | ZoneKind::Oximeter => { info!( self.log, "expunging out-of-date zone"; "sled_id" => %sled_id, From 2daabf5126ad1c83ac231c04aa75182e7cbaeea7 Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Sat, 10 May 2025 12:50:39 -0600 Subject: [PATCH 15/18] Uncomment test assertion --- nexus/reconfigurator/planning/src/planner.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 21d2d5b7605..9f6fd623a90 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -4549,13 +4549,13 @@ pub(crate) mod test { // We should start with no specified TUF repo and nothing to do. assert!(example.input.tuf_repo().is_none()); - // assert_planning_makes_no_changes( - // &logctx.log, - // &blueprint1, - // &example.input, - // &example.collection, - // TEST_NAME, - // ); + assert_planning_makes_no_changes( + &logctx.log, + &blueprint1, + &example.input, + &example.collection, + TEST_NAME, + ); // All zones should be sourced from the install dataset by default. assert!( From 88733d257718496cd1492da26c34dd409eb82c42 Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Sat, 10 May 2025 13:00:53 -0600 Subject: [PATCH 16/18] Type-safe fake-zone names --- nexus/reconfigurator/planning/src/planner.rs | 70 ++++++++------------ 1 file changed, 28 insertions(+), 42 deletions(-) diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 9f6fd623a90..66d3ffffb76 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -4532,6 +4532,20 @@ pub(crate) mod test { example.system.to_collection_builder().unwrap().build(); } + macro_rules! 
fake_zone_artifact { + ($kind: ident, $version: expr) => { + TufArtifactMeta { + id: ArtifactId { + name: ZoneKind::$kind.artifact_name().to_string(), + version: $version, + kind: ArtifactKind::from_known(KnownArtifactKind::Zone), + }, + hash: ArtifactHash([0; 32]), + size: 0, + } + }; + } + #[test] fn test_update_crucible_pantry() { static TEST_NAME: &str = "update_crucible_pantry"; @@ -4616,24 +4630,8 @@ pub(crate) mod test { hash: fake_hash, }; let artifacts = vec![ - TufArtifactMeta { - id: ArtifactId { - name: String::from("crucible-pantry-zone"), - version: version.clone(), - kind: ArtifactKind::from_known(KnownArtifactKind::Zone), - }, - hash: fake_hash, - size: 0, - }, - TufArtifactMeta { - id: ArtifactId { - name: String::from("nexus"), - version: version.clone(), - kind: ArtifactKind::from_known(KnownArtifactKind::Zone), - }, - hash: fake_hash, - size: 0, - }, + fake_zone_artifact!(CruciblePantry, version.clone()), + fake_zone_artifact!(Nexus, version.clone()), ]; input_builder.policy_mut().tuf_repo = Some(TufRepoDescription { repo: TufRepoMeta { @@ -4845,19 +4843,6 @@ pub(crate) mod test { }, hash: fake_hash, }; - macro_rules! zone_artifact { - ($name:expr) => { - TufArtifactMeta { - id: ArtifactId { - name: String::from($name), - version: version.clone(), - kind: ArtifactKind::from_known(KnownArtifactKind::Zone), - }, - hash: fake_hash, - size: 0, - } - }; - } let tuf_repo = TufRepoDescription { repo: TufRepoMeta { hash: fake_hash, @@ -4867,17 +4852,18 @@ pub(crate) mod test { file_name: String::from(""), }, artifacts: vec![ - zone_artifact!("ntp"), - zone_artifact!("clickhouse"), - zone_artifact!("clickhouse_keeper"), - zone_artifact!("cockroachdb"), - zone_artifact!("crucible-zone"), - zone_artifact!("crucible-pantry-zone"), - zone_artifact!("external-dns"), - zone_artifact!("internal-dns"), - zone_artifact!("ntp"), - zone_artifact!("nexus"), - zone_artifact!("oximeter"), + fake_zone_artifact!(BoundaryNtp, version.clone()), + fake_zone_artifact!(Clickhouse, version.clone()), + fake_zone_artifact!(ClickhouseKeeper, version.clone()), + fake_zone_artifact!(ClickhouseServer, version.clone()), + fake_zone_artifact!(CockroachDb, version.clone()), + fake_zone_artifact!(Crucible, version.clone()), + fake_zone_artifact!(CruciblePantry, version.clone()), + fake_zone_artifact!(ExternalDns, version.clone()), + fake_zone_artifact!(InternalDns, version.clone()), + fake_zone_artifact!(InternalNtp, version.clone()), + fake_zone_artifact!(Nexus, version.clone()), + fake_zone_artifact!(Oximeter, version.clone()), ], }; input_builder.policy_mut().tuf_repo = Some(tuf_repo); From 3ae171ef4d07b9be399c1c68838741001c8d5b5a Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Sat, 10 May 2025 13:53:53 -0600 Subject: [PATCH 17/18] Fix doc bug from renamed method --- nexus/db-queries/src/db/datastore/update.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/db-queries/src/db/datastore/update.rs b/nexus/db-queries/src/db/datastore/update.rs index f38aa73d5d7..6dcee53e5c8 100644 --- a/nexus/db-queries/src/db/datastore/update.rs +++ b/nexus/db-queries/src/db/datastore/update.rs @@ -31,7 +31,7 @@ use swrite::{SWrite, swrite}; use tufaceous_artifact::ArtifactVersion; use uuid::Uuid; -/// The return value of [`DataStore::update_tuf_repo_insert`]. +/// The return value of [`DataStore::tuf_repo_insert`]. /// /// This is similar to [`external::TufRepoInsertResponse`], but uses /// nexus-db-model's types instead of external types. 
From 3611b428c99ed9d2b993b2eb7db2f4dc57b2aeb1 Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Wed, 21 May 2025 12:03:16 -0600 Subject: [PATCH 18/18] Refactor zone update readiness check --- .../planning/src/blueprint_builder/builder.rs | 64 +++++---- nexus/reconfigurator/planning/src/planner.rs | 131 +++++++++++------- .../planning/src/planner/update_sequence.rs | 8 +- 3 files changed, 120 insertions(+), 83 deletions(-) diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs index a8352019acf..c63cb78e5c1 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs @@ -1947,34 +1947,48 @@ impl<'a> BlueprintBuilder<'a> { ) -> BlueprintZoneImageSource { let new_repo = self.input.tuf_repo(); let old_repo = self.input.old_repo(); - let new_artifact = Self::zone_image_artifact(new_repo, zone_kind); - let old_artifact = Self::zone_image_artifact(old_repo, zone_kind); + Self::zone_image_artifact( + if self.zone_is_ready_for_update(zone_kind, new_repo) { + new_repo + } else { + old_repo + }, + zone_kind, + ) + } + + /// Return `true` iff a zone of the given kind is ready to be updated; + /// i.e., its dependencies have been updated, or its data sufficiently + /// replicated, etc. + fn zone_is_ready_for_update( + &self, + zone_kind: ZoneKind, + new_repo: Option<&TufRepoDescription>, + ) -> bool { match OrderedComponent::from(zone_kind) { - // Nexus can only be updated if all non-Nexus zones have been updated. - OrderedComponent::NexusZone => { - if self.sled_ids_with_zones().any(|sled_id| { - self.current_sled_zones( - sled_id, - BlueprintZoneDisposition::is_in_service, - ) - .filter(|z| { - OrderedComponent::from(z.zone_type.kind()) - == OrderedComponent::NonNexusOmicronZone - }) - .any(|z| z.image_source != new_artifact) - }) { - // Some dependent zone is not up-to-date. - old_artifact - } else { - // All dependent zones are up-to-date. - new_artifact - } - } - // It's always safe to use the new artifact for non-Nexus zones. - OrderedComponent::NonNexusOmicronZone => new_artifact, OrderedComponent::HostOs | OrderedComponent::SpRot => { - unreachable!("can't get an OS or SP/RoT image from a zone") + todo!("can't yet update Host OS or SP/RoT") } + OrderedComponent::OmicronZone(kind) => match kind { + ZoneKind::Nexus => { + // Nexus can only be updated if all non-Nexus zones have been updated, + // i.e., their image source is an artifact from the new repo. + self.sled_ids_with_zones().all(|sled_id| { + self.current_sled_zones( + sled_id, + BlueprintZoneDisposition::is_in_service, + ) + .filter(|z| z.zone_type.kind() != ZoneKind::Nexus) + .all(|z| { + z.image_source + == Self::zone_image_artifact(new_repo, z.kind()) + }) + }) + } + // + // ZoneKind::CockroachDb => todo!("check cluster status in inventory"), + _ => true, // other zone kinds have no special dependencies + }, } } diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 655a44b7004..02387f7aedd 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -4548,6 +4548,8 @@ pub(crate) mod test { }; } + /// Ensure that dependent zones (here just Crucible Pantry) are updated + /// before Nexus. 
#[test] fn test_update_crucible_pantry() { static TEST_NAME: &str = "update_crucible_pantry"; @@ -4654,6 +4656,9 @@ pub(crate) mod test { BlueprintZoneImageSource::InstallDataset ) }; + let is_up_to_date_nexus = |zone: &BlueprintZoneConfig| -> bool { + zone.zone_type.is_nexus() && zone.image_source == image_source + }; let is_old_pantry = |zone: &BlueprintZoneConfig| -> bool { zone.zone_type.is_crucible_pantry() && matches!( @@ -4765,7 +4770,7 @@ pub(crate) mod test { // One more iteration for the last old zone to be expunged. update_collection_from_blueprint(&mut example, &parent); - let blueprint = Planner::new_based_on( + let blueprint10 = Planner::new_based_on( log.clone(), &parent, &input, @@ -4777,16 +4782,17 @@ pub(crate) mod test { .plan() .expect("replan for last blueprint"); + // All old Pantry zones should now be expunged. assert_eq!( - blueprint + blueprint10 .all_omicron_zones(BlueprintZoneDisposition::is_expunged) .filter(|(_, z)| is_old_pantry(z)) .count(), CRUCIBLE_PANTRY_REDUNDANCY ); - // We won't be able to update Nexus until *all* of its dependent zones - // are updated. Stay tuned for the next test! + // Now we can update Nexus, because all of its dependent zones + // are up-to-date w/r/t the new repo. assert_eq!( parent .all_omicron_zones(BlueprintZoneDisposition::is_in_service) @@ -4794,12 +4800,58 @@ pub(crate) mod test { .count(), NEXUS_REDUNDANCY + 1, ); + let mut parent = blueprint10; + for i in 11..=17 { + let blueprint_name = format!("blueprint_{i}"); + update_collection_from_blueprint(&mut example, &parent); + + let blueprint = Planner::new_based_on( + log.clone(), + &parent, + &input, + &blueprint_name, + &example.collection, + ) + .expect("can't create planner") + .with_rng(PlannerRng::from_seed((TEST_NAME, &blueprint_name))) + .plan() + .unwrap_or_else(|_| panic!("can't re-plan after {i} iterations")); + + let summary = blueprint.diff_since_blueprint(&parent); + eprintln!("{}", summary.display()); + for sled in summary.diff.sleds.modified_values_diff() { + if i % 2 == 0 { + assert!(sled.zones.added.is_empty()); + assert!(sled.zones.removed.is_empty()); + } else { + assert!(sled.zones.removed.is_empty()); + assert_eq!(sled.zones.added.len(), 1); + let added = sled.zones.added.values().next().unwrap(); + assert!(matches!( + &added.zone_type, + BlueprintZoneType::Nexus(_) + )); + assert_eq!(added.image_source, image_source); + } + } + + parent = blueprint; + } // Everything's up-to-date in Kansas City! - update_collection_from_blueprint(&mut example, &blueprint); + let blueprint17 = parent; + assert_eq!( + blueprint17 + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .filter(|(_, z)| is_up_to_date_nexus(z)) + .count(), + NEXUS_REDUNDANCY + 1, + ); + + update_collection_from_blueprint(&mut example, &blueprint17); assert_planning_makes_no_changes( &logctx.log, - &blueprint, + &blueprint17, &input, &example.collection, TEST_NAME, @@ -4808,6 +4860,7 @@ pub(crate) mod test { logctx.cleanup_successful(); } + /// Ensure that planning to update all zones terminates. #[test] fn test_update_all_zones() { static TEST_NAME: &str = "update_all_zones"; @@ -4871,9 +4924,17 @@ pub(crate) mod test { input_builder.policy_mut().tuf_repo = Some(tuf_repo); let input = input_builder.build(); - let mut comp = OrderedComponent::NonNexusOmicronZone; - let mut parent = blueprint1; + /// Expected number of planner iterations required to converge. 
+        /// If incidental planner work changes this value occasionally,
+        /// that's fine; but if we find we're changing it all the time,
+        /// we should probably drop it and keep just the maximum below.
+        const EXP_PLANNING_ITERATIONS: usize = 57;
+
+        /// Planning must not take more than this number of iterations.
         const MAX_PLANNING_ITERATIONS: usize = 100;
+        assert!(EXP_PLANNING_ITERATIONS < MAX_PLANNING_ITERATIONS);
+
+        let mut parent = blueprint1;
         for i in 2..=MAX_PLANNING_ITERATIONS {
             let blueprint_name = format!("blueprint_{i}");
             update_collection_from_blueprint(&mut example, &parent);
@@ -4894,62 +4955,28 @@ pub(crate) mod test {
                 && summary.total_zones_removed() == 0
                 && summary.total_zones_modified() == 0
             {
-                println!("planning converged after {i} iterations");
                 assert!(
                     blueprint
                         .all_omicron_zones(
                             BlueprintZoneDisposition::is_in_service
                         )
-                        .all(|(_, zone)| zone.image_source == image_source)
+                        .all(|(_, zone)| zone.image_source == image_source),
+                    "failed to update all zones"
+                );
+
+                assert_eq!(
+                    i, EXP_PLANNING_ITERATIONS,
+                    "expected {EXP_PLANNING_ITERATIONS} iterations but converged in {i}"
                 );
+
+                println!("planning converged after {i} iterations");
+
                 logctx.cleanup_successful();
                 return;
             }
 
-            match comp {
-                OrderedComponent::HostOs | OrderedComponent::SpRot => {
-                    unreachable!("can only update zones")
-                }
-                OrderedComponent::NonNexusOmicronZone => {
-                    assert!(
-                        blueprint
-                            .all_omicron_zones(
-                                BlueprintZoneDisposition::is_in_service
-                            )
-                            .all(|(_, zone)| match zone.zone_type.kind() {
-                                ZoneKind::Nexus => {
-                                    if zone.image_source == image_source {
-                                        comp = OrderedComponent::NexusZone;
-                                        true
-                                    } else {
-                                        zone.image_source
-                                            == BlueprintZoneImageSource::InstallDataset
-                                    }
-                                }
-                                _ => zone.image_source
-                                    == BlueprintZoneImageSource::InstallDataset
-                                    || zone.image_source == image_source,
-                            })
-                    );
-                }
-                OrderedComponent::NexusZone => {
-                    assert!(
-                        blueprint
-                            .all_omicron_zones(
-                                BlueprintZoneDisposition::is_in_service
-                            )
-                            .all(|(_, zone)| match zone.zone_type.kind() {
-                                ZoneKind::Nexus => zone.image_source
-                                    == BlueprintZoneImageSource::InstallDataset
-                                    || zone.image_source == image_source,
-                                _ => zone.image_source == image_source,
-                            })
-                    );
-                }
-            }
-
             parent = blueprint;
         }
+        panic!("did not converge after {MAX_PLANNING_ITERATIONS} iterations");
     }
 }
diff --git a/nexus/reconfigurator/planning/src/planner/update_sequence.rs b/nexus/reconfigurator/planning/src/planner/update_sequence.rs
index 066bc18d9b6..38d28d06e2a 100644
--- a/nexus/reconfigurator/planning/src/planner/update_sequence.rs
+++ b/nexus/reconfigurator/planning/src/planner/update_sequence.rs
@@ -12,15 +12,11 @@ use nexus_sled_agent_shared::inventory::ZoneKind;
 pub enum OrderedComponent {
     HostOs,
     SpRot,
-    NonNexusOmicronZone,
-    NexusZone,
+    OmicronZone(ZoneKind),
 }
 
 impl From<ZoneKind> for OrderedComponent {
     fn from(zone_kind: ZoneKind) -> Self {
-        match zone_kind {
-            ZoneKind::Nexus => Self::NexusZone,
-            _ => Self::NonNexusOmicronZone,
-        }
+        Self::OmicronZone(zone_kind)
     }
 }
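
To summarize the refactor in this last patch: `OrderedComponent` now carries the zone kind directly, and the readiness check in `BlueprintBuilder` decides per kind whether the new TUF repo's artifact may be used. A minimal, self-contained sketch of that gating logic — simplified, hypothetical types standing in for the real `BlueprintZoneConfig`/`TufRepoDescription` machinery:

    #[derive(Clone, Copy, PartialEq, Eq, Debug)]
    enum ZoneKind {
        Nexus,
        CruciblePantry,
        InternalDns,
    }

    #[derive(Clone, Copy, PartialEq, Eq, Debug)]
    enum ImageSource {
        InstallDataset,
        Artifact { generation: u64 },
    }

    struct Zone {
        kind: ZoneKind,
        image_source: ImageSource,
    }

    // Mirrors the patch: Nexus is ready only once every in-service
    // non-Nexus zone already runs the image from the new repo; other
    // kinds have no special dependencies yet and are always ready.
    fn zone_is_ready_for_update(
        kind: ZoneKind,
        zones: &[Zone],
        new_image: ImageSource,
    ) -> bool {
        match kind {
            ZoneKind::Nexus => zones
                .iter()
                .filter(|z| z.kind != ZoneKind::Nexus)
                .all(|z| z.image_source == new_image),
            _ => true,
        }
    }

    fn main() {
        let new_image = ImageSource::Artifact { generation: 2 };
        let zones = vec![
            Zone { kind: ZoneKind::CruciblePantry, image_source: new_image },
            Zone { kind: ZoneKind::InternalDns, image_source: ImageSource::InstallDataset },
            Zone { kind: ZoneKind::Nexus, image_source: ImageSource::InstallDataset },
        ];
        // Nexus is blocked until the internal DNS zone is updated too.
        assert!(!zone_is_ready_for_update(ZoneKind::Nexus, &zones, new_image));
        // Non-Nexus zones may take the new artifact immediately.
        assert!(zone_is_ready_for_update(ZoneKind::InternalDns, &zones, new_image));
        println!("readiness gate behaves as in the planner tests");
    }

Keeping the gate as a single `zone_is_ready_for_update` predicate makes it straightforward to add further per-kind dependencies later (for example the commented-out CockroachDb cluster-status check in the builder diff) without touching the image-source selection itself.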