diff --git a/Cargo.lock b/Cargo.lock index b1e403f8573..fcae953df13 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6464,6 +6464,7 @@ dependencies = [ "oxnet", "proptest", "rand 0.8.5", + "semver 1.0.26", "sled-agent-client", "slog", "slog-error-chain", diff --git a/common/src/api/external/mod.rs b/common/src/api/external/mod.rs index a7eef17b2fb..d6d44558fc6 100644 --- a/common/src/api/external/mod.rs +++ b/common/src/api/external/mod.rs @@ -748,6 +748,10 @@ impl Generation { ); Generation(next_gen) } + + pub const fn prev(&self) -> Option { + if self.0 > 1 { Some(Generation(self.0 - 1)) } else { None } + } } impl<'de> Deserialize<'de> for Generation { diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-stdout index 8895083a72b..e6f2c202c8a 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-stdout @@ -628,6 +628,7 @@ INFO sufficient InternalDns zones exist in plan, desired_count: 3, current_count INFO added zone to sled, sled_id: a88790de-5962-4871-8686-61c1fd5b7094, kind: ExternalDns INFO sufficient Nexus zones exist in plan, desired_count: 3, current_count: 3 INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0 +INFO zones not yet up-to-date, sled_id: a88790de-5962-4871-8686-61c1fd5b7094 INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify generated blueprint 9c998c1d-1a7b-440a-ae0c-40f781dea6e2 based on parent blueprint 366b0b68-d80e-4bc1-abd3-dc69837847e0 diff --git a/nexus-sled-agent-shared/src/inventory.rs b/nexus-sled-agent-shared/src/inventory.rs index 4e6653ea808..9dfd6e787f8 100644 --- a/nexus-sled-agent-shared/src/inventory.rs +++ b/nexus-sled-agent-shared/src/inventory.rs @@ -22,6 +22,7 @@ use omicron_common::{ DatasetConfig, DatasetManagementStatus, DiskManagementStatus, DiskVariant, OmicronPhysicalDiskConfig, }, + update::ArtifactId, zpool_name::ZpoolName, }; use omicron_uuid_kinds::{DatasetUuid, OmicronZoneUuid}; @@ -33,7 +34,7 @@ use serde::{Deserialize, Serialize}; // depend on sled-hardware-types. pub use sled_hardware_types::Baseboard; use strum::EnumIter; -use tufaceous_artifact::ArtifactHash; +use tufaceous_artifact::{ArtifactHash, KnownArtifactKind}; /// Identifies information about disks which may be attached to Sleds. #[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] @@ -577,13 +578,14 @@ impl OmicronZoneType { /// /// # String representations of this type /// -/// There are no fewer than four string representations for this type, all +/// There are no fewer than five string representations for this type, all /// slightly different from each other. /// /// 1. [`Self::zone_prefix`]: Used to construct zone names. /// 2. [`Self::service_prefix`]: Used to construct SMF service names. /// 3. [`Self::name_prefix`]: Used to construct `Name` instances. /// 4. [`Self::report_str`]: Used for reporting and testing. +/// 5. [`Self::artifact_name`]: Used to match TUF artifact names. /// /// There is no `Display` impl to ensure that users explicitly choose the /// representation they want. (Please play close attention to this! The @@ -702,6 +704,39 @@ impl ZoneKind { ZoneKind::Oximeter => "oximeter", } } + + /// Return a string used as an artifact name for control-plane zones. + /// This is **not guaranteed** to be stable. 
+ pub fn artifact_name(self) -> &'static str { + match self { + ZoneKind::BoundaryNtp => "ntp", + ZoneKind::Clickhouse => "clickhouse", + ZoneKind::ClickhouseKeeper => "clickhouse_keeper", + ZoneKind::ClickhouseServer => "clickhouse", + ZoneKind::CockroachDb => "cockroachdb", + ZoneKind::Crucible => "crucible-zone", + ZoneKind::CruciblePantry => "crucible-pantry-zone", + ZoneKind::ExternalDns => "external-dns", + ZoneKind::InternalDns => "internal-dns", + ZoneKind::InternalNtp => "ntp", + ZoneKind::Nexus => "nexus", + ZoneKind::Oximeter => "oximeter", + } + } + + /// Return true if an artifact represents a control plane zone image + /// of this kind. + pub fn is_control_plane_zone_artifact( + self, + artifact_id: &ArtifactId, + ) -> bool { + artifact_id + .kind + .to_known() + .map(|kind| matches!(kind, KnownArtifactKind::Zone)) + .unwrap_or(false) + && artifact_id.name == self.artifact_name() + } } /// Where Sled Agent should get the image for a zone. diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index 62abfacdab2..09ea6ef5025 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -1635,6 +1635,7 @@ impl InvOmicronZone { .filesystem_pool .map(|id| ZpoolName::new_external(id.into())), zone_type, + // FIXME image_source: OmicronZoneImageSource::InstallDataset, }) } diff --git a/nexus/db-queries/src/db/datastore/deployment.rs b/nexus/db-queries/src/db/datastore/deployment.rs index 809df30aa16..5f4205b080e 100644 --- a/nexus/db-queries/src/db/datastore/deployment.rs +++ b/nexus/db-queries/src/db/datastore/deployment.rs @@ -2249,7 +2249,7 @@ mod tests { const SYSTEM_HASH: ArtifactHash = ArtifactHash([3; 32]); datastore - .update_tuf_repo_insert( + .tuf_repo_insert( opctx, &TufRepoDescription { repo: TufRepoMeta { diff --git a/nexus/db-queries/src/db/datastore/target_release.rs b/nexus/db-queries/src/db/datastore/target_release.rs index e98a5ce5209..a6f82edff99 100644 --- a/nexus/db-queries/src/db/datastore/target_release.rs +++ b/nexus/db-queries/src/db/datastore/target_release.rs @@ -7,7 +7,9 @@ use super::DataStore; use crate::authz; use crate::context::OpContext; -use crate::db::model::{SemverVersion, TargetRelease, TargetReleaseSource}; +use crate::db::model::{ + Generation, SemverVersion, TargetRelease, TargetReleaseSource, +}; use async_bb8_diesel::AsyncRunQueryDsl as _; use diesel::insert_into; use diesel::prelude::*; @@ -44,6 +46,25 @@ impl DataStore { Ok(current) } + /// Fetch a target release by generation number. + pub async fn target_release_get_generation( + &self, + opctx: &OpContext, + generation: Generation, + ) -> LookupResult> { + opctx + .authorize(authz::Action::Read, &authz::TARGET_RELEASE_CONFIG) + .await?; + let conn = self.pool_connection_authorized(opctx).await?; + dsl::target_release + .select(TargetRelease::as_select()) + .filter(dsl::generation.eq(generation)) + .first_async(&*conn) + .await + .optional() + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + /// Insert a new target release row and return it. It will only become /// the current target release if its generation is larger than any /// existing row. 
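
The new `target_release_get_generation` lookup above pairs with the `Generation::prev` helper added in `common/src/api/external/mod.rs` earlier in this patch: the planner input wants both the current target release and the release from the immediately preceding generation (see RFD 565 §9), and `prev()` returns `None` rather than underflowing when there is no predecessor. A minimal standalone sketch of that arithmetic, using a simplified stand-in for the real `Generation` newtype (the bare `u64` wrapper and the `main` below are illustrative assumptions, not the omicron definitions):

```rust
/// Simplified stand-in for omicron's `Generation` newtype; illustration only.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct Generation(u64);

impl Generation {
    /// Generations start at 1.
    const fn new() -> Self {
        Generation(1)
    }

    /// The previous generation, if any. The first generation has no
    /// predecessor, so we return `None` instead of underflowing.
    const fn prev(&self) -> Option<Generation> {
        if self.0 > 1 { Some(Generation(self.0 - 1)) } else { None }
    }
}

fn main() {
    assert_eq!(Generation::new().prev(), None);
    assert_eq!(Generation(3).prev(), Some(Generation(2)));
    println!("prev() walks back exactly one generation, stopping at 1");
}
```
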
@@ -189,7 +210,7 @@ mod test { .parse() .expect("SHA256('')"); let repo = datastore - .update_tuf_repo_insert( + .tuf_repo_insert( opctx, &TufRepoDescription { repo: TufRepoMeta { diff --git a/nexus/db-queries/src/db/datastore/update.rs b/nexus/db-queries/src/db/datastore/update.rs index cd0127cf979..6dcee53e5c8 100644 --- a/nexus/db-queries/src/db/datastore/update.rs +++ b/nexus/db-queries/src/db/datastore/update.rs @@ -17,18 +17,21 @@ use diesel::result::Error as DieselError; use nexus_db_errors::OptionalError; use nexus_db_errors::{ErrorHandler, public_error_from_diesel}; use nexus_db_lookup::DbConnection; -use nexus_db_model::{ArtifactHash, TufArtifact, TufRepo, TufRepoDescription}; +use nexus_db_model::{ + ArtifactHash, TufArtifact, TufRepo, TufRepoDescription, to_db_typed_uuid, +}; use omicron_common::api::external::{ self, CreateResult, DataPageParams, Generation, ListResultVec, LookupResult, LookupType, ResourceType, TufRepoInsertStatus, }; +use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::TufRepoKind; use omicron_uuid_kinds::TypedUuid; use swrite::{SWrite, swrite}; use tufaceous_artifact::ArtifactVersion; use uuid::Uuid; -/// The return value of [`DataStore::update_tuf_repo_insert`]. +/// The return value of [`DataStore::tuf_repo_insert`]. /// /// This is similar to [`external::TufRepoInsertResponse`], but uses /// nexus-db-model's types instead of external types. @@ -75,7 +78,7 @@ impl DataStore { /// `TufRepoDescription` if one was already found. (This is not an upsert, /// because if we know about an existing repo but with different contents, /// we reject that.) - pub async fn update_tuf_repo_insert( + pub async fn tuf_repo_insert( &self, opctx: &OpContext, description: &external::TufRepoDescription, @@ -106,8 +109,40 @@ impl DataStore { }) } + /// Returns a TUF repo description. + pub async fn tuf_repo_get_by_id( + &self, + opctx: &OpContext, + repo_id: TypedUuid, + ) -> LookupResult { + opctx.authorize(authz::Action::Read, &authz::FLEET).await?; + + use nexus_db_schema::schema::tuf_repo::dsl; + + let conn = self.pool_connection_authorized(opctx).await?; + let repo = dsl::tuf_repo + .filter(dsl::id.eq(to_db_typed_uuid(repo_id))) + .select(TufRepo::as_select()) + .first_async::(&*conn) + .await + .map_err(|e| { + public_error_from_diesel( + e, + ErrorHandler::NotFoundByLookup( + ResourceType::TufRepo, + LookupType::ById(repo_id.into_untyped_uuid()), + ), + ) + })?; + + let artifacts = artifacts_for_repo(repo.id.into(), &conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + Ok(TufRepoDescription { repo, artifacts }) + } + /// Returns the TUF repo description corresponding to this system version. - pub async fn update_tuf_repo_get( + pub async fn tuf_repo_get_by_version( &self, opctx: &OpContext, system_version: SemverVersion, @@ -140,7 +175,7 @@ impl DataStore { } /// Returns the list of all TUF repo artifacts known to the system. - pub async fn update_tuf_artifact_list( + pub async fn tuf_list_repos( &self, opctx: &OpContext, generation: Generation, @@ -160,7 +195,7 @@ impl DataStore { } /// Returns the current TUF repo generation number. 
- pub async fn update_tuf_generation_get( + pub async fn tuf_get_generation( &self, opctx: &OpContext, ) -> LookupResult { diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index 81126677e7e..b33045639b2 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -1425,6 +1425,8 @@ mod test { target_crucible_pantry_zone_count: CRUCIBLE_PANTRY_REDUNDANCY, clickhouse_policy: None, oximeter_read_policy: OximeterReadPolicy::new(1), + tuf_repo: None, + old_repo: None, log, } .build() diff --git a/nexus/reconfigurator/planning/Cargo.toml b/nexus/reconfigurator/planning/Cargo.toml index 695e65eb584..465c838741d 100644 --- a/nexus/reconfigurator/planning/Cargo.toml +++ b/nexus/reconfigurator/planning/Cargo.toml @@ -29,6 +29,7 @@ omicron-uuid-kinds.workspace = true once_cell.workspace = true oxnet.workspace = true rand.workspace = true +semver.workspace = true sled-agent-client.workspace = true slog.workspace = true slog-error-chain.workspace = true diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs index 2e889f2d2d4..c63cb78e5c1 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs @@ -14,6 +14,7 @@ use crate::blueprint_editor::ExternalSnatNetworkingChoice; use crate::blueprint_editor::NoAvailableDnsSubnets; use crate::blueprint_editor::SledEditError; use crate::blueprint_editor::SledEditor; +use crate::planner::OrderedComponent; use crate::planner::ZoneExpungeReason; use crate::planner::rng::PlannerRng; use anyhow::Context as _; @@ -56,6 +57,7 @@ use omicron_common::address::DNS_PORT; use omicron_common::address::NTP_PORT; use omicron_common::address::ReservedRackSubnet; use omicron_common::api::external::Generation; +use omicron_common::api::external::TufRepoDescription; use omicron_common::api::external::Vni; use omicron_common::api::internal::shared::NetworkInterface; use omicron_common::api::internal::shared::NetworkInterfaceKind; @@ -124,6 +126,8 @@ pub enum Error { AllocateExternalNetworking(#[from] ExternalNetworkingError), #[error("can only have {INTERNAL_DNS_REDUNDANCY} internal DNS servers")] PolicySpecifiesTooManyInternalDnsServers, + #[error("zone is already up-to-date and should not be updated")] + ZoneAlreadyUpToDate, } /// Describes the result of an idempotent "ensure" operation @@ -1161,13 +1165,14 @@ impl<'a> BlueprintBuilder<'a> { gz_address: dns_subnet.gz_address(), gz_address_index, }); + let image_source = self.zone_image_source(zone_type.kind()); let zone = BlueprintZoneConfig { disposition: BlueprintZoneDisposition::InService, id: self.rng.sled_rng(sled_id).next_zone(), filesystem_pool: zpool, zone_type, - image_source: BlueprintZoneImageSource::InstallDataset, + image_source, }; self.sled_add_zone(sled_id, zone) @@ -1213,13 +1218,14 @@ impl<'a> BlueprintBuilder<'a> { dns_address, nic, }); + let image_source = self.zone_image_source(zone_type.kind()); let zone = BlueprintZoneConfig { disposition: BlueprintZoneDisposition::InService, id, filesystem_pool: pool_name, zone_type, - image_source: BlueprintZoneImageSource::InstallDataset, + image_source, }; self.sled_add_zone(sled_id, zone) } @@ -1252,13 +1258,14 @@ impl<'a> BlueprintBuilder<'a> { }); let filesystem_pool = self.sled_select_zpool(sled_id, zone_type.kind())?; + let image_source = self.zone_image_source(zone_type.kind()); let zone = 
BlueprintZoneConfig { disposition: BlueprintZoneDisposition::InService, id: self.rng.sled_rng(sled_id).next_zone(), filesystem_pool, zone_type, - image_source: BlueprintZoneImageSource::InstallDataset, + image_source, }; self.sled_add_zone(sled_id, zone)?; @@ -1404,13 +1411,14 @@ impl<'a> BlueprintBuilder<'a> { }); let filesystem_pool = self.sled_select_zpool(sled_id, zone_type.kind())?; + let image_source = self.zone_image_source(zone_type.kind()); let zone = BlueprintZoneConfig { disposition: BlueprintZoneDisposition::InService, id: nexus_id, filesystem_pool, zone_type, - image_source: BlueprintZoneImageSource::InstallDataset, + image_source, }; self.sled_add_zone(sled_id, zone) } @@ -1429,13 +1437,14 @@ impl<'a> BlueprintBuilder<'a> { }); let filesystem_pool = self.sled_select_zpool(sled_id, zone_type.kind())?; + let image_source = self.zone_image_source(zone_type.kind()); let zone = BlueprintZoneConfig { disposition: BlueprintZoneDisposition::InService, id: oximeter_id, filesystem_pool, zone_type, - image_source: BlueprintZoneImageSource::InstallDataset, + image_source, }; self.sled_add_zone(sled_id, zone) } @@ -1453,13 +1462,14 @@ impl<'a> BlueprintBuilder<'a> { ); let filesystem_pool = self.sled_select_zpool(sled_id, zone_type.kind())?; + let image_source = self.zone_image_source(zone_type.kind()); let zone = BlueprintZoneConfig { disposition: BlueprintZoneDisposition::InService, id: pantry_id, filesystem_pool, zone_type, - image_source: BlueprintZoneImageSource::InstallDataset, + image_source, }; self.sled_add_zone(sled_id, zone) } @@ -1487,13 +1497,14 @@ impl<'a> BlueprintBuilder<'a> { dataset: OmicronZoneDataset { pool_name }, }); let filesystem_pool = pool_name; + let image_source = self.zone_image_source(zone_type.kind()); let zone = BlueprintZoneConfig { disposition: BlueprintZoneDisposition::InService, id: zone_id, filesystem_pool, zone_type, - image_source: BlueprintZoneImageSource::InstallDataset, + image_source, }; self.sled_add_zone(sled_id, zone) } @@ -1513,13 +1524,14 @@ impl<'a> BlueprintBuilder<'a> { address, dataset: OmicronZoneDataset { pool_name }, }); + let image_source = self.zone_image_source(zone_type.kind()); let zone = BlueprintZoneConfig { disposition: BlueprintZoneDisposition::InService, id, filesystem_pool: pool_name, zone_type, - image_source: BlueprintZoneImageSource::InstallDataset, + image_source, }; self.sled_add_zone(sled_id, zone) } @@ -1541,13 +1553,14 @@ impl<'a> BlueprintBuilder<'a> { }, ); let filesystem_pool = pool_name; + let image_source = self.zone_image_source(zone_type.kind()); let zone = BlueprintZoneConfig { disposition: BlueprintZoneDisposition::InService, id: zone_id, filesystem_pool, zone_type, - image_source: BlueprintZoneImageSource::InstallDataset, + image_source, }; self.sled_add_zone(sled_id, zone) } @@ -1569,13 +1582,14 @@ impl<'a> BlueprintBuilder<'a> { }, ); let filesystem_pool = pool_name; + let image_source = self.zone_image_source(zone_type.kind()); let zone = BlueprintZoneConfig { disposition: BlueprintZoneDisposition::InService, id: zone_id, filesystem_pool, zone_type, - image_source: BlueprintZoneImageSource::InstallDataset, + image_source, }; self.sled_add_zone(sled_id, zone) } @@ -1695,6 +1709,7 @@ impl<'a> BlueprintBuilder<'a> { }); let filesystem_pool = self.sled_select_zpool(sled_id, zone_type.kind())?; + let image_source = self.zone_image_source(zone_type.kind()); self.sled_add_zone( sled_id, @@ -1703,7 +1718,7 @@ impl<'a> BlueprintBuilder<'a> { id: new_zone_id, filesystem_pool, zone_type, - image_source: 
BlueprintZoneImageSource::InstallDataset, + image_source, }, ) } @@ -1908,6 +1923,75 @@ impl<'a> BlueprintBuilder<'a> { self.pending_mgs_updates.remove(baseboard_id); } + fn zone_image_artifact( + repo: Option<&TufRepoDescription>, + zone_kind: ZoneKind, + ) -> BlueprintZoneImageSource { + repo.and_then(|repo| { + repo.artifacts + .iter() + .find(|artifact| { + zone_kind.is_control_plane_zone_artifact(&artifact.id) + }) + .map(BlueprintZoneImageSource::from_available_artifact) + }) + .unwrap_or(BlueprintZoneImageSource::InstallDataset) + } + + /// Try to find an artifact in either the current or previous release repo + /// that contains an image for a zone of the given kind; see RFD 565 §9. + /// Defaults to the install dataset. + pub(crate) fn zone_image_source( + &self, + zone_kind: ZoneKind, + ) -> BlueprintZoneImageSource { + let new_repo = self.input.tuf_repo(); + let old_repo = self.input.old_repo(); + Self::zone_image_artifact( + if self.zone_is_ready_for_update(zone_kind, new_repo) { + new_repo + } else { + old_repo + }, + zone_kind, + ) + } + + /// Return `true` iff a zone of the given kind is ready to be updated; + /// i.e., its dependencies have been updated, or its data sufficiently + /// replicated, etc. + fn zone_is_ready_for_update( + &self, + zone_kind: ZoneKind, + new_repo: Option<&TufRepoDescription>, + ) -> bool { + match OrderedComponent::from(zone_kind) { + OrderedComponent::HostOs | OrderedComponent::SpRot => { + todo!("can't yet update Host OS or SP/RoT") + } + OrderedComponent::OmicronZone(kind) => match kind { + ZoneKind::Nexus => { + // Nexus can only be updated if all non-Nexus zones have been updated, + // i.e., their image source is an artifact from the new repo. + self.sled_ids_with_zones().all(|sled_id| { + self.current_sled_zones( + sled_id, + BlueprintZoneDisposition::is_in_service, + ) + .filter(|z| z.zone_type.kind() != ZoneKind::Nexus) + .all(|z| { + z.image_source + == Self::zone_image_artifact(new_repo, z.kind()) + }) + }) + } + // + // ZoneKind::CockroachDb => todo!("check cluster status in inventory"), + _ => true, // other zone kinds have no special dependencies + }, + } + } + /// Debug method to remove a sled from a blueprint entirely. /// /// Bypasses all expungement checks. Do not use in production. 
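
Before the planner changes below, it may help to restate the image-source selection that `BlueprintBuilder::zone_image_source` implements above: use an artifact from the new TUF repo when the zone kind's dependencies are ready for update (for example, Nexus waits until all non-Nexus zones are already on the new repo), otherwise fall back to the previous repo, and default to the install dataset whenever the chosen repo has no matching zone artifact. A self-contained sketch of that decision, with simplified stand-in types rather than the real omicron definitions:

```rust
// Simplified stand-ins for the real omicron types; illustration only.
#[derive(Clone, Debug, PartialEq)]
enum ImageSource {
    InstallDataset,
    Artifact { name: String },
}

struct Repo {
    /// Names of artifacts of kind `Zone` in this repo.
    zone_artifacts: Vec<String>,
}

/// Look up a zone artifact by name, defaulting to the install dataset when
/// there is no repo or no matching artifact.
fn zone_image_artifact(repo: Option<&Repo>, artifact_name: &str) -> ImageSource {
    repo.and_then(|repo| {
        repo.zone_artifacts
            .iter()
            .find(|name| name.as_str() == artifact_name)
            .map(|name| ImageSource::Artifact { name: name.clone() })
    })
    .unwrap_or(ImageSource::InstallDataset)
}

/// Choose the image source for a zone: the new repo if this zone kind is
/// ready to be updated, otherwise the previous repo.
fn zone_image_source(
    ready_for_update: bool,
    new_repo: Option<&Repo>,
    old_repo: Option<&Repo>,
    artifact_name: &str,
) -> ImageSource {
    let repo = if ready_for_update { new_repo } else { old_repo };
    zone_image_artifact(repo, artifact_name)
}

fn main() {
    let new = Repo {
        zone_artifacts: vec!["nexus".to_string(), "crucible-pantry-zone".to_string()],
    };
    // Dependencies not yet updated: fall back to the old repo (none here),
    // so the zone keeps sourcing its image from the install dataset.
    assert_eq!(
        zone_image_source(false, Some(&new), None, "nexus"),
        ImageSource::InstallDataset
    );
    // Dependencies updated: the zone picks up the artifact from the new repo.
    assert_eq!(
        zone_image_source(true, Some(&new), None, "nexus"),
        ImageSource::Artifact { name: "nexus".to_string() }
    );
}
```
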
diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 38f85e772fc..02387f7aedd 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -18,6 +18,7 @@ use nexus_sled_agent_shared::inventory::OmicronZoneType; use nexus_sled_agent_shared::inventory::ZoneKind; use nexus_types::deployment::Blueprint; use nexus_types::deployment::BlueprintPhysicalDiskDisposition; +use nexus_types::deployment::BlueprintZoneConfig; use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::CockroachDbClusterVersion; use nexus_types::deployment::CockroachDbPreserveDowngrade; @@ -44,9 +45,11 @@ use self::omicron_zone_placement::OmicronZonePlacement; use self::omicron_zone_placement::OmicronZonePlacementSledState; pub use self::rng::PlannerRng; pub use self::rng::SledPlannerRng; +pub(crate) use self::update_sequence::OrderedComponent; mod omicron_zone_placement; pub(crate) mod rng; +mod update_sequence; pub struct Planner<'a> { log: Logger, @@ -110,14 +113,11 @@ impl<'a> Planner<'a> { } fn do_plan(&mut self) -> Result<(), Error> { - // We perform planning in two loops: the first one turns expunged sleds - // into expunged zones, and the second one adds services. - self.do_plan_expunge()?; self.do_plan_add()?; self.do_plan_decommission()?; + self.do_plan_zone_updates()?; self.do_plan_cockroachdb_settings(); - Ok(()) } @@ -877,6 +877,128 @@ impl<'a> Planner<'a> { Ok(()) } + /// Update at most one existing zone to use a new image source. + fn do_plan_zone_updates(&mut self) -> Result<(), Error> { + // We are only interested in non-decommissioned sleds. + let sleds = self + .input + .all_sleds(SledFilter::Commissioned) + .map(|(id, _details)| id) + .collect::>(); + + // Wait for all current zones to appear in the inventory. + // TODO-correctness: We should check their image source, + // but the inventory doesn't report that yet; see + // . + let inventory_zones = self + .inventory + .all_omicron_zones() + .map(|z| z.id) + .collect::>(); + for &sled_id in &sleds { + if !self + .blueprint + .current_sled_zones( + sled_id, + BlueprintZoneDisposition::is_in_service, + ) + .all(|zone| inventory_zones.contains(&zone.id)) + { + info!( + self.log, "zones not yet up-to-date"; + "sled_id" => %sled_id, + ); + return Ok(()); + } + } + + // Choose among the out-of-date zones one with the fewest dependencies. + let mut out_of_date_zones = sleds + .into_iter() + .flat_map(|sled_id| { + let blueprint = &self.blueprint; + blueprint + .current_sled_zones( + sled_id, + BlueprintZoneDisposition::is_in_service, + ) + .filter_map(move |zone| { + (zone.image_source + != blueprint + .zone_image_source(zone.zone_type.kind())) + .then(|| (sled_id, zone.clone())) + }) + }) + .collect::>(); + + sort_zones_by_deps(&mut out_of_date_zones); + if let Some((sled_id, zone)) = out_of_date_zones.first() { + return self.update_or_expunge_zone(*sled_id, zone); + } + + info!(self.log, "all zones up-to-date"); + Ok(()) + } + + /// Update a zone to use a new image source, either in-place or by + /// expunging it and letting it be replaced in a future iteration. + fn update_or_expunge_zone( + &mut self, + sled_id: SledUuid, + zone: &BlueprintZoneConfig, + ) -> Result<(), Error> { + let zone_kind = zone.zone_type.kind(); + let image_source = self.blueprint.zone_image_source(zone_kind); + if zone.image_source == image_source { + // This should only happen in the event of a planning error above. 
+ error!( + self.log, "zone is already up-to-date"; + "sled_id" => %sled_id, + "zone_id" => %zone.id, + "kind" => ?zone.zone_type.kind(), + "image_source" => %image_source, + ); + return Err(Error::ZoneAlreadyUpToDate); + } else { + match zone_kind { + ZoneKind::Crucible + | ZoneKind::Clickhouse + | ZoneKind::ClickhouseKeeper + | ZoneKind::ClickhouseServer + | ZoneKind::CockroachDb => { + info!( + self.log, "updating zone image source in-place"; + "sled_id" => %sled_id, + "zone_id" => %zone.id, + "kind" => ?zone.zone_type.kind(), + "image_source" => %image_source, + ); + self.blueprint.sled_set_zone_source( + sled_id, + zone.id, + image_source, + )?; + } + ZoneKind::BoundaryNtp + | ZoneKind::CruciblePantry + | ZoneKind::ExternalDns + | ZoneKind::InternalDns + | ZoneKind::InternalNtp + | ZoneKind::Nexus + | ZoneKind::Oximeter => { + info!( + self.log, "expunging out-of-date zone"; + "sled_id" => %sled_id, + "zone_id" => %zone.id, + "kind" => ?zone.zone_type.kind(), + ); + self.blueprint.sled_expunge_zone(sled_id, zone.id)?; + } + } + } + Ok(()) + } + fn do_plan_cockroachdb_settings(&mut self) { // Figure out what we should set the CockroachDB "preserve downgrade // option" setting to based on the planning input. @@ -972,6 +1094,15 @@ impl<'a> Planner<'a> { } } +/// Sort in place a vector of sled-specific zone configurations by API +/// dependencies (RFD 565 §6). +fn sort_zones_by_deps(zones: &mut Vec<(SledUuid, BlueprintZoneConfig)>) { + zones.sort_by(|a, b| { + OrderedComponent::from(a.1.zone_type.kind()) + .cmp(&OrderedComponent::from(b.1.zone_type.kind())) + }) +} + /// The reason a sled's zones need to be expunged. /// /// This is used only for introspection and logging -- it's not part of the @@ -986,6 +1117,7 @@ pub(crate) enum ZoneExpungeReason { pub(crate) mod test { use super::*; use crate::blueprint_builder::test::verify_blueprint; + use crate::example::ExampleSystem; use crate::example::ExampleSystemBuilder; use crate::example::SimRngState; use crate::example::example; @@ -1000,6 +1132,8 @@ pub(crate) mod test { use nexus_types::deployment::BlueprintDiffSummary; use nexus_types::deployment::BlueprintPhysicalDiskDisposition; use nexus_types::deployment::BlueprintZoneDisposition; + use nexus_types::deployment::BlueprintZoneImageSource; + use nexus_types::deployment::BlueprintZoneImageVersion; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::ClickhouseMode; use nexus_types::deployment::ClickhousePolicy; @@ -1010,16 +1144,26 @@ pub(crate) mod test { use nexus_types::external_api::views::SledProvisionPolicy; use nexus_types::external_api::views::SledState; use omicron_common::api::external::Generation; + use omicron_common::api::external::TufArtifactMeta; + use omicron_common::api::external::TufRepoDescription; + use omicron_common::api::external::TufRepoMeta; use omicron_common::disk::DatasetKind; use omicron_common::disk::DiskIdentity; use omicron_common::policy::CRUCIBLE_PANTRY_REDUNDANCY; + use omicron_common::policy::NEXUS_REDUNDANCY; + use omicron_common::update::ArtifactId; use omicron_test_utils::dev::test_setup_log; use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::ZpoolUuid; + use semver::Version; use slog_error_chain::InlineErrorChain; use std::collections::BTreeMap; use std::collections::HashMap; use std::net::IpAddr; + use tufaceous_artifact::ArtifactHash; + use tufaceous_artifact::ArtifactKind; + use tufaceous_artifact::ArtifactVersion; + use tufaceous_artifact::KnownArtifactKind; use typed_rng::TypedUuidRng; // Generate a 
ClickhousePolicy ignoring fields we don't care about for @@ -4337,4 +4481,502 @@ pub(crate) mod test { logctx.cleanup_successful(); } + + #[test] + fn test_sort_zones_by_deps() { + static TEST_NAME: &str = "sort_zones_by_deps"; + let logctx = test_setup_log(TEST_NAME); + let log = logctx.log.clone(); + + // Collect zone configs from our example system. + let (_collection, _input, blueprint) = example(&log, TEST_NAME); + let mut zones = blueprint + .all_omicron_zones(BlueprintZoneDisposition::any) + .map(|(sled_id, z)| (sled_id, z.clone())) + .collect::>(); + + // Sort them and verify the ordering constraints. + sort_zones_by_deps(&mut zones); + let mut nexus = false; + for kind in zones.iter().map(|(_, z)| z.zone_type.kind()) { + match kind { + ZoneKind::Nexus => { + nexus = true; + } + _ => { + assert!(!nexus); + } + } + } + assert!(nexus); + + logctx.cleanup_successful(); + } + + /// Manually update the example system's inventory collection's zones + /// from a blueprint. + fn update_collection_from_blueprint( + example: &mut ExampleSystem, + blueprint: &Blueprint, + ) { + for (&sled_id, config) in blueprint.sleds.iter() { + let sled_config = config.clone().into_in_service_sled_config(); + let zones_config = OmicronZonesConfig { + generation: sled_config.generation, + zones: sled_config.zones.into_iter().collect(), + }; + example + .system + .sled_set_omicron_zones(sled_id, zones_config) + .expect("can't set omicron zones for sled"); + } + example.collection = + example.system.to_collection_builder().unwrap().build(); + } + + macro_rules! fake_zone_artifact { + ($kind: ident, $version: expr) => { + TufArtifactMeta { + id: ArtifactId { + name: ZoneKind::$kind.artifact_name().to_string(), + version: $version, + kind: ArtifactKind::from_known(KnownArtifactKind::Zone), + }, + hash: ArtifactHash([0; 32]), + size: 0, + } + }; + } + + /// Ensure that dependent zones (here just Crucible Pantry) are updated + /// before Nexus. + #[test] + fn test_update_crucible_pantry() { + static TEST_NAME: &str = "update_crucible_pantry"; + let logctx = test_setup_log(TEST_NAME); + let log = logctx.log.clone(); + + // Use our example system. + let mut rng = SimRngState::from_seed(TEST_NAME); + let (mut example, blueprint1) = ExampleSystemBuilder::new_with_rng( + &logctx.log, + rng.next_system_rng(), + ) + .build(); + verify_blueprint(&blueprint1); + + // We should start with no specified TUF repo and nothing to do. + assert!(example.input.tuf_repo().is_none()); + assert_planning_makes_no_changes( + &logctx.log, + &blueprint1, + &example.input, + &example.collection, + TEST_NAME, + ); + + // All zones should be sourced from the install dataset by default. + assert!( + blueprint1 + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .all(|(_, z)| matches!( + z.image_source, + BlueprintZoneImageSource::InstallDataset + )) + ); + + // Manually specify a trivial TUF repo. 
+ let mut input_builder = example.input.clone().into_builder(); + input_builder.policy_mut().tuf_repo = Some(TufRepoDescription { + repo: TufRepoMeta { + hash: ArtifactHash([0; 32]), + targets_role_version: 0, + valid_until: Utc::now(), + system_version: Version::new(0, 0, 0), + file_name: String::from(""), + }, + artifacts: vec![], + }); + let input = input_builder.build(); + let blueprint2 = Planner::new_based_on( + log.clone(), + &blueprint1, + &input, + "test_blueprint2", + &example.collection, + ) + .expect("can't create planner") + .with_rng(PlannerRng::from_seed((TEST_NAME, "bp2"))) + .plan() + .expect("plan for trivial TUF repo"); + + // All zones should still be sourced from the install dataset. + assert!( + blueprint2 + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .all(|(_, z)| matches!( + z.image_source, + BlueprintZoneImageSource::InstallDataset + )) + ); + + // Manually specify a TUF repo with fake zone images for Crucible Pantry + // and Nexus. Only the name and kind of the artifacts matter. The Nexus + // artifact is only there to make sure the planner *doesn't* use it. + let mut input_builder = input.into_builder(); + let version = ArtifactVersion::new_static("1.0.0-freeform") + .expect("can't parse artifact version"); + let fake_hash = ArtifactHash([0; 32]); + let image_source = BlueprintZoneImageSource::Artifact { + version: BlueprintZoneImageVersion::Available { + version: version.clone(), + }, + hash: fake_hash, + }; + let artifacts = vec![ + fake_zone_artifact!(CruciblePantry, version.clone()), + fake_zone_artifact!(Nexus, version.clone()), + ]; + input_builder.policy_mut().tuf_repo = Some(TufRepoDescription { + repo: TufRepoMeta { + hash: fake_hash, + targets_role_version: 0, + valid_until: Utc::now(), + system_version: Version::new(1, 0, 0), + file_name: String::from(""), + }, + artifacts, + }); + + // Some helper predicates for the assertions below. + let is_old_nexus = |zone: &BlueprintZoneConfig| -> bool { + zone.zone_type.is_nexus() + && matches!( + zone.image_source, + BlueprintZoneImageSource::InstallDataset + ) + }; + let is_up_to_date_nexus = |zone: &BlueprintZoneConfig| -> bool { + zone.zone_type.is_nexus() && zone.image_source == image_source + }; + let is_old_pantry = |zone: &BlueprintZoneConfig| -> bool { + zone.zone_type.is_crucible_pantry() + && matches!( + zone.image_source, + BlueprintZoneImageSource::InstallDataset + ) + }; + let is_up_to_date_pantry = |zone: &BlueprintZoneConfig| -> bool { + zone.zone_type.is_crucible_pantry() + && zone.image_source == image_source + }; + + // Request another Nexus zone. + input_builder.policy_mut().target_nexus_zone_count = + input_builder.policy_mut().target_nexus_zone_count + 1; + let input = input_builder.build(); + + // Check that there is a new nexus zone that does *not* use the new + // artifact (since not all of its dependencies are updated yet). 
+ update_collection_from_blueprint(&mut example, &blueprint2); + let blueprint3 = Planner::new_based_on( + log.clone(), + &blueprint2, + &input, + "test_blueprint3", + &example.collection, + ) + .expect("can't create planner") + .with_rng(PlannerRng::from_seed((TEST_NAME, "bp3"))) + .plan() + .expect("can't re-plan for new Nexus zone"); + { + let summary = blueprint3.diff_since_blueprint(&blueprint2); + for sled in summary.diff.sleds.modified_values_diff() { + assert!(sled.zones.removed.is_empty()); + assert_eq!(sled.zones.added.len(), 1); + let added = sled.zones.added.values().next().unwrap(); + assert!(matches!( + &added.zone_type, + BlueprintZoneType::Nexus(_) + )); + assert!(matches!( + &added.image_source, + BlueprintZoneImageSource::InstallDataset + )); + } + } + + // We should now have three sets of expunge/add iterations for the + // Crucible Pantry zones. + let mut parent = blueprint3; + for i in 4..=9 { + let blueprint_name = format!("blueprint_{i}"); + update_collection_from_blueprint(&mut example, &parent); + let blueprint = Planner::new_based_on( + log.clone(), + &parent, + &input, + &blueprint_name, + &example.collection, + ) + .expect("can't create planner") + .with_rng(PlannerRng::from_seed((TEST_NAME, &blueprint_name))) + .plan() + .unwrap_or_else(|_| panic!("can't re-plan after {i} iterations")); + + let summary = blueprint.diff_since_blueprint(&parent); + for sled in summary.diff.sleds.modified_values_diff() { + if i % 2 == 0 { + assert!(sled.zones.added.is_empty()); + assert!(sled.zones.removed.is_empty()); + assert_eq!( + sled.zones + .common + .iter() + .filter(|(_, z)| matches!( + z.after.zone_type, + BlueprintZoneType::CruciblePantry(_) + ) && matches!( + z.after.disposition, + BlueprintZoneDisposition::Expunged { .. } + )) + .count(), + 1 + ); + } else { + assert!(sled.zones.removed.is_empty()); + assert_eq!(sled.zones.added.len(), 1); + let added = sled.zones.added.values().next().unwrap(); + assert!(matches!( + &added.zone_type, + BlueprintZoneType::CruciblePantry(_) + )); + assert_eq!(added.image_source, image_source); + } + } + + parent = blueprint; + } + + // All Crucible Pantries should now be updated. + assert_eq!( + parent + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .filter(|(_, z)| is_up_to_date_pantry(z)) + .count(), + CRUCIBLE_PANTRY_REDUNDANCY + ); + + // One more iteration for the last old zone to be expunged. + update_collection_from_blueprint(&mut example, &parent); + let blueprint10 = Planner::new_based_on( + log.clone(), + &parent, + &input, + "last_blueprint", + &example.collection, + ) + .expect("can't create planner") + .with_rng(PlannerRng::from_seed((TEST_NAME, "last_bp"))) + .plan() + .expect("replan for last blueprint"); + + // All old Pantry zones should now be expunged. + assert_eq!( + blueprint10 + .all_omicron_zones(BlueprintZoneDisposition::is_expunged) + .filter(|(_, z)| is_old_pantry(z)) + .count(), + CRUCIBLE_PANTRY_REDUNDANCY + ); + + // Now we can update Nexus, because all of its dependent zones + // are up-to-date w/r/t the new repo. 
+ assert_eq!( + parent + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .filter(|(_, z)| is_old_nexus(z)) + .count(), + NEXUS_REDUNDANCY + 1, + ); + let mut parent = blueprint10; + for i in 11..=17 { + let blueprint_name = format!("blueprint_{i}"); + update_collection_from_blueprint(&mut example, &parent); + + let blueprint = Planner::new_based_on( + log.clone(), + &parent, + &input, + &blueprint_name, + &example.collection, + ) + .expect("can't create planner") + .with_rng(PlannerRng::from_seed((TEST_NAME, &blueprint_name))) + .plan() + .unwrap_or_else(|_| panic!("can't re-plan after {i} iterations")); + + let summary = blueprint.diff_since_blueprint(&parent); + eprintln!("{}", summary.display()); + for sled in summary.diff.sleds.modified_values_diff() { + if i % 2 == 0 { + assert!(sled.zones.added.is_empty()); + assert!(sled.zones.removed.is_empty()); + } else { + assert!(sled.zones.removed.is_empty()); + assert_eq!(sled.zones.added.len(), 1); + let added = sled.zones.added.values().next().unwrap(); + assert!(matches!( + &added.zone_type, + BlueprintZoneType::Nexus(_) + )); + assert_eq!(added.image_source, image_source); + } + } + + parent = blueprint; + } + + // Everything's up-to-date in Kansas City! + let blueprint17 = parent; + assert_eq!( + blueprint17 + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .filter(|(_, z)| is_up_to_date_nexus(z)) + .count(), + NEXUS_REDUNDANCY + 1, + ); + + update_collection_from_blueprint(&mut example, &blueprint17); + assert_planning_makes_no_changes( + &logctx.log, + &blueprint17, + &input, + &example.collection, + TEST_NAME, + ); + + logctx.cleanup_successful(); + } + + /// Ensure that planning to update all zones terminates. + #[test] + fn test_update_all_zones() { + static TEST_NAME: &str = "update_all_zones"; + let logctx = test_setup_log(TEST_NAME); + let log = logctx.log.clone(); + + // Use our example system. + let mut rng = SimRngState::from_seed(TEST_NAME); + let (mut example, blueprint1) = ExampleSystemBuilder::new_with_rng( + &logctx.log, + rng.next_system_rng(), + ) + .build(); + verify_blueprint(&blueprint1); + + // All zones should be sourced from the install dataset by default. + assert!( + blueprint1 + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .all(|(_, z)| matches!( + z.image_source, + BlueprintZoneImageSource::InstallDataset + )) + ); + + // Manually specify a TUF repo with fake images for all zones. + // Only the name and kind of the artifacts matter. 
+ let mut input_builder = example.input.clone().into_builder(); + let version = ArtifactVersion::new_static("2.0.0-freeform") + .expect("can't parse artifact version"); + let fake_hash = ArtifactHash([0; 32]); + let image_source = BlueprintZoneImageSource::Artifact { + version: BlueprintZoneImageVersion::Available { + version: version.clone(), + }, + hash: fake_hash, + }; + let tuf_repo = TufRepoDescription { + repo: TufRepoMeta { + hash: fake_hash, + targets_role_version: 0, + valid_until: Utc::now(), + system_version: Version::new(1, 0, 0), + file_name: String::from(""), + }, + artifacts: vec![ + fake_zone_artifact!(BoundaryNtp, version.clone()), + fake_zone_artifact!(Clickhouse, version.clone()), + fake_zone_artifact!(ClickhouseKeeper, version.clone()), + fake_zone_artifact!(ClickhouseServer, version.clone()), + fake_zone_artifact!(CockroachDb, version.clone()), + fake_zone_artifact!(Crucible, version.clone()), + fake_zone_artifact!(CruciblePantry, version.clone()), + fake_zone_artifact!(ExternalDns, version.clone()), + fake_zone_artifact!(InternalDns, version.clone()), + fake_zone_artifact!(InternalNtp, version.clone()), + fake_zone_artifact!(Nexus, version.clone()), + fake_zone_artifact!(Oximeter, version.clone()), + ], + }; + input_builder.policy_mut().tuf_repo = Some(tuf_repo); + let input = input_builder.build(); + + /// Expected number of planner iterations required to converge. + /// If incidental planner work changes this value occasionally, + /// that's fine; but if we find we're changing it all the time, + /// we should probably drop it and keep just the maximum below. + const EXP_PLANNING_ITERATIONS: usize = 57; + + /// Planning must not take more than this number of iterations. + const MAX_PLANNING_ITERATIONS: usize = 100; + assert!(EXP_PLANNING_ITERATIONS < MAX_PLANNING_ITERATIONS); + + let mut parent = blueprint1; + for i in 2..=MAX_PLANNING_ITERATIONS { + let blueprint_name = format!("blueprint_{i}"); + update_collection_from_blueprint(&mut example, &parent); + let blueprint = Planner::new_based_on( + log.clone(), + &parent, + &input, + &blueprint_name, + &example.collection, + ) + .expect("can't create planner") + .with_rng(PlannerRng::from_seed((TEST_NAME, &blueprint_name))) + .plan() + .unwrap_or_else(|_| panic!("can't re-plan after {i} iterations")); + + let summary = blueprint.diff_since_blueprint(&parent); + if summary.total_zones_added() == 0 + && summary.total_zones_removed() == 0 + && summary.total_zones_modified() == 0 + { + assert!( + blueprint + .all_omicron_zones( + BlueprintZoneDisposition::is_in_service + ) + .all(|(_, zone)| zone.image_source == image_source), + "failed to update all zones" + ); + + assert_eq!( + i, EXP_PLANNING_ITERATIONS, + "expected {EXP_PLANNING_ITERATIONS} iterations but converged in {i}" + ); + println!("planning converged after {i} iterations"); + + logctx.cleanup_successful(); + return; + } + + parent = blueprint; + } + + panic!("did not converge after {MAX_PLANNING_ITERATIONS} iterations"); + } } diff --git a/nexus/reconfigurator/planning/src/planner/update_sequence.rs b/nexus/reconfigurator/planning/src/planner/update_sequence.rs new file mode 100644 index 00000000000..38d28d06e2a --- /dev/null +++ b/nexus/reconfigurator/planning/src/planner/update_sequence.rs @@ -0,0 +1,22 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Updatable components ordered by dependencies (RFD 565). + +use nexus_sled_agent_shared::inventory::ZoneKind; + +/// Update sequence as defined by RFD 565 §6. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[allow(dead_code)] +pub enum OrderedComponent { + HostOs, + SpRot, + OmicronZone(ZoneKind), +} + +impl From for OrderedComponent { + fn from(zone_kind: ZoneKind) -> Self { + Self::OmicronZone(zone_kind) + } +} diff --git a/nexus/reconfigurator/planning/src/system.rs b/nexus/reconfigurator/planning/src/system.rs index 472bfe3bfa5..fba4f2aabd4 100644 --- a/nexus/reconfigurator/planning/src/system.rs +++ b/nexus/reconfigurator/planning/src/system.rs @@ -44,6 +44,7 @@ use omicron_common::address::SLED_PREFIX; use omicron_common::address::get_sled_address; use omicron_common::api::external::ByteCount; use omicron_common::api::external::Generation; +use omicron_common::api::external::TufRepoDescription; use omicron_common::disk::DiskIdentity; use omicron_common::disk::DiskVariant; use omicron_common::policy::INTERNAL_DNS_REDUNDANCY; @@ -96,6 +97,8 @@ pub struct SystemDescription { external_dns_version: Generation, clickhouse_policy: Option, oximeter_read_policy: OximeterReadPolicy, + tuf_repo: Option, + old_repo: Option, } impl SystemDescription { @@ -175,6 +178,8 @@ impl SystemDescription { external_dns_version: Generation::new(), clickhouse_policy: None, oximeter_read_policy: OximeterReadPolicy::new(1), + tuf_repo: None, + old_repo: None, } } @@ -454,6 +459,8 @@ impl SystemDescription { .target_crucible_pantry_zone_count, clickhouse_policy: self.clickhouse_policy.clone(), oximeter_read_policy: self.oximeter_read_policy.clone(), + tuf_repo: self.tuf_repo.clone(), + old_repo: self.old_repo.clone(), }; let mut builder = PlanningInputBuilder::new( policy, diff --git a/nexus/reconfigurator/preparation/src/lib.rs b/nexus/reconfigurator/preparation/src/lib.rs index 497d0353124..59e6a8ab6aa 100644 --- a/nexus/reconfigurator/preparation/src/lib.rs +++ b/nexus/reconfigurator/preparation/src/lib.rs @@ -7,6 +7,7 @@ use anyhow::Context; use futures::StreamExt; use nexus_db_model::DnsGroup; +use nexus_db_model::Generation; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_db_queries::db::datastore::DataStoreDnsTest; @@ -39,6 +40,7 @@ use omicron_common::address::SLED_PREFIX; use omicron_common::api::external::Error; use omicron_common::api::external::InternalContext; use omicron_common::api::external::LookupType; +use omicron_common::api::external::TufRepoDescription; use omicron_common::disk::DiskIdentity; use omicron_common::policy::BOUNDARY_NTP_REDUNDANCY; use omicron_common::policy::COCKROACHDB_REDUNDANCY; @@ -78,6 +80,8 @@ pub struct PlanningInputFromDb<'a> { pub cockroachdb_settings: &'a CockroachDbSettings, pub clickhouse_policy: Option, pub oximeter_read_policy: OximeterReadPolicy, + pub tuf_repo: Option, + pub old_repo: Option, pub log: &'a Logger, } @@ -140,11 +144,43 @@ impl PlanningInputFromDb<'_> { .cockroachdb_settings(opctx) .await .internal_context("fetching cockroachdb settings")?; - let clickhouse_policy = datastore .clickhouse_policy_get_latest(opctx) .await .internal_context("fetching clickhouse policy")?; + let target_release = datastore + .target_release_get_current(opctx) + .await + .internal_context("fetching current target release")?; + let tuf_repo = match target_release.tuf_repo_id { + None => None, + Some(repo_id) => Some( + datastore + .tuf_repo_get_by_id(opctx, repo_id.into()) + .await + .internal_context("fetching 
target release repo")? + .into_external(), + ), + }; + let prev_release = if let Some(prev) = target_release.generation.prev() + { + datastore + .target_release_get_generation(opctx, Generation(prev)) + .await + .internal_context("fetching current target release")? + } else { + None + }; + let old_repo = match prev_release.and_then(|r| r.tuf_repo_id) { + None => None, + Some(repo_id) => Some( + datastore + .tuf_repo_get_by_id(opctx, repo_id.into()) + .await + .internal_context("fetching target release repo")? + .into_external(), + ), + }; let oximeter_read_policy = datastore .oximeter_read_policy_get_latest(opctx) @@ -171,6 +207,8 @@ impl PlanningInputFromDb<'_> { cockroachdb_settings: &cockroachdb_settings, clickhouse_policy, oximeter_read_policy, + tuf_repo, + old_repo, } .build() .internal_context("assembling planning_input")?; @@ -194,6 +232,8 @@ impl PlanningInputFromDb<'_> { .target_crucible_pantry_zone_count, clickhouse_policy: self.clickhouse_policy.clone(), oximeter_read_policy: self.oximeter_read_policy.clone(), + tuf_repo: self.tuf_repo.clone(), + old_repo: self.old_repo.clone(), }; let mut builder = PlanningInputBuilder::new( policy, diff --git a/nexus/src/app/background/tasks/tuf_artifact_replication.rs b/nexus/src/app/background/tasks/tuf_artifact_replication.rs index 0ede39b6bdc..4f7fd5dd70d 100644 --- a/nexus/src/app/background/tasks/tuf_artifact_replication.rs +++ b/nexus/src/app/background/tasks/tuf_artifact_replication.rs @@ -590,18 +590,13 @@ impl ArtifactReplication { &self, opctx: &OpContext, ) -> Result<(ArtifactConfig, Inventory)> { - let generation = - self.datastore.update_tuf_generation_get(opctx).await?; + let generation = self.datastore.tuf_get_generation(opctx).await?; let mut inventory = Inventory::default(); let mut paginator = Paginator::new(SQL_BATCH_SIZE); while let Some(p) = paginator.next() { let batch = self .datastore - .update_tuf_artifact_list( - opctx, - generation, - &p.current_pagparams(), - ) + .tuf_list_repos(opctx, generation, &p.current_pagparams()) .await?; paginator = p.found_batch(&batch, &|a| a.id.into_untyped_uuid()); for artifact in batch { diff --git a/nexus/src/app/update.rs b/nexus/src/app/update.rs index d0e1d05e5d8..a843fb072b6 100644 --- a/nexus/src/app/update.rs +++ b/nexus/src/app/update.rs @@ -43,7 +43,7 @@ impl super::Nexus { // Now store the artifacts in the database. let response = self .db_datastore - .update_tuf_repo_insert(opctx, artifacts_with_plan.description()) + .tuf_repo_insert(opctx, artifacts_with_plan.description()) .await .map_err(HttpError::from)?; @@ -88,7 +88,7 @@ impl super::Nexus { let tuf_repo_description = self .db_datastore - .update_tuf_repo_get(opctx, system_version.into()) + .tuf_repo_get_by_version(opctx, system_version.into()) .await .map_err(HttpError::from)?; diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index 8cd800aeea1..fe99192e79f 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -6575,7 +6575,7 @@ impl NexusExternalApi for NexusExternalApiImpl { // Fetch the TUF repo metadata and update the target release. let tuf_repo_id = nexus .datastore() - .update_tuf_repo_get(&opctx, system_version.into()) + .tuf_repo_get_by_version(&opctx, system_version.into()) .await? 
.repo .id; diff --git a/nexus/tests/integration_tests/updates.rs b/nexus/tests/integration_tests/updates.rs index d6cc504fa11..3c87d3e3150 100644 --- a/nexus/tests/integration_tests/updates.rs +++ b/nexus/tests/integration_tests/updates.rs @@ -114,7 +114,7 @@ async fn test_repo_upload() -> Result<()> { let opctx = OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); assert_eq!( - datastore.update_tuf_generation_get(&opctx).await.unwrap(), + datastore.tuf_get_generation(&opctx).await.unwrap(), 1u32.into() ); @@ -162,7 +162,7 @@ async fn test_repo_upload() -> Result<()> { })); // The generation number should now be 2. assert_eq!( - datastore.update_tuf_generation_get(&opctx).await.unwrap(), + datastore.tuf_get_generation(&opctx).await.unwrap(), 2u32.into() ); @@ -221,7 +221,7 @@ async fn test_repo_upload() -> Result<()> { // We didn't insert a new repo, so the generation number should still be 2. assert_eq!( - datastore.update_tuf_generation_get(&opctx).await.unwrap(), + datastore.tuf_get_generation(&opctx).await.unwrap(), 2u32.into() ); @@ -367,7 +367,7 @@ async fn test_repo_upload() -> Result<()> { } // No artifacts changed, so the generation number should still be 2... assert_eq!( - datastore.update_tuf_generation_get(&opctx).await.unwrap(), + datastore.tuf_get_generation(&opctx).await.unwrap(), 2u32.into() ); // ... and the task should have nothing to do and should immediately diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 61f4e38255a..0dcf38f49d2 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -26,6 +26,7 @@ use nexus_sled_agent_shared::inventory::OmicronZoneImageSource; use nexus_sled_agent_shared::inventory::ZoneKind; use omicron_common::api::external::ByteCount; use omicron_common::api::external::Generation; +use omicron_common::api::external::TufArtifactMeta; use omicron_common::api::internal::shared::DatasetKind; use omicron_common::disk::CompressionAlgorithm; use omicron_common::disk::DatasetConfig; @@ -958,7 +959,7 @@ impl fmt::Display for BlueprintZoneDisposition { } } -/// Where a blueprint's image source is located. +/// Where the zone's image source is located. /// /// This is the blueprint version of [`OmicronZoneImageSource`]. 
#[derive( @@ -997,6 +998,17 @@ pub enum BlueprintZoneImageSource { Artifact { version: BlueprintZoneImageVersion, hash: ArtifactHash }, } +impl BlueprintZoneImageSource { + pub fn from_available_artifact(artifact: &TufArtifactMeta) -> Self { + BlueprintZoneImageSource::Artifact { + version: BlueprintZoneImageVersion::Available { + version: artifact.id.version.clone(), + }, + hash: artifact.hash, + } + } +} + impl From for OmicronZoneImageSource { fn from(source: BlueprintZoneImageSource) -> Self { match source { diff --git a/nexus/types/src/deployment/planning_input.rs b/nexus/types/src/deployment/planning_input.rs index d6faffd6961..86e9fb02d94 100644 --- a/nexus/types/src/deployment/planning_input.rs +++ b/nexus/types/src/deployment/planning_input.rs @@ -23,6 +23,7 @@ use omicron_common::address::IpRange; use omicron_common::address::Ipv6Subnet; use omicron_common::address::SLED_PREFIX; use omicron_common::api::external::Generation; +use omicron_common::api::external::TufRepoDescription; use omicron_common::api::internal::shared::SourceNatConfigError; use omicron_common::disk::DiskIdentity; use omicron_common::policy::SINGLE_NODE_CLICKHOUSE_REDUNDANCY; @@ -152,6 +153,14 @@ impl PlanningInput { .unwrap_or(0) } + pub fn tuf_repo(&self) -> Option<&TufRepoDescription> { + self.policy.tuf_repo.as_ref() + } + + pub fn old_repo(&self) -> Option<&TufRepoDescription> { + self.policy.old_repo.as_ref() + } + pub fn service_ip_pool_ranges(&self) -> &[IpRange] { &self.policy.service_ip_pool_ranges } @@ -918,6 +927,19 @@ pub struct Policy { /// Eventually we will only allow reads from a cluster and this policy will /// no longer exist. pub oximeter_read_policy: OximeterReadPolicy, + + /// Desired system software release repository. + /// + /// New zones may use artifacts in this repo as their image sources, + /// and at most one extant zone may be modified to use it or replaced + /// with one that does. + pub tuf_repo: Option, + + /// Previous system software release repository. + /// + /// New zones deployed mid-update may use artifacts in this repo as + /// their image sources. See RFD 565 §9. + pub old_repo: Option, } #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] @@ -1087,6 +1109,8 @@ impl PlanningInputBuilder { target_crucible_pantry_zone_count: 0, clickhouse_policy: None, oximeter_read_policy: OximeterReadPolicy::new(1), + tuf_repo: None, + old_repo: None, }, internal_dns_version: Generation::new(), external_dns_version: Generation::new(), diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index b69473e1385..cf24ee4bb5f 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -2794,7 +2794,7 @@ ] }, "BlueprintZoneImageSource": { - "description": "Where a blueprint's image source is located.\n\nThis is the blueprint version of [`OmicronZoneImageSource`].", + "description": "Where the zone's image source is located.\n\nThis is the blueprint version of [`OmicronZoneImageSource`].", "oneOf": [ { "description": "This zone's image source is whatever happens to be on the sled's \"install\" dataset.\n\nThis is whatever was put in place at the factory or by the latest MUPdate. The image used here can vary by sled and even over time (if the sled gets MUPdated again).\n\nHistorically, this was the only source for zone images. In an system with automated control-plane-driven update we expect to only use this variant in emergencies where the system had to be recovered via MUPdate.",
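
One closing note on the update ordering introduced in `update_sequence.rs` and used by `sort_zones_by_deps`: `OrderedComponent` derives `Ord`, and Rust's derived ordering compares enum variants by declaration order (tuple variants are then compared by their payload), which is what makes host OS and SP/RoT sort before zones and lets the zone kinds' own ordering break ties. A small sketch of the same pattern, using a trimmed-down stand-in for `ZoneKind` whose declaration order is an assumption chosen for illustration (the real enum has more variants and supplies its own ordering):

```rust
// Trimmed-down stand-ins; the real omicron enums have more variants, and
// their declaration order is what actually drives the update sequence.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
enum ZoneKind {
    CruciblePantry,
    InternalDns,
    Nexus, // assumed last here so it sorts after its dependencies
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
enum OrderedComponent {
    HostOs,
    SpRot,
    OmicronZone(ZoneKind),
}

fn main() {
    let mut components = vec![
        OrderedComponent::OmicronZone(ZoneKind::Nexus),
        OrderedComponent::OmicronZone(ZoneKind::CruciblePantry),
        OrderedComponent::SpRot,
        OrderedComponent::HostOs,
    ];
    // Derived `Ord` sorts by variant declaration order first, then by the
    // payload's ordering for `OmicronZone`, so Nexus lands last.
    components.sort();
    assert_eq!(components[0], OrderedComponent::HostOs);
    assert_eq!(
        *components.last().unwrap(),
        OrderedComponent::OmicronZone(ZoneKind::Nexus)
    );
    println!("{components:?}");
}
```
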