Skip to content

Commit e2b70bf

Browse files
authored
[34/n] sled-agent logic to clear mupdate overrides (#8572)
This PR implements logic within sled-agent to clear mupdate overrides. Includes tests, database storage, and displayers. This logic by itself does not introduce behavior changes, since the code to actually set this field is in #8456.
1 parent e7fe823 commit e2b70bf

File tree

26 files changed

+2124
-25
lines changed

26 files changed

+2124
-25
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1189,6 +1189,7 @@ LEDGERED SLED CONFIG
11891189
slot A details UNAVAILABLE: constructed via debug_assume_success()
11901190
slot B details UNAVAILABLE: constructed via debug_assume_success()
11911191
last reconciled config: matches ledgered config
1192+
no mupdate override to clear
11921193
no orphaned datasets
11931194
all disks reconciled successfully
11941195
all datasets reconciled successfully
@@ -1296,6 +1297,7 @@ LEDGERED SLED CONFIG
12961297
slot A details UNAVAILABLE: constructed via debug_assume_success()
12971298
slot B details UNAVAILABLE: constructed via debug_assume_success()
12981299
last reconciled config: matches ledgered config
1300+
no mupdate override to clear
12991301
no orphaned datasets
13001302
all disks reconciled successfully
13011303
all datasets reconciled successfully
@@ -1496,6 +1498,7 @@ LEDGERED SLED CONFIG
14961498
slot A details UNAVAILABLE: constructed via debug_assume_success()
14971499
slot B details UNAVAILABLE: constructed via debug_assume_success()
14981500
last reconciled config: matches ledgered config
1501+
no mupdate override to clear
14991502
no orphaned datasets
15001503
all disks reconciled successfully
15011504
all datasets reconciled successfully

dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,7 @@ LEDGERED SLED CONFIG
182182
slot A details UNAVAILABLE: constructed via debug_assume_success()
183183
slot B details UNAVAILABLE: constructed via debug_assume_success()
184184
last reconciled config: matches ledgered config
185+
error reading mupdate override, so sled agent didn't attempt to clear it
185186
no orphaned datasets
186187
all disks reconciled successfully
187188
all datasets reconciled successfully
@@ -288,6 +289,7 @@ LEDGERED SLED CONFIG
288289
slot A details UNAVAILABLE: constructed via debug_assume_success()
289290
slot B details UNAVAILABLE: constructed via debug_assume_success()
290291
last reconciled config: matches ledgered config
292+
mupdate override present, but sled agent was not instructed to clear it
291293
no orphaned datasets
292294
all disks reconciled successfully
293295
all datasets reconciled successfully
@@ -383,6 +385,7 @@ LEDGERED SLED CONFIG
383385
slot A details UNAVAILABLE: constructed via debug_assume_success()
384386
slot B details UNAVAILABLE: constructed via debug_assume_success()
385387
last reconciled config: matches ledgered config
388+
mupdate override present, but sled agent was not instructed to clear it
386389
no orphaned datasets
387390
all disks reconciled successfully
388391
all datasets reconciled successfully

nexus-sled-agent-shared/src/inventory.rs

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,10 @@ pub struct ConfigReconcilerInventory {
143143
pub orphaned_datasets: IdOrdMap<OrphanedDataset>,
144144
pub zones: BTreeMap<OmicronZoneUuid, ConfigReconcilerInventoryResult>,
145145
pub boot_partitions: BootPartitionContents,
146+
/// The result of clearing the mupdate override field.
147+
///
148+
/// `None` if `remove_mupdate_override` was not provided in the sled config.
149+
pub clear_mupdate_override: Option<ClearMupdateOverrideInventory>,
146150
}
147151

148152
impl ConfigReconcilerInventory {
@@ -200,6 +204,17 @@ impl ConfigReconcilerInventory {
200204
.iter()
201205
.map(|z| (z.id, ConfigReconcilerInventoryResult::Ok))
202206
.collect();
207+
let clear_mupdate_override = config.remove_mupdate_override.map(|_| {
208+
ClearMupdateOverrideInventory {
209+
boot_disk_result: Ok(
210+
ClearMupdateOverrideBootSuccessInventory::Cleared,
211+
),
212+
non_boot_message: "mupdate override successfully cleared \
213+
on non-boot disks"
214+
.to_owned(),
215+
}
216+
});
217+
203218
Self {
204219
last_reconciled_config: config,
205220
external_disks,
@@ -216,6 +231,7 @@ impl ConfigReconcilerInventory {
216231
slot_b: Err(err),
217232
}
218233
},
234+
clear_mupdate_override,
219235
}
220236
}
221237
}
@@ -277,6 +293,37 @@ impl IdOrdItem for OrphanedDataset {
277293
id_upcast!();
278294
}
279295

296+
/// Status of clearing the mupdate override in the inventory.
297+
#[derive(Clone, Debug, PartialEq, Eq, Deserialize, JsonSchema, Serialize)]
298+
pub struct ClearMupdateOverrideInventory {
299+
/// The result of clearing the mupdate override on the boot disk.
300+
#[serde(with = "snake_case_result")]
301+
#[schemars(
302+
schema_with = "SnakeCaseResult::<ClearMupdateOverrideBootSuccessInventory, String>::json_schema"
303+
)]
304+
pub boot_disk_result:
305+
Result<ClearMupdateOverrideBootSuccessInventory, String>,
306+
307+
/// What happened on non-boot disks.
308+
///
309+
/// We aren't modeling this out in more detail, because we plan to not try
310+
/// and keep ledgered data in sync across both disks in the future.
311+
pub non_boot_message: String,
312+
}
313+
314+
/// Status of clearing the mupdate override on the boot disk.
315+
#[derive(Clone, Debug, PartialEq, Eq, Deserialize, JsonSchema, Serialize)]
316+
#[serde(rename_all = "snake_case")]
317+
pub enum ClearMupdateOverrideBootSuccessInventory {
318+
/// The mupdate override was successfully cleared.
319+
Cleared,
320+
321+
/// No mupdate override was found.
322+
///
323+
/// This is considered a success for idempotency reasons.
324+
NoOverride,
325+
}
326+
280327
#[derive(Clone, Debug, PartialEq, Eq, Deserialize, JsonSchema, Serialize)]
281328
#[serde(tag = "result", rename_all = "snake_case")]
282329
pub enum ConfigReconcilerInventoryResult {

nexus/db-model/src/inventory.rs

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@ use nexus_db_schema::schema::{
4343
};
4444
use nexus_sled_agent_shared::inventory::BootImageHeader;
4545
use nexus_sled_agent_shared::inventory::BootPartitionDetails;
46+
use nexus_sled_agent_shared::inventory::ClearMupdateOverrideBootSuccessInventory;
47+
use nexus_sled_agent_shared::inventory::ClearMupdateOverrideInventory;
4648
use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryStatus;
4749
use nexus_sled_agent_shared::inventory::HostPhase2DesiredContents;
4850
use nexus_sled_agent_shared::inventory::HostPhase2DesiredSlots;
@@ -998,6 +1000,8 @@ pub struct InvSledConfigReconciler {
9981000
boot_disk_error: Option<String>,
9991001
pub boot_partition_a_error: Option<String>,
10001002
pub boot_partition_b_error: Option<String>,
1003+
#[diesel(embed)]
1004+
pub clear_mupdate_override: InvClearMupdateOverride,
10011005
}
10021006

10031007
impl InvSledConfigReconciler {
@@ -1008,6 +1012,7 @@ impl InvSledConfigReconciler {
10081012
boot_disk: Result<M2Slot, String>,
10091013
boot_partition_a_error: Option<String>,
10101014
boot_partition_b_error: Option<String>,
1015+
clear_mupdate_override: InvClearMupdateOverride,
10111016
) -> Self {
10121017
// TODO-cleanup We should use `HwM2Slot` instead of integers for this
10131018
// column: https://github.com/oxidecomputer/omicron/issues/8642
@@ -1025,6 +1030,7 @@ impl InvSledConfigReconciler {
10251030
boot_disk_error,
10261031
boot_partition_a_error,
10271032
boot_partition_b_error,
1033+
clear_mupdate_override,
10281034
}
10291035
}
10301036

@@ -1064,6 +1070,104 @@ impl InvSledConfigReconciler {
10641070
}
10651071
}
10661072

1073+
// Database counterpart of
// [`nexus_sled_agent_shared::inventory::ClearMupdateOverrideBootSuccessInventory`].
1074+
impl_enum_type!(
1075+
ClearMupdateOverrideBootSuccessEnum:
1076+
1077+
#[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, PartialEq)]
1078+
pub enum DbClearMupdateOverrideBootSuccess;
1079+
1080+
// Enum values
// NOTE(review): the `no-override` label is hyphenated, whereas the serde form
// of the inventory enum (`rename_all = "snake_case"`) is `no_override`.
// Confirm the label matches the SQL enum definition exactly.
1081+
Cleared => b"cleared"
1082+
NoOverride => b"no-override"
1083+
);
1084+
1085+
/// Maps the sled-agent inventory value onto the database enum, variant for
/// variant (`Cleared` -> `Cleared`, `NoOverride` -> `NoOverride`).
impl From<ClearMupdateOverrideBootSuccessInventory>
1086+
for DbClearMupdateOverrideBootSuccess
1087+
{
1088+
fn from(value: ClearMupdateOverrideBootSuccessInventory) -> Self {
1089+
match value {
1090+
ClearMupdateOverrideBootSuccessInventory::Cleared => Self::Cleared,
1091+
ClearMupdateOverrideBootSuccessInventory::NoOverride => {
1092+
Self::NoOverride
1093+
}
1094+
}
1095+
}
1096+
}
1097+
1098+
/// Maps the database enum back onto the sled-agent inventory value, variant
/// for variant; the inverse of the conversion in the other direction.
impl From<DbClearMupdateOverrideBootSuccess>
1099+
for ClearMupdateOverrideBootSuccessInventory
1100+
{
1101+
fn from(value: DbClearMupdateOverrideBootSuccess) -> Self {
1102+
match value {
1103+
DbClearMupdateOverrideBootSuccess::Cleared => Self::Cleared,
1104+
DbClearMupdateOverrideBootSuccess::NoOverride => Self::NoOverride,
1105+
}
1106+
}
1107+
}
1108+
1109+
/// See [`nexus_sled_agent_shared::inventory::ClearMupdateOverrideInventory`].
///
/// These three nullable columns live in the `inv_sled_config_reconciler`
/// table. Only three combinations are accepted by
/// [`InvClearMupdateOverride::into_inventory`]:
///
/// * all three `None`: no clear-mupdate-override result was recorded;
/// * `boot_success` and `non_boot_message` set, `boot_error` unset: clearing
///   on the boot disk succeeded;
/// * `boot_error` and `non_boot_message` set, `boot_success` unset: clearing
///   on the boot disk failed.
1110+
#[derive(Queryable, Clone, Debug, Selectable, Insertable)]
1111+
#[diesel(table_name = inv_sled_config_reconciler)]
1112+
pub struct InvClearMupdateOverride {
/// The boot disk outcome when the boot disk result was `Ok`.
1113+
#[diesel(column_name = clear_mupdate_override_boot_success)]
1114+
pub boot_success: Option<DbClearMupdateOverrideBootSuccess>,
1115+
/// The error string when the boot disk result was `Err`.
1116+
#[diesel(column_name = clear_mupdate_override_boot_error)]
1117+
pub boot_error: Option<String>,
1118+
/// The non-boot disk message; set whenever a result was recorded at all.
1119+
#[diesel(column_name = clear_mupdate_override_non_boot_message)]
1120+
pub non_boot_message: Option<String>,
1121+
}
1122+
1123+
impl InvClearMupdateOverride {
/// Builds the database columns from an optional inventory value.
///
/// With `None`, all three columns are `None`. With `Some`, the boot disk
/// `Result` is split across two columns (`boot_success` for `Ok`,
/// `boot_error` for `Err`, so exactly one of them is set), and
/// `non_boot_message` is always set.
1124+
pub fn new(
1125+
clear_mupdate_override: Option<&ClearMupdateOverrideInventory>,
1126+
) -> Self {
1127+
let boot_success = clear_mupdate_override.and_then(|inv| {
1128+
inv.boot_disk_result.as_ref().ok().map(|v| v.clone().into())
1129+
});
1130+
let boot_error = clear_mupdate_override
1131+
.and_then(|inv| inv.boot_disk_result.as_ref().err().cloned());
1132+
let non_boot_message =
1133+
clear_mupdate_override.map(|inv| inv.non_boot_message.clone());
1134+
1135+
Self { boot_success, boot_error, non_boot_message }
1136+
}
1137+
/// Converts the database columns back into the inventory representation.
///
/// Returns `Ok(None)` when all three columns are `None`, and `Ok(Some(_))`
/// when `non_boot_message` is set together with exactly one of
/// `boot_success` (rebuilt as an `Ok` boot disk result) or `boot_error`
/// (rebuilt as an `Err` boot disk result).
///
/// # Errors
///
/// Any other combination of columns is rejected. The error message
/// attributes this to a violated CHECK constraint on
/// `inv_sled_config_reconciler` and includes the offending values.
1138+
pub fn into_inventory(
1139+
self,
1140+
) -> anyhow::Result<Option<ClearMupdateOverrideInventory>> {
1141+
match self {
// Boot disk clear succeeded.
1142+
Self {
1143+
boot_success: Some(success),
1144+
boot_error: None,
1145+
non_boot_message: Some(non_boot_message),
1146+
} => Ok(Some(ClearMupdateOverrideInventory {
1147+
boot_disk_result: Ok(success.into()),
1148+
non_boot_message,
1149+
})),
// Boot disk clear failed.
1150+
Self {
1151+
boot_success: None,
1152+
boot_error: Some(boot_error),
1153+
non_boot_message: Some(non_boot_message),
1154+
} => Ok(Some(ClearMupdateOverrideInventory {
1155+
boot_disk_result: Err(boot_error),
1156+
non_boot_message,
1157+
})),
// Nothing was recorded.
1158+
Self {
1159+
boot_success: None,
1160+
boot_error: None,
1161+
non_boot_message: None,
1162+
} => Ok(None),
// Every remaining combination is inconsistent (for example both
// success and error set, or a missing non-boot message).
1163+
this => Err(anyhow!(
1164+
"inv_sled_config_reconciler CHECK constraint violated: \
1165+
clear mupdate override columns are not consistent: {this:?}"
1166+
)),
1167+
}
1168+
}
1169+
}
1170+
10671171
/// See [`nexus_sled_agent_shared::inventory::BootPartitionDetails`].
10681172
#[derive(Queryable, Clone, Debug, Selectable, Insertable)]
10691173
#[diesel(table_name = inv_sled_boot_partition)]

nexus/db-model/src/schema_versions.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock};
1616
///
1717
/// This must be updated when you change the database schema. Refer to
1818
/// schema/crdb/README.adoc in the root of this repository for details.
19-
pub const SCHEMA_VERSION: Version = Version::new(170, 0, 0);
19+
pub const SCHEMA_VERSION: Version = Version::new(171, 0, 0);
2020

2121
/// List of all past database schema versions, in *reverse* order
2222
///
@@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock<Vec<KnownVersion>> = LazyLock::new(|| {
2828
// | leaving the first copy as an example for the next person.
2929
// v
3030
// KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
31+
KnownVersion::new(171, "inv-clear-mupdate-override"),
3132
KnownVersion::new(170, "add-pending-mgs-updates-rot-bootloader"),
3233
KnownVersion::new(169, "inv-ntp-timesync"),
3334
KnownVersion::new(168, "add-inv-host-phase-1-flash-hash"),

nexus/db-queries/src/db/datastore/inventory.rs

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@ use nexus_db_errors::public_error_from_diesel;
3232
use nexus_db_errors::public_error_from_diesel_lookup;
3333
use nexus_db_model::ArtifactHash;
3434
use nexus_db_model::HwM2Slot;
35-
use nexus_db_model::InvCaboose;
3635
use nexus_db_model::InvClickhouseKeeperMembership;
3736
use nexus_db_model::InvCockroachStatus;
3837
use nexus_db_model::InvCollection;
@@ -73,6 +72,7 @@ use nexus_db_model::{
7372
};
7473
use nexus_db_model::{HwPowerState, InvZoneManifestNonBoot};
7574
use nexus_db_model::{HwRotSlot, InvMupdateOverrideNonBoot};
75+
use nexus_db_model::{InvCaboose, InvClearMupdateOverride};
7676
use nexus_db_schema::enums::HwM2SlotEnum;
7777
use nexus_db_schema::enums::HwRotSlotEnum;
7878
use nexus_db_schema::enums::RotImageErrorEnum;
@@ -3734,6 +3734,13 @@ impl DataStore {
37343734
BootPartitionContents { boot_disk, slot_a, slot_b }
37353735
};
37363736

3737+
let clear_mupdate_override = reconciler
3738+
.clear_mupdate_override
3739+
.into_inventory()
3740+
.map_err(|err| {
3741+
Error::internal_error(&format!("{err:#}"))
3742+
})?;
3743+
37373744
Ok::<_, Error>(ConfigReconcilerInventory {
37383745
last_reconciled_config,
37393746
external_disks: last_reconciliation_disk_results
@@ -3750,6 +3757,7 @@ impl DataStore {
37503757
.remove(&sled_id)
37513758
.unwrap_or_default(),
37523759
boot_partitions,
3760+
clear_mupdate_override,
37533761
})
37543762
})
37553763
.transpose()?;
@@ -3972,6 +3980,9 @@ impl ConfigReconcilerRows {
39723980
)?
39733981
};
39743982
last_reconciliation_config_id = Some(last_reconciled_config);
3983+
let clear_mupdate_override = InvClearMupdateOverride::new(
3984+
last_reconciliation.clear_mupdate_override.as_ref(),
3985+
);
39753986

39763987
self.config_reconcilers.push(InvSledConfigReconciler::new(
39773988
collection_id,
@@ -3990,6 +4001,7 @@ impl ConfigReconcilerRows {
39904001
.as_ref()
39914002
.err()
39924003
.cloned(),
4004+
clear_mupdate_override,
39934005
));
39944006

39954007
// Boot partition _errors_ are kept in `InvSledConfigReconciler`
@@ -4238,10 +4250,13 @@ mod test {
42384250
use nexus_inventory::examples::Representative;
42394251
use nexus_inventory::examples::representative;
42404252
use nexus_inventory::now_db_precision;
4241-
use nexus_sled_agent_shared::inventory::BootImageHeader;
42424253
use nexus_sled_agent_shared::inventory::BootPartitionContents;
42434254
use nexus_sled_agent_shared::inventory::BootPartitionDetails;
42444255
use nexus_sled_agent_shared::inventory::OrphanedDataset;
4256+
use nexus_sled_agent_shared::inventory::{
4257+
BootImageHeader, ClearMupdateOverrideBootSuccessInventory,
4258+
ClearMupdateOverrideInventory,
4259+
};
42454260
use nexus_sled_agent_shared::inventory::{
42464261
ConfigReconcilerInventory, ConfigReconcilerInventoryResult,
42474262
ConfigReconcilerInventoryStatus, OmicronZoneImageSource,
@@ -5103,6 +5118,15 @@ mod test {
51035118
artifact_size: 456789,
51045119
}),
51055120
},
5121+
clear_mupdate_override: Some(
5122+
ClearMupdateOverrideInventory {
5123+
boot_disk_result: Ok(
5124+
ClearMupdateOverrideBootSuccessInventory::Cleared,
5125+
),
5126+
non_boot_message: "simulated non-boot message"
5127+
.to_owned(),
5128+
},
5129+
),
51065130
}
51075131
});
51085132

nexus/db-schema/src/enums.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ define_enums! {
3333
BpZoneDispositionEnum => "bp_zone_disposition",
3434
BpZoneImageSourceEnum => "bp_zone_image_source",
3535
CabooseWhichEnum => "caboose_which",
36+
ClearMupdateOverrideBootSuccessEnum => "clear_mupdate_override_boot_success",
3637
ClickhouseModeEnum => "clickhouse_mode",
3738
DatasetKindEnum => "dataset_kind",
3839
DnsGroupEnum => "dns_group",

nexus/db-schema/src/schema.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1639,6 +1639,10 @@ table! {
16391639

16401640
boot_partition_a_error -> Nullable<Text>,
16411641
boot_partition_b_error -> Nullable<Text>,
1642+
1643+
clear_mupdate_override_boot_success -> Nullable<crate::enums::ClearMupdateOverrideBootSuccessEnum>,
1644+
clear_mupdate_override_boot_error -> Nullable<Text>,
1645+
clear_mupdate_override_non_boot_message -> Nullable<Text>,
16421646
}
16431647
}
16441648

0 commit comments

Comments
 (0)