Skip to content

Commit 40fc383

Browse files
authored
Fix blueprint planner's validation of PlanningInput external networking (#6599)
The "Check the planning input" block of code needs to consider _all_ zones in the parent blueprint, including expunged zones. #6483 fixed the planner's ability to reuse external IPs from expunged zones, but accidentally made these checks incorrect, because it's certainly valid for the planning input to still have records for expunged zones. See #6581 (comment) for more context. We also get to remove the somewhat-awkward `update_network_resources_from_blueprint()` test helper that was needed to make some tests pass (because we didn't realize the checks being performed here were wrong).
1 parent 88030ff commit 40fc383

File tree

6 files changed

+279
-150
lines changed

6 files changed

+279
-150
lines changed

nexus/reconfigurator/planning/src/blueprint_builder/builder.rs

Lines changed: 50 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1542,6 +1542,7 @@ pub mod test {
15421542
pub const DEFAULT_N_SLEDS: usize = 3;
15431543

15441544
/// Checks various conditions that should be true for all blueprints
1545+
#[track_caller]
15451546
pub fn verify_blueprint(blueprint: &Blueprint) {
15461547
// There should be no duplicate underlay IPs.
15471548
let mut underlay_ips: BTreeMap<Ipv6Addr, &BlueprintZoneConfig> =
@@ -1592,6 +1593,28 @@ pub mod test {
15921593
}
15931594
}
15941595

1596+
#[track_caller]
1597+
pub fn assert_planning_makes_no_changes(
1598+
log: &Logger,
1599+
blueprint: &Blueprint,
1600+
input: &PlanningInput,
1601+
test_name: &'static str,
1602+
) {
1603+
let builder =
1604+
BlueprintBuilder::new_based_on(&log, &blueprint, &input, test_name)
1605+
.expect("failed to create builder");
1606+
let child_blueprint = builder.build();
1607+
verify_blueprint(&child_blueprint);
1608+
let diff = child_blueprint.diff_since_blueprint(&blueprint);
1609+
println!(
1610+
"diff between blueprints (expected no changes):\n{}",
1611+
diff.display()
1612+
);
1613+
assert_eq!(diff.sleds_added.len(), 0);
1614+
assert_eq!(diff.sleds_removed.len(), 0);
1615+
assert_eq!(diff.sleds_modified.len(), 0);
1616+
}
1617+
15951618
#[test]
15961619
fn test_initial() {
15971620
// Test creating a blueprint from a collection and verifying that it
@@ -1619,24 +1642,13 @@ pub mod test {
16191642
assert_eq!(diff.sleds_removed.len(), 0);
16201643
assert_eq!(diff.sleds_modified.len(), 0);
16211644

1622-
// Test a no-op blueprint.
1623-
let builder = BlueprintBuilder::new_based_on(
1645+
// Test a no-op planning iteration.
1646+
assert_planning_makes_no_changes(
16241647
&logctx.log,
16251648
&blueprint_initial,
16261649
&input,
1627-
"test_basic",
1628-
)
1629-
.expect("failed to create builder");
1630-
let blueprint = builder.build();
1631-
verify_blueprint(&blueprint);
1632-
let diff = blueprint.diff_since_blueprint(&blueprint_initial);
1633-
println!(
1634-
"initial blueprint -> next blueprint (expected no changes):\n{}",
1635-
diff.display()
1650+
TEST_NAME,
16361651
);
1637-
assert_eq!(diff.sleds_added.len(), 0);
1638-
assert_eq!(diff.sleds_removed.len(), 0);
1639-
assert_eq!(diff.sleds_modified.len(), 0);
16401652

16411653
logctx.cleanup_successful();
16421654
}
@@ -1783,6 +1795,14 @@ pub mod test {
17831795
.collect()
17841796
);
17851797

1798+
// Test a no-op planning iteration.
1799+
assert_planning_makes_no_changes(
1800+
&logctx.log,
1801+
&blueprint3,
1802+
&input,
1803+
TEST_NAME,
1804+
);
1805+
17861806
logctx.cleanup_successful();
17871807
}
17881808

@@ -1860,6 +1880,14 @@ pub mod test {
18601880
None,
18611881
);
18621882

1883+
// Test a no-op planning iteration.
1884+
assert_planning_makes_no_changes(
1885+
&logctx.log,
1886+
&blueprint3,
1887+
&input,
1888+
TEST_NAME,
1889+
);
1890+
18631891
logctx.cleanup_successful();
18641892
}
18651893

@@ -2367,6 +2395,14 @@ pub mod test {
23672395
num_sled_zpools
23682396
);
23692397

2398+
// Test a no-op planning iteration.
2399+
assert_planning_makes_no_changes(
2400+
&logctx.log,
2401+
&blueprint,
2402+
&input,
2403+
TEST_NAME,
2404+
);
2405+
23702406
// If we instead ask for one more zone than there are zpools, we should
23712407
// get a zpool allocation error.
23722408
let mut builder = BlueprintBuilder::new_based_on(

nexus/reconfigurator/planning/src/blueprint_builder/external_networking.rs

Lines changed: 148 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -152,74 +152,17 @@ impl<'a> BuilderExternalNetworking<'a> {
152152
// Check the planning input: there shouldn't be any external networking
153153
// resources in the database (the source of `input`) that we don't know
154154
// about from the parent blueprint.
155-
for external_ip_entry in
156-
input.network_resources().omicron_zone_external_ips()
157-
{
158-
// As above, ignore localhost (used by the test suite).
159-
if external_ip_entry.ip.ip().is_loopback() {
160-
continue;
161-
}
162-
if !external_ip_alloc.contains(&external_ip_entry.ip)? {
163-
bail!(
164-
"planning input contains unexpected external IP \
165-
(IP not found in parent blueprint): {external_ip_entry:?}"
166-
);
167-
}
168-
}
169-
for nic_entry in input.network_resources().omicron_zone_nics() {
170-
if !used_macs.contains(&nic_entry.nic.mac) {
171-
bail!(
172-
"planning input contains unexpected NIC \
173-
(MAC not found in parent blueprint): {nic_entry:?}"
174-
);
175-
}
176-
match nic_entry.nic.ip {
177-
IpAddr::V4(ip) if NEXUS_OPTE_IPV4_SUBNET.contains(ip) => {
178-
if !existing_nexus_v4_ips.contains(&ip) {
179-
bail!(
180-
"planning input contains unexpected NIC \
181-
(IP not found in parent blueprint): {nic_entry:?}"
182-
);
183-
}
184-
}
185-
IpAddr::V4(ip) if NTP_OPTE_IPV4_SUBNET.contains(ip) => {
186-
if !existing_boundary_ntp_v4_ips.contains(&ip) {
187-
bail!(
188-
"planning input contains unexpected NIC \
189-
(IP not found in parent blueprint): {nic_entry:?}"
190-
);
191-
}
192-
}
193-
IpAddr::V4(ip) if DNS_OPTE_IPV4_SUBNET.contains(ip) => {
194-
// TODO check existing_dns_v4_ips, once it exists
195-
}
196-
IpAddr::V6(ip) if NEXUS_OPTE_IPV6_SUBNET.contains(ip) => {
197-
if !existing_nexus_v6_ips.contains(&ip) {
198-
bail!(
199-
"planning input contains unexpected NIC \
200-
(IP not found in parent blueprint): {nic_entry:?}"
201-
);
202-
}
203-
}
204-
IpAddr::V6(ip) if NTP_OPTE_IPV6_SUBNET.contains(ip) => {
205-
if !existing_boundary_ntp_v6_ips.contains(&ip) {
206-
bail!(
207-
"planning input contains unexpected NIC \
208-
(IP not found in parent blueprint): {nic_entry:?}"
209-
);
210-
}
211-
}
212-
IpAddr::V6(ip) if DNS_OPTE_IPV6_SUBNET.contains(ip) => {
213-
// TODO check existing_dns_v6_ips, once it exists
214-
}
215-
_ => {
216-
bail!(
217-
"planning input contains unexpected NIC \
218-
(IP not contained in known OPTE subnet): {nic_entry:?}"
219-
)
220-
}
221-
}
222-
}
155+
//
156+
// Logically this could be the first thing we do in this function, but
157+
// we have some tests that check error cases in the above block that
158+
// would also fail these checks (so reordering the checks would require
159+
// those tests to do more work to construct valid `input`s), and we
160+
// never expect this to fail in practice, so there's no use in "failing
161+
// fast".
162+
ensure_input_records_appear_in_parent_blueprint(
163+
parent_blueprint,
164+
input,
165+
)?;
223166

224167
// TODO-performance Building these iterators as "walk through the list
225168
// and skip anything we've used already" is fine as long as we're
@@ -333,6 +276,142 @@ impl<'a> BuilderExternalNetworking<'a> {
333276
}
334277
}
335278

279+
// Helper to validate that the system hasn't gone off the rails. There should
280+
// never be any external networking resources in the planning input (which is
281+
// derived from the contents of CRDB) that we don't know about from the parent
282+
// blueprint. It's possible a given planning iteration could see such a state
283+
// there have been intermediate changes made by other Nexus instances; e.g.,
284+
//
285+
// 1. Nexus A generates a `PlanningInput` by reading from CRDB
286+
// 2. Nexus B executes on a target blueprint that removes IPs/NICs from
287+
// CRDB
288+
// 3. Nexus B regenerates a new blueprint and prunes the zone(s) associated
289+
// with the IPs/NICs from step 2
290+
// 4. Nexus B makes this new blueprint the target
291+
// 5. Nexus A attempts to run planning with its `PlanningInput` from step 1 but
292+
// the target blueprint from step 4; this will fail the following checks
293+
// because the input contains records that were removed in step 3
294+
//
295+
// We do not need to handle this class of error; it's a transient failure that
296+
// will clear itself up when Nexus A repeats its planning loop from the top and
297+
// generates a new `PlanningInput`.
298+
//
299+
// There may still be database records corresponding to _expunged_ zones, but
300+
// that's okay: it just means we haven't yet realized a blueprint where those
301+
// zones are expunged. And those should should still be in the blueprint (not
302+
// pruned) until their database records are cleaned up.
303+
//
304+
// It's also possible that there may be networking records in the database
305+
// assigned to zones that have been expunged, and our parent blueprint uses
306+
// those same records for new zones. This is also fine and expected, and is a
307+
// similar case to the previous paragraph: a zone with networking resources was
308+
// expunged, the database doesn't realize it yet, but can still move forward and
309+
// make planning decisions that reuse those resources for new zones.
310+
fn ensure_input_records_appear_in_parent_blueprint(
311+
parent_blueprint: &Blueprint,
312+
input: &PlanningInput,
313+
) -> anyhow::Result<()> {
314+
let mut all_macs: HashSet<MacAddr> = HashSet::new();
315+
let mut all_nexus_nic_ips: HashSet<IpAddr> = HashSet::new();
316+
let mut all_boundary_ntp_nic_ips: HashSet<IpAddr> = HashSet::new();
317+
let mut all_external_ips: HashSet<OmicronZoneExternalIp> = HashSet::new();
318+
319+
// Unlike the construction of the external IP allocator and existing IPs
320+
// constructed above in `BuilderExternalNetworking::new()`, we do not
321+
// check for duplicates here: we could very well see reuse of IPs
322+
// between expunged zones or between expunged -> running zones.
323+
for (_, z) in parent_blueprint.all_omicron_zones(BlueprintZoneFilter::All) {
324+
let zone_type = &z.zone_type;
325+
match zone_type {
326+
BlueprintZoneType::BoundaryNtp(ntp) => {
327+
all_boundary_ntp_nic_ips.insert(ntp.nic.ip);
328+
}
329+
BlueprintZoneType::Nexus(nexus) => {
330+
all_nexus_nic_ips.insert(nexus.nic.ip);
331+
}
332+
// TODO: external-dns
333+
_ => (),
334+
}
335+
336+
if let Some((external_ip, nic)) = zone_type.external_networking() {
337+
// As above, ignore localhost (used by the test suite).
338+
if !external_ip.ip().is_loopback() {
339+
all_external_ips.insert(external_ip);
340+
}
341+
all_macs.insert(nic.mac);
342+
}
343+
}
344+
for external_ip_entry in
345+
input.network_resources().omicron_zone_external_ips()
346+
{
347+
// As above, ignore localhost (used by the test suite).
348+
if external_ip_entry.ip.ip().is_loopback() {
349+
continue;
350+
}
351+
if !all_external_ips.contains(&external_ip_entry.ip) {
352+
bail!(
353+
"planning input contains unexpected external IP \
354+
(IP not found in parent blueprint): {external_ip_entry:?}"
355+
);
356+
}
357+
}
358+
for nic_entry in input.network_resources().omicron_zone_nics() {
359+
if !all_macs.contains(&nic_entry.nic.mac) {
360+
bail!(
361+
"planning input contains unexpected NIC \
362+
(MAC not found in parent blueprint): {nic_entry:?}"
363+
);
364+
}
365+
match nic_entry.nic.ip {
366+
IpAddr::V4(ip) if NEXUS_OPTE_IPV4_SUBNET.contains(ip) => {
367+
if !all_nexus_nic_ips.contains(&ip.into()) {
368+
bail!(
369+
"planning input contains unexpected NIC \
370+
(IP not found in parent blueprint): {nic_entry:?}"
371+
);
372+
}
373+
}
374+
IpAddr::V4(ip) if NTP_OPTE_IPV4_SUBNET.contains(ip) => {
375+
if !all_boundary_ntp_nic_ips.contains(&ip.into()) {
376+
bail!(
377+
"planning input contains unexpected NIC \
378+
(IP not found in parent blueprint): {nic_entry:?}"
379+
);
380+
}
381+
}
382+
IpAddr::V4(ip) if DNS_OPTE_IPV4_SUBNET.contains(ip) => {
383+
// TODO check all_dns_nic_ips, once it exists
384+
}
385+
IpAddr::V6(ip) if NEXUS_OPTE_IPV6_SUBNET.contains(ip) => {
386+
if !all_nexus_nic_ips.contains(&ip.into()) {
387+
bail!(
388+
"planning input contains unexpected NIC \
389+
(IP not found in parent blueprint): {nic_entry:?}"
390+
);
391+
}
392+
}
393+
IpAddr::V6(ip) if NTP_OPTE_IPV6_SUBNET.contains(ip) => {
394+
if !all_boundary_ntp_nic_ips.contains(&ip.into()) {
395+
bail!(
396+
"planning input contains unexpected NIC \
397+
(IP not found in parent blueprint): {nic_entry:?}"
398+
);
399+
}
400+
}
401+
IpAddr::V6(ip) if DNS_OPTE_IPV6_SUBNET.contains(ip) => {
402+
// TODO check all_dns_nic_ips, once it exists
403+
}
404+
_ => {
405+
bail!(
406+
"planning input contains unexpected NIC \
407+
(IP not contained in known OPTE subnet): {nic_entry:?}"
408+
)
409+
}
410+
}
411+
}
412+
Ok(())
413+
}
414+
336415
#[derive(Debug, Clone, Copy)]
337416
pub(super) struct ExternalNetworkingChoice {
338417
pub(super) external_ip: IpAddr,
@@ -440,6 +519,7 @@ impl<'a> ExternalIpAllocator<'a> {
440519
// Returns `Ok(true)` if we contain this IP exactly, `Ok(false)` if we do
441520
// not contain this IP, and an error if we contain a matching IP address but
442521
// a mismatched exclusiveness / SNAT-ness.
522+
#[cfg(test)]
443523
fn contains(
444524
&self,
445525
external_ip: &OmicronZoneExternalIp,

nexus/reconfigurator/planning/src/example.rs

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ impl ExampleSystem {
4343
let _ = system.sled(SledBuilder::new().id(*sled_id)).unwrap();
4444
}
4545

46-
let mut input_builder = system
46+
let input_builder = system
4747
.to_planning_input_builder()
4848
.expect("failed to make planning input builder");
4949
let base_input = input_builder.clone().build();
@@ -92,10 +92,6 @@ impl ExampleSystem {
9292
system.to_collection_builder().expect("failed to build collection");
9393
builder.set_rng_seed((test_name, "ExampleSystem collection"));
9494

95-
input_builder
96-
.update_network_resources_from_blueprint(&blueprint)
97-
.expect("failed to add network resources from blueprint");
98-
9995
for (sled_id, zones) in &blueprint.blueprint_zones {
10096
builder
10197
.found_sled_omicron_zones(

0 commit comments

Comments
 (0)