Skip to content

Commit dbb6669

Browse files
committed
Capture SP task dumps in support bundles
Update the support bundle collector to capture task dumps from the SPs.
1 parent 69a8d6b commit dbb6669

File tree

2 files changed

+170
-19
lines changed

2 files changed

+170
-19
lines changed

nexus/src/app/background/init.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -505,6 +505,7 @@ impl BackgroundTasksInitializer {
505505
task_impl: Box::new(
506506
support_bundle_collector::SupportBundleCollector::new(
507507
datastore.clone(),
508+
resolver.clone(),
508509
config.support_bundle_collector.disable,
509510
nexus_id,
510511
),

nexus/src/app/background/tasks/support_bundle_collector.rs

Lines changed: 169 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
77
use crate::app::background::BackgroundTask;
88
use anyhow::Context;
9+
use base64::Engine;
910
use camino::Utf8DirEntry;
1011
use camino::Utf8Path;
1112
use camino_tempfile::Utf8TempDir;
@@ -15,6 +16,10 @@ use futures::FutureExt;
1516
use futures::StreamExt;
1617
use futures::future::BoxFuture;
1718
use futures::stream::FuturesUnordered;
19+
use gateway_client::Client as MgsClient;
20+
use gateway_client::types::SpIdentifier;
21+
use internal_dns_resolver::Resolver;
22+
use internal_dns_types::names::ServiceName;
1823
use nexus_db_model::SupportBundle;
1924
use nexus_db_model::SupportBundleState;
2025
use nexus_db_queries::authz;
@@ -36,6 +41,7 @@ use omicron_uuid_kinds::SupportBundleUuid;
3641
use omicron_uuid_kinds::ZpoolUuid;
3742
use serde_json::json;
3843
use sha2::{Digest, Sha256};
44+
use slog_error_chain::InlineErrorChain;
3945
use std::future::Future;
4046
use std::io::Write;
4147
use std::sync::Arc;
@@ -84,17 +90,19 @@ enum DatabaseBundleCleanupResult {
8490
/// The background task responsible for cleaning and collecting support bundles
8591
pub struct SupportBundleCollector {
8692
datastore: Arc<DataStore>,
93+
resolver: Resolver,
8794
disable: bool,
8895
nexus_id: OmicronZoneUuid,
8996
}
9097

9198
impl SupportBundleCollector {
9299
pub fn new(
93100
datastore: Arc<DataStore>,
101+
resolver: Resolver,
94102
disable: bool,
95103
nexus_id: OmicronZoneUuid,
96104
) -> Self {
97-
SupportBundleCollector { datastore, disable, nexus_id }
105+
SupportBundleCollector { datastore, resolver, disable, nexus_id }
98106
}
99107

100108
// Tells a sled agent to delete a support bundle
@@ -376,6 +384,7 @@ impl SupportBundleCollector {
376384

377385
let collection = Arc::new(BundleCollection {
378386
datastore: self.datastore.clone(),
387+
resolver: self.resolver.clone(),
379388
log: opctx.log.new(slog::o!("bundle" => bundle.id.to_string())),
380389
opctx: opctx.child(std::collections::BTreeMap::new()),
381390
request: request.clone(),
@@ -419,6 +428,7 @@ impl SupportBundleCollector {
419428
// Wraps up all arguments to perform a single support bundle collection
420429
struct BundleCollection {
421430
datastore: Arc<DataStore>,
431+
resolver: Resolver,
422432
log: slog::Logger,
423433
opctx: OpContext,
424434
request: BundleRequest,
@@ -558,6 +568,13 @@ impl BundleCollection {
558568
)
559569
.await?;
560570

571+
let sp_dumps_dir = dir.path().join("sp_task_dumps");
572+
tokio::fs::create_dir_all(&sp_dumps_dir).await.with_context(|| {
573+
format!("failed to create SP task dump directory {sp_dumps_dir}")
574+
})?;
575+
let sp_dumps_fut =
576+
save_all_sp_dumps(log, &self.resolver, &sp_dumps_dir);
577+
561578
if let Ok(all_sleds) = self
562579
.datastore
563580
.sled_list_all_batched(&self.opctx, SledFilter::InService)
@@ -605,6 +622,15 @@ impl BundleCollection {
605622
}
606623
}
607624

625+
let sp_dumps_dir = dir.path().join("sp_task_dumps");
626+
tokio::fs::create_dir_all(&sp_dumps_dir).await.with_context(|| {
627+
format!("failed to create SP task dump directory {sp_dumps_dir}")
628+
})?;
629+
630+
if let Err(e) = sp_dumps_fut.await {
631+
error!(log, "failed to capture SP task dumps"; "error" => InlineErrorChain::new(e.as_ref()));
632+
};
633+
608634
Ok(report)
609635
}
610636

@@ -981,6 +1007,85 @@ where
9811007
Ok(())
9821008
}
9831009

1010+
/// Collect task dumps from all SPs via MGS and save them to a directory.
1011+
async fn save_all_sp_dumps(
1012+
log: &slog::Logger,
1013+
resolver: &Resolver,
1014+
sp_dumps_dir: &Utf8Path,
1015+
) -> anyhow::Result<()> {
1016+
let mgs_client = resolver
1017+
.lookup_socket_v6(ServiceName::ManagementGatewayService)
1018+
.await
1019+
.map(|sockaddr| {
1020+
let url = format!("http://{}", sockaddr);
1021+
gateway_client::Client::new(&url, log.clone())
1022+
})
1023+
.context("failed to resolve address of MGS")?;
1024+
1025+
let all_sps = mgs_client
1026+
.sp_all_ids()
1027+
.await
1028+
.context("failed to get list of SPs from MGS")?
1029+
.into_inner();
1030+
1031+
let mut futures = futures::stream::iter(all_sps.into_iter())
1032+
.map(|sp| {
1033+
let mgs_client = mgs_client.clone();
1034+
1035+
async move {
1036+
save_sp_dumps(mgs_client, sp, &sp_dumps_dir)
1037+
.await
1038+
.with_context(|| format!("SP {} {}", sp.type_, sp.slot))
1039+
}
1040+
})
1041+
.buffer_unordered(10);
1042+
1043+
while let Some(result) = futures.next().await {
1044+
if let Err(e) = result {
1045+
error!(
1046+
log,
1047+
"failed to capture task dumps";
1048+
"error" => InlineErrorChain::new(e.as_ref())
1049+
);
1050+
}
1051+
}
1052+
1053+
Ok(())
1054+
}
1055+
1056+
/// Fetch and save task dumps from a single SP.
1057+
async fn save_sp_dumps(
1058+
mgs_client: MgsClient,
1059+
sp: SpIdentifier,
1060+
sp_dumps_dir: &Utf8Path,
1061+
) -> anyhow::Result<()> {
1062+
let dump_count = mgs_client
1063+
.sp_task_dump_count(sp.type_, sp.slot)
1064+
.await
1065+
.context("failed to get task dump count from SP")?
1066+
.into_inner();
1067+
1068+
let output_dir = sp_dumps_dir.join(format!("{}_{}", sp.type_, sp.slot));
1069+
tokio::fs::create_dir_all(&output_dir).await?;
1070+
1071+
for i in 0..dump_count {
1072+
let task_dump = mgs_client
1073+
.sp_task_dump_get(sp.type_, sp.slot, i)
1074+
.await
1075+
.with_context(|| format!("failed to get task dump {i} from SP"))?
1076+
.into_inner();
1077+
1078+
let zip_bytes = base64::engine::general_purpose::STANDARD
1079+
.decode(task_dump.base64_zip)
1080+
.context("failed to decode base64-encoded SP task dump zip")?;
1081+
1082+
tokio::fs::write(output_dir.join(format!("dump-{i}.zip")), zip_bytes)
1083+
.await
1084+
.context("failed to write SP task dump zip to disk")?;
1085+
}
1086+
Ok(())
1087+
}
1088+
9841089
#[cfg(test)]
9851090
mod test {
9861091
use super::*;
@@ -1037,12 +1142,17 @@ mod test {
10371142
async fn test_cleanup_noop(cptestctx: &ControlPlaneTestContext) {
10381143
let nexus = &cptestctx.server.server_context().nexus;
10391144
let datastore = nexus.datastore();
1145+
let resolver = nexus.resolver();
10401146
let opctx = OpContext::for_tests(
10411147
cptestctx.logctx.log.clone(),
10421148
datastore.clone(),
10431149
);
1044-
let collector =
1045-
SupportBundleCollector::new(datastore.clone(), false, nexus.id());
1150+
let collector = SupportBundleCollector::new(
1151+
datastore.clone(),
1152+
resolver.clone(),
1153+
false,
1154+
nexus.id(),
1155+
);
10461156

10471157
let report = collector
10481158
.cleanup_destroyed_bundles(&opctx)
@@ -1058,12 +1168,17 @@ mod test {
10581168
async fn test_collect_noop(cptestctx: &ControlPlaneTestContext) {
10591169
let nexus = &cptestctx.server.server_context().nexus;
10601170
let datastore = nexus.datastore();
1171+
let resolver = nexus.resolver();
10611172
let opctx = OpContext::for_tests(
10621173
cptestctx.logctx.log.clone(),
10631174
datastore.clone(),
10641175
);
1065-
let collector =
1066-
SupportBundleCollector::new(datastore.clone(), false, nexus.id());
1176+
let collector = SupportBundleCollector::new(
1177+
datastore.clone(),
1178+
resolver.clone(),
1179+
false,
1180+
nexus.id(),
1181+
);
10671182

10681183
let request = BundleRequest::default();
10691184
let report = collector
@@ -1224,6 +1339,7 @@ mod test {
12241339
async fn test_collect_one(cptestctx: &ControlPlaneTestContext) {
12251340
let nexus = &cptestctx.server.server_context().nexus;
12261341
let datastore = nexus.datastore();
1342+
let resolver = nexus.resolver();
12271343
let opctx = OpContext::for_tests(
12281344
cptestctx.logctx.log.clone(),
12291345
datastore.clone(),
@@ -1242,8 +1358,12 @@ mod test {
12421358
.expect("Couldn't allocate a support bundle");
12431359
assert_eq!(bundle.state, SupportBundleState::Collecting);
12441360

1245-
let collector =
1246-
SupportBundleCollector::new(datastore.clone(), false, nexus.id());
1361+
let collector = SupportBundleCollector::new(
1362+
datastore.clone(),
1363+
resolver.clone(),
1364+
false,
1365+
nexus.id(),
1366+
);
12471367

12481368
// The bundle collection should complete successfully.
12491369
let request = BundleRequest {
@@ -1279,6 +1399,7 @@ mod test {
12791399
async fn test_collect_many(cptestctx: &ControlPlaneTestContext) {
12801400
let nexus = &cptestctx.server.server_context().nexus;
12811401
let datastore = nexus.datastore();
1402+
let resolver = nexus.resolver();
12821403
let opctx = OpContext::for_tests(
12831404
cptestctx.logctx.log.clone(),
12841405
datastore.clone(),
@@ -1299,8 +1420,12 @@ mod test {
12991420
.await
13001421
.expect("Couldn't allocate a second support bundle");
13011422

1302-
let collector =
1303-
SupportBundleCollector::new(datastore.clone(), false, nexus.id());
1423+
let collector = SupportBundleCollector::new(
1424+
datastore.clone(),
1425+
resolver.clone(),
1426+
false,
1427+
nexus.id(),
1428+
);
13041429

13051430
// Each time we call "collect_bundle", we collect a SINGLE bundle.
13061431
let request = BundleRequest { skip_sled_info: true };
@@ -1355,6 +1480,7 @@ mod test {
13551480
) {
13561481
let nexus = &cptestctx.server.server_context().nexus;
13571482
let datastore = nexus.datastore();
1483+
let resolver = nexus.resolver();
13581484
let opctx = OpContext::for_tests(
13591485
cptestctx.logctx.log.clone(),
13601486
datastore.clone(),
@@ -1384,8 +1510,12 @@ mod test {
13841510
.await
13851511
.unwrap();
13861512

1387-
let collector =
1388-
SupportBundleCollector::new(datastore.clone(), false, nexus.id());
1513+
let collector = SupportBundleCollector::new(
1514+
datastore.clone(),
1515+
resolver.clone(),
1516+
false,
1517+
nexus.id(),
1518+
);
13891519

13901520
let report = collector
13911521
.cleanup_destroyed_bundles(&opctx)
@@ -1410,6 +1540,7 @@ mod test {
14101540
) {
14111541
let nexus = &cptestctx.server.server_context().nexus;
14121542
let datastore = nexus.datastore();
1543+
let resolver = nexus.resolver();
14131544
let opctx = OpContext::for_tests(
14141545
cptestctx.logctx.log.clone(),
14151546
datastore.clone(),
@@ -1427,8 +1558,12 @@ mod test {
14271558
.expect("Couldn't allocate a support bundle");
14281559
assert_eq!(bundle.state, SupportBundleState::Collecting);
14291560

1430-
let collector =
1431-
SupportBundleCollector::new(datastore.clone(), false, nexus.id());
1561+
let collector = SupportBundleCollector::new(
1562+
datastore.clone(),
1563+
resolver.clone(),
1564+
false,
1565+
nexus.id(),
1566+
);
14321567
let request = BundleRequest { skip_sled_info: true };
14331568
let report = collector
14341569
.collect_bundle(&opctx, &request)
@@ -1475,6 +1610,7 @@ mod test {
14751610
) {
14761611
let nexus = &cptestctx.server.server_context().nexus;
14771612
let datastore = nexus.datastore();
1613+
let resolver = nexus.resolver();
14781614
let opctx = OpContext::for_tests(
14791615
cptestctx.logctx.log.clone(),
14801616
datastore.clone(),
@@ -1506,8 +1642,12 @@ mod test {
15061642
.await
15071643
.unwrap();
15081644

1509-
let collector =
1510-
SupportBundleCollector::new(datastore.clone(), false, nexus.id());
1645+
let collector = SupportBundleCollector::new(
1646+
datastore.clone(),
1647+
resolver.clone(),
1648+
false,
1649+
nexus.id(),
1650+
);
15111651

15121652
let report = collector
15131653
.cleanup_destroyed_bundles(&opctx)
@@ -1535,6 +1675,7 @@ mod test {
15351675
) {
15361676
let nexus = &cptestctx.server.server_context().nexus;
15371677
let datastore = nexus.datastore();
1678+
let resolver = nexus.resolver();
15381679
let opctx = OpContext::for_tests(
15391680
cptestctx.logctx.log.clone(),
15401681
datastore.clone(),
@@ -1552,8 +1693,12 @@ mod test {
15521693
.expect("Couldn't allocate a support bundle");
15531694
assert_eq!(bundle.state, SupportBundleState::Collecting);
15541695

1555-
let collector =
1556-
SupportBundleCollector::new(datastore.clone(), false, nexus.id());
1696+
let collector = SupportBundleCollector::new(
1697+
datastore.clone(),
1698+
resolver.clone(),
1699+
false,
1700+
nexus.id(),
1701+
);
15571702
let request = BundleRequest { skip_sled_info: true };
15581703
let report = collector
15591704
.collect_bundle(&opctx, &request)
@@ -1609,6 +1754,7 @@ mod test {
16091754
) {
16101755
let nexus = &cptestctx.server.server_context().nexus;
16111756
let datastore = nexus.datastore();
1757+
let resolver = nexus.resolver();
16121758
let opctx = OpContext::for_tests(
16131759
cptestctx.logctx.log.clone(),
16141760
datastore.clone(),
@@ -1626,8 +1772,12 @@ mod test {
16261772
.expect("Couldn't allocate a support bundle");
16271773
assert_eq!(bundle.state, SupportBundleState::Collecting);
16281774

1629-
let collector =
1630-
SupportBundleCollector::new(datastore.clone(), false, nexus.id());
1775+
let collector = SupportBundleCollector::new(
1776+
datastore.clone(),
1777+
resolver.clone(),
1778+
false,
1779+
nexus.id(),
1780+
);
16311781
let request = BundleRequest { skip_sled_info: true };
16321782
let report = collector
16331783
.collect_bundle(&opctx, &request)

0 commit comments

Comments
 (0)