6
6
7
7
use crate :: app:: background:: BackgroundTask ;
8
8
use anyhow:: Context ;
9
+ use base64:: Engine ;
9
10
use camino:: Utf8DirEntry ;
10
11
use camino:: Utf8Path ;
11
12
use camino_tempfile:: Utf8TempDir ;
@@ -15,6 +16,10 @@ use futures::FutureExt;
15
16
use futures:: StreamExt ;
16
17
use futures:: future:: BoxFuture ;
17
18
use futures:: stream:: FuturesUnordered ;
19
+ use gateway_client:: Client as MgsClient ;
20
+ use gateway_client:: types:: SpIdentifier ;
21
+ use internal_dns_resolver:: Resolver ;
22
+ use internal_dns_types:: names:: ServiceName ;
18
23
use nexus_db_model:: SupportBundle ;
19
24
use nexus_db_model:: SupportBundleState ;
20
25
use nexus_db_queries:: authz;
@@ -36,6 +41,7 @@ use omicron_uuid_kinds::SupportBundleUuid;
36
41
use omicron_uuid_kinds:: ZpoolUuid ;
37
42
use serde_json:: json;
38
43
use sha2:: { Digest , Sha256 } ;
44
+ use slog_error_chain:: InlineErrorChain ;
39
45
use std:: future:: Future ;
40
46
use std:: io:: Write ;
41
47
use std:: sync:: Arc ;
@@ -84,17 +90,19 @@ enum DatabaseBundleCleanupResult {
84
90
/// The background task responsible for cleaning and collecting support bundles
85
91
pub struct SupportBundleCollector {
86
92
datastore : Arc < DataStore > ,
93
+ resolver : Resolver ,
87
94
disable : bool ,
88
95
nexus_id : OmicronZoneUuid ,
89
96
}
90
97
91
98
impl SupportBundleCollector {
92
99
pub fn new (
93
100
datastore : Arc < DataStore > ,
101
+ resolver : Resolver ,
94
102
disable : bool ,
95
103
nexus_id : OmicronZoneUuid ,
96
104
) -> Self {
97
- SupportBundleCollector { datastore, disable, nexus_id }
105
+ SupportBundleCollector { datastore, resolver , disable, nexus_id }
98
106
}
99
107
100
108
// Tells a sled agent to delete a support bundle
@@ -376,6 +384,7 @@ impl SupportBundleCollector {
376
384
377
385
let collection = Arc :: new ( BundleCollection {
378
386
datastore : self . datastore . clone ( ) ,
387
+ resolver : self . resolver . clone ( ) ,
379
388
log : opctx. log . new ( slog:: o!( "bundle" => bundle. id. to_string( ) ) ) ,
380
389
opctx : opctx. child ( std:: collections:: BTreeMap :: new ( ) ) ,
381
390
request : request. clone ( ) ,
@@ -419,6 +428,7 @@ impl SupportBundleCollector {
419
428
// Wraps up all arguments to perform a single support bundle collection
420
429
struct BundleCollection {
421
430
datastore : Arc < DataStore > ,
431
+ resolver : Resolver ,
422
432
log : slog:: Logger ,
423
433
opctx : OpContext ,
424
434
request : BundleRequest ,
@@ -558,6 +568,13 @@ impl BundleCollection {
558
568
)
559
569
. await ?;
560
570
571
+ let sp_dumps_dir = dir. path ( ) . join ( "sp_task_dumps" ) ;
572
+ tokio:: fs:: create_dir_all ( & sp_dumps_dir) . await . with_context ( || {
573
+ format ! ( "failed to create SP task dump directory {sp_dumps_dir}" )
574
+ } ) ?;
575
+ let sp_dumps_fut =
576
+ save_all_sp_dumps ( log, & self . resolver , & sp_dumps_dir) ;
577
+
561
578
if let Ok ( all_sleds) = self
562
579
. datastore
563
580
. sled_list_all_batched ( & self . opctx , SledFilter :: InService )
@@ -605,6 +622,15 @@ impl BundleCollection {
605
622
}
606
623
}
607
624
625
+ let sp_dumps_dir = dir. path ( ) . join ( "sp_task_dumps" ) ;
626
+ tokio:: fs:: create_dir_all ( & sp_dumps_dir) . await . with_context ( || {
627
+ format ! ( "failed to create SP task dump directory {sp_dumps_dir}" )
628
+ } ) ?;
629
+
630
+ if let Err ( e) = sp_dumps_fut. await {
631
+ error ! ( log, "failed to capture SP task dumps" ; "error" => InlineErrorChain :: new( e. as_ref( ) ) ) ;
632
+ } ;
633
+
608
634
Ok ( report)
609
635
}
610
636
@@ -981,6 +1007,85 @@ where
981
1007
Ok ( ( ) )
982
1008
}
983
1009
1010
+ /// Collect task dumps from all SPs via MGS and save them to a directory.
1011
+ async fn save_all_sp_dumps (
1012
+ log : & slog:: Logger ,
1013
+ resolver : & Resolver ,
1014
+ sp_dumps_dir : & Utf8Path ,
1015
+ ) -> anyhow:: Result < ( ) > {
1016
+ let mgs_client = resolver
1017
+ . lookup_socket_v6 ( ServiceName :: ManagementGatewayService )
1018
+ . await
1019
+ . map ( |sockaddr| {
1020
+ let url = format ! ( "http://{}" , sockaddr) ;
1021
+ gateway_client:: Client :: new ( & url, log. clone ( ) )
1022
+ } )
1023
+ . context ( "failed to resolve address of MGS" ) ?;
1024
+
1025
+ let all_sps = mgs_client
1026
+ . sp_all_ids ( )
1027
+ . await
1028
+ . context ( "failed to get list of SPs from MGS" ) ?
1029
+ . into_inner ( ) ;
1030
+
1031
+ let mut futures = futures:: stream:: iter ( all_sps. into_iter ( ) )
1032
+ . map ( |sp| {
1033
+ let mgs_client = mgs_client. clone ( ) ;
1034
+
1035
+ async move {
1036
+ save_sp_dumps ( mgs_client, sp, & sp_dumps_dir)
1037
+ . await
1038
+ . with_context ( || format ! ( "SP {} {}" , sp. type_, sp. slot) )
1039
+ }
1040
+ } )
1041
+ . buffer_unordered ( 10 ) ;
1042
+
1043
+ while let Some ( result) = futures. next ( ) . await {
1044
+ if let Err ( e) = result {
1045
+ error ! (
1046
+ log,
1047
+ "failed to capture task dumps" ;
1048
+ "error" => InlineErrorChain :: new( e. as_ref( ) )
1049
+ ) ;
1050
+ }
1051
+ }
1052
+
1053
+ Ok ( ( ) )
1054
+ }
1055
+
1056
+ /// Fetch and save task dumps from a single SP.
1057
+ async fn save_sp_dumps (
1058
+ mgs_client : MgsClient ,
1059
+ sp : SpIdentifier ,
1060
+ sp_dumps_dir : & Utf8Path ,
1061
+ ) -> anyhow:: Result < ( ) > {
1062
+ let dump_count = mgs_client
1063
+ . sp_task_dump_count ( sp. type_ , sp. slot )
1064
+ . await
1065
+ . context ( "failed to get task dump count from SP" ) ?
1066
+ . into_inner ( ) ;
1067
+
1068
+ let output_dir = sp_dumps_dir. join ( format ! ( "{}_{}" , sp. type_, sp. slot) ) ;
1069
+ tokio:: fs:: create_dir_all ( & output_dir) . await ?;
1070
+
1071
+ for i in 0 ..dump_count {
1072
+ let task_dump = mgs_client
1073
+ . sp_task_dump_get ( sp. type_ , sp. slot , i)
1074
+ . await
1075
+ . with_context ( || format ! ( "failed to get task dump {i} from SP" ) ) ?
1076
+ . into_inner ( ) ;
1077
+
1078
+ let zip_bytes = base64:: engine:: general_purpose:: STANDARD
1079
+ . decode ( task_dump. base64_zip )
1080
+ . context ( "failed to decode base64-encoded SP task dump zip" ) ?;
1081
+
1082
+ tokio:: fs:: write ( output_dir. join ( format ! ( "dump-{i}.zip" ) ) , zip_bytes)
1083
+ . await
1084
+ . context ( "failed to write SP task dump zip to disk" ) ?;
1085
+ }
1086
+ Ok ( ( ) )
1087
+ }
1088
+
984
1089
#[ cfg( test) ]
985
1090
mod test {
986
1091
use super :: * ;
@@ -1037,12 +1142,17 @@ mod test {
1037
1142
async fn test_cleanup_noop ( cptestctx : & ControlPlaneTestContext ) {
1038
1143
let nexus = & cptestctx. server . server_context ( ) . nexus ;
1039
1144
let datastore = nexus. datastore ( ) ;
1145
+ let resolver = nexus. resolver ( ) ;
1040
1146
let opctx = OpContext :: for_tests (
1041
1147
cptestctx. logctx . log . clone ( ) ,
1042
1148
datastore. clone ( ) ,
1043
1149
) ;
1044
- let collector =
1045
- SupportBundleCollector :: new ( datastore. clone ( ) , false , nexus. id ( ) ) ;
1150
+ let collector = SupportBundleCollector :: new (
1151
+ datastore. clone ( ) ,
1152
+ resolver. clone ( ) ,
1153
+ false ,
1154
+ nexus. id ( ) ,
1155
+ ) ;
1046
1156
1047
1157
let report = collector
1048
1158
. cleanup_destroyed_bundles ( & opctx)
@@ -1058,12 +1168,17 @@ mod test {
1058
1168
async fn test_collect_noop ( cptestctx : & ControlPlaneTestContext ) {
1059
1169
let nexus = & cptestctx. server . server_context ( ) . nexus ;
1060
1170
let datastore = nexus. datastore ( ) ;
1171
+ let resolver = nexus. resolver ( ) ;
1061
1172
let opctx = OpContext :: for_tests (
1062
1173
cptestctx. logctx . log . clone ( ) ,
1063
1174
datastore. clone ( ) ,
1064
1175
) ;
1065
- let collector =
1066
- SupportBundleCollector :: new ( datastore. clone ( ) , false , nexus. id ( ) ) ;
1176
+ let collector = SupportBundleCollector :: new (
1177
+ datastore. clone ( ) ,
1178
+ resolver. clone ( ) ,
1179
+ false ,
1180
+ nexus. id ( ) ,
1181
+ ) ;
1067
1182
1068
1183
let request = BundleRequest :: default ( ) ;
1069
1184
let report = collector
@@ -1224,6 +1339,7 @@ mod test {
1224
1339
async fn test_collect_one ( cptestctx : & ControlPlaneTestContext ) {
1225
1340
let nexus = & cptestctx. server . server_context ( ) . nexus ;
1226
1341
let datastore = nexus. datastore ( ) ;
1342
+ let resolver = nexus. resolver ( ) ;
1227
1343
let opctx = OpContext :: for_tests (
1228
1344
cptestctx. logctx . log . clone ( ) ,
1229
1345
datastore. clone ( ) ,
@@ -1242,8 +1358,12 @@ mod test {
1242
1358
. expect ( "Couldn't allocate a support bundle" ) ;
1243
1359
assert_eq ! ( bundle. state, SupportBundleState :: Collecting ) ;
1244
1360
1245
- let collector =
1246
- SupportBundleCollector :: new ( datastore. clone ( ) , false , nexus. id ( ) ) ;
1361
+ let collector = SupportBundleCollector :: new (
1362
+ datastore. clone ( ) ,
1363
+ resolver. clone ( ) ,
1364
+ false ,
1365
+ nexus. id ( ) ,
1366
+ ) ;
1247
1367
1248
1368
// The bundle collection should complete successfully.
1249
1369
let request = BundleRequest {
@@ -1279,6 +1399,7 @@ mod test {
1279
1399
async fn test_collect_many ( cptestctx : & ControlPlaneTestContext ) {
1280
1400
let nexus = & cptestctx. server . server_context ( ) . nexus ;
1281
1401
let datastore = nexus. datastore ( ) ;
1402
+ let resolver = nexus. resolver ( ) ;
1282
1403
let opctx = OpContext :: for_tests (
1283
1404
cptestctx. logctx . log . clone ( ) ,
1284
1405
datastore. clone ( ) ,
@@ -1299,8 +1420,12 @@ mod test {
1299
1420
. await
1300
1421
. expect ( "Couldn't allocate a second support bundle" ) ;
1301
1422
1302
- let collector =
1303
- SupportBundleCollector :: new ( datastore. clone ( ) , false , nexus. id ( ) ) ;
1423
+ let collector = SupportBundleCollector :: new (
1424
+ datastore. clone ( ) ,
1425
+ resolver. clone ( ) ,
1426
+ false ,
1427
+ nexus. id ( ) ,
1428
+ ) ;
1304
1429
1305
1430
// Each time we call "collect_bundle", we collect a SINGLE bundle.
1306
1431
let request = BundleRequest { skip_sled_info : true } ;
@@ -1355,6 +1480,7 @@ mod test {
1355
1480
) {
1356
1481
let nexus = & cptestctx. server . server_context ( ) . nexus ;
1357
1482
let datastore = nexus. datastore ( ) ;
1483
+ let resolver = nexus. resolver ( ) ;
1358
1484
let opctx = OpContext :: for_tests (
1359
1485
cptestctx. logctx . log . clone ( ) ,
1360
1486
datastore. clone ( ) ,
@@ -1384,8 +1510,12 @@ mod test {
1384
1510
. await
1385
1511
. unwrap ( ) ;
1386
1512
1387
- let collector =
1388
- SupportBundleCollector :: new ( datastore. clone ( ) , false , nexus. id ( ) ) ;
1513
+ let collector = SupportBundleCollector :: new (
1514
+ datastore. clone ( ) ,
1515
+ resolver. clone ( ) ,
1516
+ false ,
1517
+ nexus. id ( ) ,
1518
+ ) ;
1389
1519
1390
1520
let report = collector
1391
1521
. cleanup_destroyed_bundles ( & opctx)
@@ -1410,6 +1540,7 @@ mod test {
1410
1540
) {
1411
1541
let nexus = & cptestctx. server . server_context ( ) . nexus ;
1412
1542
let datastore = nexus. datastore ( ) ;
1543
+ let resolver = nexus. resolver ( ) ;
1413
1544
let opctx = OpContext :: for_tests (
1414
1545
cptestctx. logctx . log . clone ( ) ,
1415
1546
datastore. clone ( ) ,
@@ -1427,8 +1558,12 @@ mod test {
1427
1558
. expect ( "Couldn't allocate a support bundle" ) ;
1428
1559
assert_eq ! ( bundle. state, SupportBundleState :: Collecting ) ;
1429
1560
1430
- let collector =
1431
- SupportBundleCollector :: new ( datastore. clone ( ) , false , nexus. id ( ) ) ;
1561
+ let collector = SupportBundleCollector :: new (
1562
+ datastore. clone ( ) ,
1563
+ resolver. clone ( ) ,
1564
+ false ,
1565
+ nexus. id ( ) ,
1566
+ ) ;
1432
1567
let request = BundleRequest { skip_sled_info : true } ;
1433
1568
let report = collector
1434
1569
. collect_bundle ( & opctx, & request)
@@ -1475,6 +1610,7 @@ mod test {
1475
1610
) {
1476
1611
let nexus = & cptestctx. server . server_context ( ) . nexus ;
1477
1612
let datastore = nexus. datastore ( ) ;
1613
+ let resolver = nexus. resolver ( ) ;
1478
1614
let opctx = OpContext :: for_tests (
1479
1615
cptestctx. logctx . log . clone ( ) ,
1480
1616
datastore. clone ( ) ,
@@ -1506,8 +1642,12 @@ mod test {
1506
1642
. await
1507
1643
. unwrap ( ) ;
1508
1644
1509
- let collector =
1510
- SupportBundleCollector :: new ( datastore. clone ( ) , false , nexus. id ( ) ) ;
1645
+ let collector = SupportBundleCollector :: new (
1646
+ datastore. clone ( ) ,
1647
+ resolver. clone ( ) ,
1648
+ false ,
1649
+ nexus. id ( ) ,
1650
+ ) ;
1511
1651
1512
1652
let report = collector
1513
1653
. cleanup_destroyed_bundles ( & opctx)
@@ -1535,6 +1675,7 @@ mod test {
1535
1675
) {
1536
1676
let nexus = & cptestctx. server . server_context ( ) . nexus ;
1537
1677
let datastore = nexus. datastore ( ) ;
1678
+ let resolver = nexus. resolver ( ) ;
1538
1679
let opctx = OpContext :: for_tests (
1539
1680
cptestctx. logctx . log . clone ( ) ,
1540
1681
datastore. clone ( ) ,
@@ -1552,8 +1693,12 @@ mod test {
1552
1693
. expect ( "Couldn't allocate a support bundle" ) ;
1553
1694
assert_eq ! ( bundle. state, SupportBundleState :: Collecting ) ;
1554
1695
1555
- let collector =
1556
- SupportBundleCollector :: new ( datastore. clone ( ) , false , nexus. id ( ) ) ;
1696
+ let collector = SupportBundleCollector :: new (
1697
+ datastore. clone ( ) ,
1698
+ resolver. clone ( ) ,
1699
+ false ,
1700
+ nexus. id ( ) ,
1701
+ ) ;
1557
1702
let request = BundleRequest { skip_sled_info : true } ;
1558
1703
let report = collector
1559
1704
. collect_bundle ( & opctx, & request)
@@ -1609,6 +1754,7 @@ mod test {
1609
1754
) {
1610
1755
let nexus = & cptestctx. server . server_context ( ) . nexus ;
1611
1756
let datastore = nexus. datastore ( ) ;
1757
+ let resolver = nexus. resolver ( ) ;
1612
1758
let opctx = OpContext :: for_tests (
1613
1759
cptestctx. logctx . log . clone ( ) ,
1614
1760
datastore. clone ( ) ,
@@ -1626,8 +1772,12 @@ mod test {
1626
1772
. expect ( "Couldn't allocate a support bundle" ) ;
1627
1773
assert_eq ! ( bundle. state, SupportBundleState :: Collecting ) ;
1628
1774
1629
- let collector =
1630
- SupportBundleCollector :: new ( datastore. clone ( ) , false , nexus. id ( ) ) ;
1775
+ let collector = SupportBundleCollector :: new (
1776
+ datastore. clone ( ) ,
1777
+ resolver. clone ( ) ,
1778
+ false ,
1779
+ nexus. id ( ) ,
1780
+ ) ;
1631
1781
let request = BundleRequest { skip_sled_info : true } ;
1632
1782
let report = collector
1633
1783
. collect_bundle ( & opctx, & request)
0 commit comments