Skip to content

Commit bc2c44b

Browse files
committed
Add an RPC endpoint to report scout firmware upgrade status
1 parent 9e96e8e commit bc2c44b

File tree

6 files changed

+115
-3
lines changed

6 files changed

+115
-3
lines changed

crates/api-model/src/machine/mod.rs

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1576,6 +1576,22 @@ pub enum HostReprovisionState {
15761576
reason: Option<String>,
15771577
},
15781578
WaitingForRackFirmwareUpgrade,
1579+
WaitingForScoutUpgrade {
1580+
component_type: FirmwareComponentType,
1581+
target_version: String,
1582+
started_at: DateTime<Utc>,
1583+
#[serde(default)]
1584+
result: Option<ScoutUpgradeResult>,
1585+
},
1586+
}
1587+
1588+
/// Outcome of a scout-based firmware upgrade.
///
/// Stored in the optional `result` field of
/// `HostReprovisionState::WaitingForScoutUpgrade`. The API handler
/// `report_scout_firmware_upgrade_status` builds it from the fields of a
/// `ScoutFirmwareUpgradeStatusRequest`.
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
1589+
pub struct ScoutUpgradeResult {
1590+
/// Copied from the `success` flag of the status report.
pub success: bool,
1591+
/// Copied from the `exit_code` of the status report.
/// NOTE(review): presumably the exit code of the upgrade command run by scout — confirm.
pub exit_code: i32,
1592+
/// Reported standard output; the API handler passes it through `truncate`
/// with a limit of 1500 before storing it.
pub stdout: String,
1593+
/// Reported standard error; passed through `truncate` with the same limit.
pub stderr: String,
1594+
/// Reported error text; passed through `truncate` with the same limit.
/// NOTE(review): looks like an empty string means "no error" — confirm with the scout side.
pub error: String,
15791595
}
15801596

15811597
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]

crates/api/src/api.rs

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1145,6 +1145,14 @@ impl Forge for Api {
11451145
.await
11461146
}
11471147

1148+
/// `Forge` service method for the `ReportScoutFirmwareUpgradeStatus` RPC.
///
/// Does no work of its own: it forwards `self` and the request to
/// `crate::handlers::host_reprovisioning::report_scout_firmware_upgrade_status`
/// and returns that handler's result unchanged.
async fn report_scout_firmware_upgrade_status(
1149+
&self,
1150+
request: Request<rpc::ScoutFirmwareUpgradeStatusRequest>,
1151+
) -> Result<Response<()>, Status> {
1152+
crate::handlers::host_reprovisioning::report_scout_firmware_upgrade_status(self, request)
1153+
.await
1154+
}
1155+
11481156
async fn list_hosts_waiting_for_reprovisioning(
11491157
&self,
11501158
request: Request<rpc::HostReprovisioningListRequest>,
@@ -3253,7 +3261,7 @@ pub(crate) fn log_tenant_organization_id(organization_id: &str) {
32533261
tracing::Span::current().record("tenant.organization_id", organization_id);
32543262
}
32553263

3256-
fn truncate(mut s: String, len: usize) -> String {
3264+
pub(crate) fn truncate(mut s: String, len: usize) -> String {
32573265
if s.len() < len || len < 3 {
32583266
return s;
32593267
}

crates/api/src/auth/internal_rbac_rules.rs

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -168,6 +168,7 @@ impl InternalRBACRules {
168168
x.perm("DiscoveryCompleted", vec![Machineatron, Scout]);
169169
x.perm("CleanupMachineCompleted", vec![Machineatron, Scout]);
170170
x.perm("ReportForgeScoutError", vec![Scout]);
171+
x.perm("ReportScoutFirmwareUpgradeStatus", vec![Scout]);
171172
x.perm("DiscoverDhcp", vec![Dhcp, Machineatron]);
172173
x.perm("ExpireDhcpLease", vec![Dhcp, Machineatron]);
173174
x.perm("AssignStaticAddress", vec![ForgeAdminCLI]);

crates/api/src/handlers/host_reprovisioning.rs

Lines changed: 73 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -17,11 +17,13 @@
1717
use ::rpc::forge as rpc;
1818
use carbide_uuid::machine::MachineId;
1919
use itertools::Itertools;
20-
use model::machine::LoadSnapshotOptions;
20+
use model::machine::{
21+
HostReprovisionState, LoadSnapshotOptions, ManagedHostState, ScoutUpgradeResult,
22+
};
2123
use tonic::{Request, Response, Status};
2224

2325
use crate::CarbideError;
24-
use crate::api::{Api, log_request_data};
26+
use crate::api::{Api, log_request_data, truncate};
2527
use crate::handlers::utils::convert_and_log_machine_id;
2628

2729
pub(crate) async fn reset_host_reprovisioning(
@@ -144,3 +146,72 @@ pub async fn mark_manual_firmware_upgrade_complete(
144146

145147
Ok(Response::new(()))
146148
}
149+
150+
/// Records the result of a scout-based firmware upgrade on a machine.
///
/// The machine named by `req.machine_id` must currently be in
/// `ManagedHostState::HostReprovision` with the reprovision state
/// `HostReprovisionState::WaitingForScoutUpgrade`. The handler re-writes that
/// same state with its `result` field populated from the request, commits the
/// transaction, and then asks the state handler enqueuer to pick the machine up.
///
/// # Errors
///
/// * Propagates the error from `convert_and_log_machine_id` when the machine
///   id cannot be converted.
/// * Propagates errors from `api.load_machine`, `db::machine::advance` and
///   `txn.commit`.
/// * Returns `CarbideError::InvalidArgument` (converted into a `Status`) when
///   the machine is in any other state.
///
/// A failure to enqueue the machine for the state handler is NOT an error for
/// the caller; it is only logged as a warning.
pub async fn report_scout_firmware_upgrade_status(
151+
api: &Api,
152+
request: Request<rpc::ScoutFirmwareUpgradeStatusRequest>,
153+
) -> Result<Response<()>, Status> {
154+
log_request_data(&request);
155+
156+
let req = request.into_inner();
157+
let machine_id = convert_and_log_machine_id(req.machine_id.as_ref())?;
158+
159+
// `txn` is the transaction handle that is committed further down, after the state update.
let (machine, mut txn) = api.load_machine(&machine_id, Default::default()).await?;
160+
161+
// Verify machine is in WaitingForScoutUpgrade state
// and copy out the fields that must be carried over unchanged into the
// updated state (everything except `result`, which is skipped by `..`).
162+
let (component_type, target_version, started_at, retry_count) = match machine.current_state() {
163+
ManagedHostState::HostReprovision {
164+
reprovision_state:
165+
HostReprovisionState::WaitingForScoutUpgrade {
166+
component_type,
167+
target_version,
168+
started_at,
169+
..
170+
},
171+
retry_count,
172+
} => (
173+
component_type.clone(),
174+
target_version.clone(),
175+
*started_at,
176+
*retry_count,
177+
),
178+
// Any other state: reject the report without touching the machine.
_ => {
179+
return Err(CarbideError::InvalidArgument(format!(
180+
"Machine {machine_id} is not in WaitingForScoutUpgrade state"
181+
))
182+
.into());
183+
}
184+
};
185+
186+
// Length limit handed to `truncate` for each free-text field of the report.
const MAX_STORED_OUTPUT_SIZE: usize = 1500;
187+
188+
// Same state variant as before with identical component, version, start
// time and retry count; only `result` changes (it is set to `Some(..)`).
let new_state = ManagedHostState::HostReprovision {
189+
reprovision_state: HostReprovisionState::WaitingForScoutUpgrade {
190+
component_type,
191+
target_version,
192+
started_at,
193+
result: Some(ScoutUpgradeResult {
194+
success: req.success,
195+
exit_code: req.exit_code,
196+
stdout: truncate(req.stdout, MAX_STORED_OUTPUT_SIZE),
197+
stderr: truncate(req.stderr, MAX_STORED_OUTPUT_SIZE),
198+
error: truncate(req.error, MAX_STORED_OUTPUT_SIZE),
199+
}),
200+
},
201+
retry_count,
202+
};
203+
204+
db::machine::advance(&machine, &mut txn, &new_state, None).await?;
205+
206+
txn.commit().await?;
207+
208+
// Best-effort wake-up: this runs after the commit, and a failure here is
// only logged, so the RPC still returns success.
if let Err(err) = api
209+
.machine_state_handler_enqueuer
210+
.enqueue_object(&machine_id)
211+
.await
212+
{
213+
tracing::warn!(%err, %machine_id, "Failed to wake up state handler for machine");
214+
}
215+
216+
Ok(Response::new(()))
217+
}

crates/api/src/state_controller/machine/handler.rs

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6653,6 +6653,10 @@ impl HostUpgradeState {
66536653
self.host_new_firmware_reported_wait(state, ctx, details, machine_id, scenario)
66546654
.await
66556655
}
6656+
HostReprovisionState::WaitingForScoutUpgrade { .. } => {
6657+
// TODO(@jrakhmonov): act on the reported scout upgrade result in a follow-up; until then this state is a no-op.
6658+
Ok(StateHandlerOutcome::do_nothing())
6659+
}
66566660
HostReprovisionState::FailedFirmwareUpgrade { report_time, .. } => {
66576661
let can_retry = retry_count < MAX_FIRMWARE_UPGRADE_RETRIES;
66586662
let waited_enough = Utc::now()

crates/rpc/proto/forge.proto

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -315,6 +315,8 @@ service Forge {
315315
// TODO: Remove when manual upgrade feature is removed
316316
// Mark host as having completed manual firmware upgrade
317317
rpc MarkManualFirmwareUpgradeComplete(common.MachineId) returns (google.protobuf.Empty);
318+
// Report the result of a scout-based firmware upgrade
319+
rpc ReportScoutFirmwareUpgradeStatus(ScoutFirmwareUpgradeStatusRequest) returns (google.protobuf.Empty);
318320

319321
rpc GetDpuInfoList(GetDpuInfoListRequest) returns (GetDpuInfoListResponse);
320322

@@ -5162,6 +5164,16 @@ message MachineRebootCompletedResponse {
51625164
message MachineRebootCompletedRequest {
51635165
common.MachineId machine_id = 1;
51645166
}
5167+
5168+
// Request of the ReportScoutFirmwareUpgradeStatus RPC: the result of a
// scout-based firmware upgrade for one machine.
message ScoutFirmwareUpgradeStatusRequest {
5169+
// Machine the report is about. The API handler rejects the report unless
// this machine is in the WaitingForScoutUpgrade reprovision state.
common.MachineId machine_id = 1;
5170+
// Stored unchanged as ScoutUpgradeResult.success.
bool success = 2;
5171+
// Stored unchanged as ScoutUpgradeResult.exit_code.
// NOTE(review): presumably the exit code of the upgrade command - confirm.
int32 exit_code = 3;
5172+
// Free-text fields below: the API handler passes each one through its
// `truncate` helper with a limit of 1500 before storing it.
string stdout = 4;
5173+
string stderr = 5;
5174+
string error = 6;
5175+
}
5176+
51655177
// enum MachineValidationState {
51665178
// Started = 0;
51675179
// InProgress = 1;

0 commit comments

Comments
 (0)