diff --git a/Cargo.lock b/Cargo.lock index 19fbb8b8..10bbe47f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -451,7 +451,7 @@ checksum = "c3e64b0cc0439b12df2fa678eae89a1c56a529fd067a9115f7827f1fffd22b32" [[package]] name = "cli-sub-agent" -version = "0.1.113" +version = "0.1.114" dependencies = [ "anyhow", "chrono", @@ -610,7 +610,7 @@ dependencies = [ [[package]] name = "csa-acp" -version = "0.1.113" +version = "0.1.114" dependencies = [ "agent-client-protocol", "anyhow", @@ -629,7 +629,7 @@ dependencies = [ [[package]] name = "csa-config" -version = "0.1.113" +version = "0.1.114" dependencies = [ "anyhow", "chrono", @@ -645,7 +645,7 @@ dependencies = [ [[package]] name = "csa-core" -version = "0.1.113" +version = "0.1.114" dependencies = [ "agent-teams", "chrono", @@ -659,7 +659,7 @@ dependencies = [ [[package]] name = "csa-executor" -version = "0.1.113" +version = "0.1.114" dependencies = [ "agent-teams", "anyhow", @@ -684,7 +684,7 @@ dependencies = [ [[package]] name = "csa-hooks" -version = "0.1.113" +version = "0.1.114" dependencies = [ "anyhow", "chrono", @@ -699,7 +699,7 @@ dependencies = [ [[package]] name = "csa-lock" -version = "0.1.113" +version = "0.1.114" dependencies = [ "anyhow", "chrono", @@ -711,7 +711,7 @@ dependencies = [ [[package]] name = "csa-mcp-hub" -version = "0.1.113" +version = "0.1.114" dependencies = [ "anyhow", "axum", @@ -733,7 +733,7 @@ dependencies = [ [[package]] name = "csa-memory" -version = "0.1.113" +version = "0.1.114" dependencies = [ "anyhow", "async-trait", @@ -751,7 +751,7 @@ dependencies = [ [[package]] name = "csa-process" -version = "0.1.113" +version = "0.1.114" dependencies = [ "anyhow", "csa-core", @@ -768,7 +768,7 @@ dependencies = [ [[package]] name = "csa-resource" -version = "0.1.113" +version = "0.1.114" dependencies = [ "anyhow", "csa-core", @@ -783,7 +783,7 @@ dependencies = [ [[package]] name = "csa-scheduler" -version = "0.1.113" +version = "0.1.114" dependencies = [ "anyhow", "chrono", @@ -801,7 +801,7 @@ dependencies = [ [[package]] name = "csa-session" -version = "0.1.113" +version = "0.1.114" dependencies = [ "anyhow", "chrono", @@ -822,7 +822,7 @@ dependencies = [ [[package]] name = "csa-todo" -version = "0.1.113" +version = "0.1.114" dependencies = [ "anyhow", "chrono", @@ -4090,7 +4090,7 @@ dependencies = [ [[package]] name = "weave" -version = "0.1.113" +version = "0.1.114" dependencies = [ "anyhow", "clap", diff --git a/Cargo.toml b/Cargo.toml index f399a1ba..74368f88 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ members = ["crates/*"] resolver = "2" [workspace.package] -version = "0.1.113" +version = "0.1.114" edition = "2024" rust-version = "1.85" license = "Apache-2.0" diff --git a/crates/cli-sub-agent/src/debate_cmd.rs b/crates/cli-sub-agent/src/debate_cmd.rs index 64d20b03..537ff16e 100644 --- a/crates/cli-sub-agent/src/debate_cmd.rs +++ b/crates/cli-sub-agent/src/debate_cmd.rs @@ -167,8 +167,19 @@ pub(crate) async fn handle_debate( ) .await?; - // 7. Get env injection from global config - let extra_env = global_config.env_vars(executor.tool_name()); + // 7. Get env injection from global config (with no-flash + api key fallback) + let extra_env_owned = { + let mut env = global_config + .env_vars(executor.tool_name()) + .cloned() + .unwrap_or_default(); + env.insert("_CSA_NO_FLASH_FALLBACK".to_string(), "1".to_string()); + if let Some(key) = global_config.api_key_fallback(executor.tool_name()) { + env.insert("_CSA_API_KEY_FALLBACK".to_string(), key.to_string()); + } + env + }; + let extra_env = Some(&extra_env_owned); let idle_timeout_seconds = crate::pipeline::resolve_idle_timeout_seconds(config.as_ref(), args.idle_timeout); diff --git a/crates/cli-sub-agent/src/review_cmd.rs b/crates/cli-sub-agent/src/review_cmd.rs index a357df01..e2c8b41c 100644 --- a/crates/cli-sub-agent/src/review_cmd.rs +++ b/crates/cli-sub-agent/src/review_cmd.rs @@ -537,7 +537,18 @@ async fn execute_review( prompt }; - let extra_env = global_config.env_vars(executor.tool_name()); + let extra_env_owned = { + let mut env = global_config + .env_vars(executor.tool_name()) + .cloned() + .unwrap_or_default(); + env.insert("_CSA_NO_FLASH_FALLBACK".to_string(), "1".to_string()); + if let Some(key) = global_config.api_key_fallback(executor.tool_name()) { + env.insert("_CSA_API_KEY_FALLBACK".to_string(), key.to_string()); + } + env + }; + let extra_env = Some(&extra_env_owned); let _slot_guard = crate::pipeline::acquire_slot(&executor, global_config)?; if session.is_none() { diff --git a/crates/csa-executor/src/transport.rs b/crates/csa-executor/src/transport.rs index dda4e892..3c689804 100644 --- a/crates/csa-executor/src/transport.rs +++ b/crates/csa-executor/src/transport.rs @@ -69,7 +69,9 @@ pub struct LegacyTransport { } const GEMINI_RATE_LIMIT_MAX_ATTEMPTS: u8 = 3; +const GEMINI_RATE_LIMIT_NO_FLASH_ATTEMPTS: u8 = 2; const API_KEY_FALLBACK_ENV_KEY: &str = "_CSA_API_KEY_FALLBACK"; +const NO_FLASH_FALLBACK_ENV_KEY: &str = "_CSA_NO_FLASH_FALLBACK"; const GEMINI_API_KEY_ENV: &str = "GEMINI_API_KEY"; #[cfg(test)] const GEMINI_RATE_LIMIT_BASE_BACKOFF_MS: u64 = 10; @@ -96,9 +98,15 @@ impl LegacyTransport { &self, execution: &ExecutionResult, attempt: u8, + extra_env: Option<&HashMap>, ) -> Option { + let max = if Self::is_no_flash(extra_env) { + GEMINI_RATE_LIMIT_NO_FLASH_ATTEMPTS + } else { + GEMINI_RATE_LIMIT_MAX_ATTEMPTS + }; if !matches!(self.executor, Executor::GeminiCli { .. }) - || attempt >= GEMINI_RATE_LIMIT_MAX_ATTEMPTS + || attempt >= max || !Self::is_gemini_rate_limited(execution) { return None; @@ -106,6 +114,10 @@ impl LegacyTransport { Some(Self::gemini_rate_limit_backoff(attempt)) } + fn is_no_flash(extra_env: Option<&HashMap>) -> bool { + extra_env.is_some_and(|env| env.contains_key(NO_FLASH_FALLBACK_ENV_KEY)) + } + fn is_gemini_rate_limited(execution: &ExecutionResult) -> bool { if execution.exit_code == 0 { return false; @@ -294,13 +306,10 @@ impl LegacyTransport { idle_timeout_seconds, ) .await?; - if let Some(backoff) = self.should_retry_gemini_rate_limited(&result.execution, attempt) + if let Some(backoff) = + self.should_retry_gemini_rate_limited(&result.execution, attempt, extra_env) { - tracing::debug!( - attempt, - max_attempts = GEMINI_RATE_LIMIT_MAX_ATTEMPTS, - "gemini-cli rate limit; retrying with model switch" - ); + tracing::debug!(attempt, "gemini-cli rate limit; retrying with model switch"); tokio::time::sleep(backoff).await; attempt = attempt.saturating_add(1); continue; @@ -308,9 +317,7 @@ impl LegacyTransport { // API key fallback: all model retries exhausted, still quota error. if Self::is_gemini_rate_limited(&result.execution) { if let Some(env_with_key) = Self::inject_api_key_fallback(extra_env) { - tracing::info!( - "gemini-cli quota exhausted after all retries; falling back to API key auth" - ); + tracing::info!("gemini-cli quota exhausted; falling back to API key auth"); return self .execute_in_single_attempt( &self.executor, @@ -351,13 +358,10 @@ impl Transport for LegacyTransport { options.clone(), ) .await?; - if let Some(backoff) = self.should_retry_gemini_rate_limited(&result.execution, attempt) + if let Some(backoff) = + self.should_retry_gemini_rate_limited(&result.execution, attempt, extra_env) { - tracing::debug!( - attempt, - max_attempts = GEMINI_RATE_LIMIT_MAX_ATTEMPTS, - "gemini-cli rate limit; retrying with model switch" - ); + tracing::debug!(attempt, "gemini-cli rate limit; retrying with model switch"); tokio::time::sleep(backoff).await; attempt = attempt.saturating_add(1); continue; @@ -365,9 +369,7 @@ impl Transport for LegacyTransport { // API key fallback: all model retries exhausted, still quota error. if Self::is_gemini_rate_limited(&result.execution) { if let Some(env_with_key) = Self::inject_api_key_fallback(extra_env) { - tracing::info!( - "gemini-cli quota exhausted after all retries; falling back to API key auth" - ); + tracing::info!("gemini-cli quota exhausted; falling back to API key auth"); return self .execute_single_attempt( &self.executor, diff --git a/crates/csa-executor/src/transport_tests_tail.rs b/crates/csa-executor/src/transport_tests_tail.rs index 690f01e7..e01fe074 100644 --- a/crates/csa-executor/src/transport_tests_tail.rs +++ b/crates/csa-executor/src/transport_tests_tail.rs @@ -378,9 +378,9 @@ fn test_should_retry_gemini_rate_limited_until_final_attempt() { exit_code: 1, }; - assert!(transport.should_retry_gemini_rate_limited(&execution, 1).is_some()); - assert!(transport.should_retry_gemini_rate_limited(&execution, 2).is_some()); - assert!(transport.should_retry_gemini_rate_limited(&execution, 3).is_none()); + assert!(transport.should_retry_gemini_rate_limited(&execution, 1, None).is_some()); + assert!(transport.should_retry_gemini_rate_limited(&execution, 2, None).is_some()); + assert!(transport.should_retry_gemini_rate_limited(&execution, 3, None).is_none()); } #[test] @@ -395,7 +395,7 @@ fn test_should_not_retry_on_success_exit_code() { stderr_output: String::new(), exit_code: 0, }; - assert!(transport.should_retry_gemini_rate_limited(&execution, 1).is_none()); + assert!(transport.should_retry_gemini_rate_limited(&execution, 1, None).is_none()); } #[test] @@ -410,7 +410,29 @@ fn test_should_retry_on_quota_exhausted_marker() { stderr_output: "reason: 'QUOTA_EXHAUSTED'".to_string(), exit_code: 1, }; - assert!(transport.should_retry_gemini_rate_limited(&execution, 1).is_some()); + assert!(transport.should_retry_gemini_rate_limited(&execution, 1, None).is_some()); +} + +#[test] +fn test_no_flash_fallback_stops_retry_after_attempt_2() { + let transport = LegacyTransport::new(Executor::GeminiCli { + model_override: None, + thinking_budget: None, + }); + let execution = ExecutionResult { + summary: "failed".to_string(), + output: String::new(), + stderr_output: "HTTP 429 Too Many Requests".to_string(), + exit_code: 1, + }; + let mut env = HashMap::new(); + env.insert("_CSA_NO_FLASH_FALLBACK".to_string(), "1".to_string()); + // Attempt 1 retries (switches to pro) + assert!(transport.should_retry_gemini_rate_limited(&execution, 1, Some(&env)).is_some()); + // Attempt 2 does NOT retry (would switch to flash, which is forbidden) + assert!(transport.should_retry_gemini_rate_limited(&execution, 2, Some(&env)).is_none()); + // Without the flag, attempt 2 would still retry + assert!(transport.should_retry_gemini_rate_limited(&execution, 2, None).is_some()); } #[test]