From 9de97c95cc3ded0700bdd649db7cab3b3568179d Mon Sep 17 00:00:00 2001
From: Jobdori
Date: Sat, 4 Apr 2026 20:07:36 +0900
Subject: [PATCH] feat(recovery): bridge WorkerFailureKind to FailureScenario (P2.8/P2.13)

Connect worker_boot failure classification to recovery_recipes policy:
- Add FailureScenario::ProviderFailure variant
- Add FailureScenario::from_worker_failure_kind() bridge function
  mapping every WorkerFailureKind to a concrete FailureScenario
- Add RecoveryStep::RestartWorker for provider failure recovery
- Add recipe for ProviderFailure: RestartWorker -> AlertHuman escalation
- 3 new tests: bridge mapping, recipe structure, recovery attempt cycle

Previously a claw that detected WorkerFailureKind::Provider had no
machine-readable path to 'what should I do about this?'. Now it can
call from_worker_failure_kind() -> recipe_for() -> attempt_recovery()
as a single structured chain.

Closes the silo between worker_boot and recovery_recipes.
---
 rust/crates/runtime/src/recovery_recipes.rs | 76 +++++++++++++++++++++
 1 file changed, 76 insertions(+)

diff --git a/rust/crates/runtime/src/recovery_recipes.rs b/rust/crates/runtime/src/recovery_recipes.rs
index 0afee62..63fe92c 100644
--- a/rust/crates/runtime/src/recovery_recipes.rs
+++ b/rust/crates/runtime/src/recovery_recipes.rs
@@ -9,6 +9,8 @@ use std::collections::HashMap;
 
 use serde::{Deserialize, Serialize};
 
+use crate::worker_boot::WorkerFailureKind;
+
 /// The six failure scenarios that have known recovery recipes.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
 #[serde(rename_all = "snake_case")]
@@ -19,6 +21,7 @@ pub enum FailureScenario {
     CompileRedCrossCrate,
     McpHandshakeFailure,
     PartialPluginStartup,
+    ProviderFailure,
 }
 
 impl FailureScenario {
@@ -32,8 +35,21 @@ impl FailureScenario {
             Self::CompileRedCrossCrate,
             Self::McpHandshakeFailure,
             Self::PartialPluginStartup,
+            Self::ProviderFailure,
         ]
     }
+
+    /// Map a `WorkerFailureKind` to the corresponding `FailureScenario`.
+    /// This is the bridge that lets recovery policy consume worker boot events.
+    #[must_use]
+    pub fn from_worker_failure_kind(kind: WorkerFailureKind) -> Self {
+        match kind {
+            WorkerFailureKind::TrustGate => Self::TrustPromptUnresolved,
+            WorkerFailureKind::PromptDelivery => Self::PromptMisdelivery,
+            WorkerFailureKind::Protocol => Self::McpHandshakeFailure,
+            WorkerFailureKind::Provider => Self::ProviderFailure,
+        }
+    }
 }
 
 impl std::fmt::Display for FailureScenario {
@@ -45,6 +61,7 @@ impl std::fmt::Display for FailureScenario {
             Self::CompileRedCrossCrate => write!(f, "compile_red_cross_crate"),
             Self::McpHandshakeFailure => write!(f, "mcp_handshake_failure"),
             Self::PartialPluginStartup => write!(f, "partial_plugin_startup"),
+            Self::ProviderFailure => write!(f, "provider_failure"),
         }
     }
 }
@@ -59,6 +76,7 @@ pub enum RecoveryStep {
     CleanBuild,
     RetryMcpHandshake { timeout: u64 },
     RestartPlugin { name: String },
+    RestartWorker,
     EscalateToHuman { reason: String },
 }
 
@@ -196,6 +214,12 @@ pub fn recipe_for(scenario: &FailureScenario) -> RecoveryRecipe {
             max_attempts: 1,
             escalation_policy: EscalationPolicy::LogAndContinue,
         },
+        FailureScenario::ProviderFailure => RecoveryRecipe {
+            scenario: *scenario,
+            steps: vec![RecoveryStep::RestartWorker],
+            max_attempts: 1,
+            escalation_policy: EscalationPolicy::AlertHuman,
+        },
     }
 }
 
@@ -551,4 +575,56 @@ mod tests {
         assert_eq!(recipe.escalation_policy, EscalationPolicy::Abort);
         assert_eq!(recipe.max_attempts, 1);
     }
+
+    #[test]
+    fn worker_failure_kind_maps_to_failure_scenario() {
+        // given / when / then — verify the bridge is correct
+        assert_eq!(
+            FailureScenario::from_worker_failure_kind(WorkerFailureKind::TrustGate),
+            FailureScenario::TrustPromptUnresolved,
+        );
+        assert_eq!(
+            FailureScenario::from_worker_failure_kind(WorkerFailureKind::PromptDelivery),
+            FailureScenario::PromptMisdelivery,
+        );
+        assert_eq!(
+            FailureScenario::from_worker_failure_kind(WorkerFailureKind::Protocol),
+            FailureScenario::McpHandshakeFailure,
+        );
+        assert_eq!(
+            FailureScenario::from_worker_failure_kind(WorkerFailureKind::Provider),
+            FailureScenario::ProviderFailure,
+        );
+    }
+
+    #[test]
+    fn provider_failure_recipe_uses_restart_worker_step() {
+        // given
+        let recipe = recipe_for(&FailureScenario::ProviderFailure);
+
+        // then
+        assert_eq!(recipe.scenario, FailureScenario::ProviderFailure);
+        assert!(recipe.steps.contains(&RecoveryStep::RestartWorker));
+        assert_eq!(recipe.escalation_policy, EscalationPolicy::AlertHuman);
+        assert_eq!(recipe.max_attempts, 1);
+    }
+
+    #[test]
+    fn provider_failure_recovery_attempt_succeeds_then_escalates() {
+        // given
+        let mut ctx = RecoveryContext::new();
+        let scenario = FailureScenario::ProviderFailure;
+
+        // when — first attempt
+        let first = attempt_recovery(&scenario, &mut ctx);
+        assert!(matches!(first, RecoveryResult::Recovered { .. }));
+
+        // when — second attempt should escalate (max_attempts=1)
+        let second = attempt_recovery(&scenario, &mut ctx);
+        assert!(matches!(second, RecoveryResult::EscalationRequired { .. }));
+        assert!(ctx
+            .events()
+            .iter()
+            .any(|e| matches!(e, RecoveryEvent::Escalated)));
+    }
 }