feat(runtime): recovery recipes

This commit is contained in:
Jobdori
2026-04-04 00:42:39 +09:00
parent f76311f9d6
commit 2787981632
2 changed files with 559 additions and 0 deletions

View File

@@ -14,6 +14,7 @@ mod mcp_stdio;
pub mod mcp_tool_bridge; pub mod mcp_tool_bridge;
mod oauth; mod oauth;
pub mod permission_enforcer; pub mod permission_enforcer;
pub mod recovery_recipes;
mod permissions; mod permissions;
mod prompt; mod prompt;
mod remote; mod remote;
@@ -80,6 +81,10 @@ pub use permissions::{
PermissionContext, PermissionMode, PermissionOutcome, PermissionOverride, PermissionPolicy, PermissionContext, PermissionMode, PermissionOutcome, PermissionOverride, PermissionPolicy,
PermissionPromptDecision, PermissionPrompter, PermissionRequest, PermissionPromptDecision, PermissionPrompter, PermissionRequest,
}; };
pub use recovery_recipes::{
attempt_recovery, recipe_for, EscalationPolicy, FailureScenario, RecoveryContext,
RecoveryEvent, RecoveryRecipe, RecoveryResult, RecoveryStep,
};
pub use prompt::{ pub use prompt::{
load_system_prompt, prepend_bullets, ContextFile, ProjectContext, PromptBuildError, load_system_prompt, prepend_bullets, ContextFile, ProjectContext, PromptBuildError,
SystemPromptBuilder, FRONTIER_MODEL_NAME, SYSTEM_PROMPT_DYNAMIC_BOUNDARY, SystemPromptBuilder, FRONTIER_MODEL_NAME, SYSTEM_PROMPT_DYNAMIC_BOUNDARY,

View File

@@ -0,0 +1,554 @@
//! Recovery recipes for common failure scenarios.
//!
//! Encodes known automatic recoveries for the six failure scenarios
//! listed in ROADMAP item 8, and enforces one automatic recovery
//! attempt before escalation. Each attempt is emitted as a structured
//! recovery event.
use std::collections::HashMap;
use serde::{Deserialize, Serialize};
/// The six failure scenarios that have known recovery recipes.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FailureScenario {
TrustPromptUnresolved,
PromptMisdelivery,
StaleBranch,
CompileRedCrossCrate,
McpHandshakeFailure,
PartialPluginStartup,
}
impl FailureScenario {
/// Returns all known failure scenarios.
#[must_use]
pub fn all() -> &'static [FailureScenario] {
&[
Self::TrustPromptUnresolved,
Self::PromptMisdelivery,
Self::StaleBranch,
Self::CompileRedCrossCrate,
Self::McpHandshakeFailure,
Self::PartialPluginStartup,
]
}
}
impl std::fmt::Display for FailureScenario {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::TrustPromptUnresolved => write!(f, "trust_prompt_unresolved"),
Self::PromptMisdelivery => write!(f, "prompt_misdelivery"),
Self::StaleBranch => write!(f, "stale_branch"),
Self::CompileRedCrossCrate => write!(f, "compile_red_cross_crate"),
Self::McpHandshakeFailure => write!(f, "mcp_handshake_failure"),
Self::PartialPluginStartup => write!(f, "partial_plugin_startup"),
}
}
}
/// Individual step that can be executed as part of a recovery recipe.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum RecoveryStep {
AcceptTrustPrompt,
RedirectPromptToAgent,
RebaseBranch,
CleanBuild,
RetryMcpHandshake { timeout: u64 },
RestartPlugin { name: String },
EscalateToHuman { reason: String },
}
/// Policy governing what happens when automatic recovery is exhausted.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum EscalationPolicy {
AlertHuman,
LogAndContinue,
Abort,
}
/// A recovery recipe encodes the sequence of steps to attempt for a
/// given failure scenario, along with the maximum number of automatic
/// attempts and the escalation policy.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct RecoveryRecipe {
pub scenario: FailureScenario,
pub steps: Vec<RecoveryStep>,
pub max_attempts: u32,
pub escalation_policy: EscalationPolicy,
}
/// Outcome of a recovery attempt.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum RecoveryResult {
Recovered {
steps_taken: u32,
},
PartialRecovery {
recovered: Vec<RecoveryStep>,
remaining: Vec<RecoveryStep>,
},
EscalationRequired {
reason: String,
},
}
/// Structured event emitted during recovery.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum RecoveryEvent {
RecoveryAttempted {
scenario: FailureScenario,
recipe: RecoveryRecipe,
result: RecoveryResult,
},
RecoverySucceeded,
RecoveryFailed,
Escalated,
}
/// Minimal context for tracking recovery state and emitting events.
///
/// Holds per-scenario attempt counts, a structured event log, and an
/// optional simulation knob for controlling step outcomes during tests.
#[derive(Debug, Clone, Default)]
pub struct RecoveryContext {
attempts: HashMap<FailureScenario, u32>,
events: Vec<RecoveryEvent>,
/// Optional step index at which simulated execution fails.
/// `None` means all steps succeed.
fail_at_step: Option<usize>,
}
impl RecoveryContext {
#[must_use]
pub fn new() -> Self {
Self::default()
}
/// Configure a step index at which simulated execution will fail.
#[must_use]
pub fn with_fail_at_step(mut self, index: usize) -> Self {
self.fail_at_step = Some(index);
self
}
/// Returns the structured event log populated during recovery.
#[must_use]
pub fn events(&self) -> &[RecoveryEvent] {
&self.events
}
/// Returns the number of recovery attempts made for a scenario.
#[must_use]
pub fn attempt_count(&self, scenario: &FailureScenario) -> u32 {
self.attempts.get(scenario).copied().unwrap_or(0)
}
}
/// Returns the known recovery recipe for the given failure scenario.
#[must_use]
pub fn recipe_for(scenario: &FailureScenario) -> RecoveryRecipe {
match scenario {
FailureScenario::TrustPromptUnresolved => RecoveryRecipe {
scenario: *scenario,
steps: vec![RecoveryStep::AcceptTrustPrompt],
max_attempts: 1,
escalation_policy: EscalationPolicy::AlertHuman,
},
FailureScenario::PromptMisdelivery => RecoveryRecipe {
scenario: *scenario,
steps: vec![RecoveryStep::RedirectPromptToAgent],
max_attempts: 1,
escalation_policy: EscalationPolicy::AlertHuman,
},
FailureScenario::StaleBranch => RecoveryRecipe {
scenario: *scenario,
steps: vec![RecoveryStep::RebaseBranch, RecoveryStep::CleanBuild],
max_attempts: 1,
escalation_policy: EscalationPolicy::AlertHuman,
},
FailureScenario::CompileRedCrossCrate => RecoveryRecipe {
scenario: *scenario,
steps: vec![RecoveryStep::CleanBuild],
max_attempts: 1,
escalation_policy: EscalationPolicy::AlertHuman,
},
FailureScenario::McpHandshakeFailure => RecoveryRecipe {
scenario: *scenario,
steps: vec![RecoveryStep::RetryMcpHandshake { timeout: 5000 }],
max_attempts: 1,
escalation_policy: EscalationPolicy::Abort,
},
FailureScenario::PartialPluginStartup => RecoveryRecipe {
scenario: *scenario,
steps: vec![
RecoveryStep::RestartPlugin {
name: "stalled".to_string(),
},
RecoveryStep::RetryMcpHandshake { timeout: 3000 },
],
max_attempts: 1,
escalation_policy: EscalationPolicy::LogAndContinue,
},
}
}
/// Attempts automatic recovery for the given failure scenario.
///
/// Looks up the recipe, enforces the one-attempt-before-escalation
/// policy, simulates step execution (controlled by the context), and
/// emits structured [`RecoveryEvent`]s for every attempt.
pub fn attempt_recovery(scenario: &FailureScenario, ctx: &mut RecoveryContext) -> RecoveryResult {
let recipe = recipe_for(scenario);
let attempt_count = ctx.attempts.entry(*scenario).or_insert(0);
// Enforce one automatic recovery attempt before escalation.
if *attempt_count >= recipe.max_attempts {
let result = RecoveryResult::EscalationRequired {
reason: format!(
"max recovery attempts ({}) exceeded for {}",
recipe.max_attempts, scenario
),
};
ctx.events.push(RecoveryEvent::RecoveryAttempted {
scenario: *scenario,
recipe,
result: result.clone(),
});
ctx.events.push(RecoveryEvent::Escalated);
return result;
}
*attempt_count += 1;
// Execute steps, honoring the optional fail_at_step simulation.
let fail_index = ctx.fail_at_step;
let mut executed = Vec::new();
let mut failed = false;
for (i, step) in recipe.steps.iter().enumerate() {
if fail_index == Some(i) {
failed = true;
break;
}
executed.push(step.clone());
}
let result = if failed {
let remaining: Vec<RecoveryStep> = recipe.steps[executed.len()..].to_vec();
if executed.is_empty() {
RecoveryResult::EscalationRequired {
reason: format!("recovery failed at first step for {}", scenario),
}
} else {
RecoveryResult::PartialRecovery {
recovered: executed,
remaining,
}
}
} else {
RecoveryResult::Recovered {
steps_taken: recipe.steps.len() as u32,
}
};
// Emit the attempt as structured event data.
ctx.events.push(RecoveryEvent::RecoveryAttempted {
scenario: *scenario,
recipe,
result: result.clone(),
});
match &result {
RecoveryResult::Recovered { .. } => {
ctx.events.push(RecoveryEvent::RecoverySucceeded);
}
RecoveryResult::PartialRecovery { .. } => {
ctx.events.push(RecoveryEvent::RecoveryFailed);
}
RecoveryResult::EscalationRequired { .. } => {
ctx.events.push(RecoveryEvent::Escalated);
}
}
result
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn each_scenario_has_a_matching_recipe() {
// given
let scenarios = FailureScenario::all();
// when / then
for scenario in scenarios {
let recipe = recipe_for(scenario);
assert_eq!(
recipe.scenario, *scenario,
"recipe scenario should match requested scenario"
);
assert!(
!recipe.steps.is_empty(),
"recipe for {} should have at least one step",
scenario
);
assert!(
recipe.max_attempts >= 1,
"recipe for {} should allow at least one attempt",
scenario
);
}
}
#[test]
fn successful_recovery_returns_recovered_and_emits_events() {
// given
let mut ctx = RecoveryContext::new();
let scenario = FailureScenario::TrustPromptUnresolved;
// when
let result = attempt_recovery(&scenario, &mut ctx);
// then
assert_eq!(result, RecoveryResult::Recovered { steps_taken: 1 });
assert_eq!(ctx.events().len(), 2);
assert!(matches!(
&ctx.events()[0],
RecoveryEvent::RecoveryAttempted {
scenario: s,
result: r,
..
} if *s == FailureScenario::TrustPromptUnresolved
&& matches!(r, RecoveryResult::Recovered { steps_taken: 1 })
));
assert_eq!(ctx.events()[1], RecoveryEvent::RecoverySucceeded);
}
#[test]
fn escalation_after_max_attempts_exceeded() {
// given
let mut ctx = RecoveryContext::new();
let scenario = FailureScenario::PromptMisdelivery;
// when — first attempt succeeds
let first = attempt_recovery(&scenario, &mut ctx);
assert!(matches!(first, RecoveryResult::Recovered { .. }));
// when — second attempt should escalate
let second = attempt_recovery(&scenario, &mut ctx);
// then
assert!(
matches!(
&second,
RecoveryResult::EscalationRequired { reason }
if reason.contains("max recovery attempts")
),
"second attempt should require escalation, got: {second:?}"
);
assert_eq!(ctx.attempt_count(&scenario), 1);
assert!(ctx
.events()
.iter()
.any(|e| matches!(e, RecoveryEvent::Escalated)));
}
#[test]
fn partial_recovery_when_step_fails_midway() {
// given — PartialPluginStartup has two steps; fail at step index 1
let mut ctx = RecoveryContext::new().with_fail_at_step(1);
let scenario = FailureScenario::PartialPluginStartup;
// when
let result = attempt_recovery(&scenario, &mut ctx);
// then
match &result {
RecoveryResult::PartialRecovery {
recovered,
remaining,
} => {
assert_eq!(recovered.len(), 1, "one step should have succeeded");
assert_eq!(remaining.len(), 1, "one step should remain");
assert!(matches!(recovered[0], RecoveryStep::RestartPlugin { .. }));
assert!(matches!(
remaining[0],
RecoveryStep::RetryMcpHandshake { .. }
));
}
other => panic!("expected PartialRecovery, got {other:?}"),
}
assert!(ctx
.events()
.iter()
.any(|e| matches!(e, RecoveryEvent::RecoveryFailed)));
}
#[test]
fn first_step_failure_escalates_immediately() {
// given — fail at step index 0
let mut ctx = RecoveryContext::new().with_fail_at_step(0);
let scenario = FailureScenario::CompileRedCrossCrate;
// when
let result = attempt_recovery(&scenario, &mut ctx);
// then
assert!(
matches!(
&result,
RecoveryResult::EscalationRequired { reason }
if reason.contains("failed at first step")
),
"zero-step failure should escalate, got: {result:?}"
);
assert!(ctx
.events()
.iter()
.any(|e| matches!(e, RecoveryEvent::Escalated)));
}
#[test]
fn emitted_events_include_structured_attempt_data() {
// given
let mut ctx = RecoveryContext::new();
let scenario = FailureScenario::McpHandshakeFailure;
// when
let _ = attempt_recovery(&scenario, &mut ctx);
// then — verify the RecoveryAttempted event carries full context
let attempted = ctx
.events()
.iter()
.find(|e| matches!(e, RecoveryEvent::RecoveryAttempted { .. }))
.expect("should have emitted RecoveryAttempted event");
match attempted {
RecoveryEvent::RecoveryAttempted {
scenario: s,
recipe,
result,
} => {
assert_eq!(*s, scenario);
assert_eq!(recipe.scenario, scenario);
assert!(!recipe.steps.is_empty());
assert!(matches!(result, RecoveryResult::Recovered { .. }));
}
_ => unreachable!(),
}
// Verify the event is serializable as structured JSON
let json = serde_json::to_string(&ctx.events()[0])
.expect("recovery event should be serializable to JSON");
assert!(
json.contains("mcp_handshake_failure"),
"serialized event should contain scenario name"
);
}
#[test]
fn recovery_context_tracks_attempts_per_scenario() {
// given
let mut ctx = RecoveryContext::new();
// when
assert_eq!(ctx.attempt_count(&FailureScenario::StaleBranch), 0);
attempt_recovery(&FailureScenario::StaleBranch, &mut ctx);
// then
assert_eq!(ctx.attempt_count(&FailureScenario::StaleBranch), 1);
assert_eq!(ctx.attempt_count(&FailureScenario::PromptMisdelivery), 0);
}
#[test]
fn stale_branch_recipe_has_rebase_then_clean_build() {
// given
let recipe = recipe_for(&FailureScenario::StaleBranch);
// then
assert_eq!(recipe.steps.len(), 2);
assert_eq!(recipe.steps[0], RecoveryStep::RebaseBranch);
assert_eq!(recipe.steps[1], RecoveryStep::CleanBuild);
}
#[test]
fn partial_plugin_startup_recipe_has_restart_then_handshake() {
// given
let recipe = recipe_for(&FailureScenario::PartialPluginStartup);
// then
assert_eq!(recipe.steps.len(), 2);
assert!(matches!(
recipe.steps[0],
RecoveryStep::RestartPlugin { .. }
));
assert!(matches!(
recipe.steps[1],
RecoveryStep::RetryMcpHandshake { timeout: 3000 }
));
assert_eq!(recipe.escalation_policy, EscalationPolicy::LogAndContinue);
}
#[test]
fn failure_scenario_display_all_variants() {
// given
let cases = [
(
FailureScenario::TrustPromptUnresolved,
"trust_prompt_unresolved",
),
(FailureScenario::PromptMisdelivery, "prompt_misdelivery"),
(FailureScenario::StaleBranch, "stale_branch"),
(
FailureScenario::CompileRedCrossCrate,
"compile_red_cross_crate",
),
(
FailureScenario::McpHandshakeFailure,
"mcp_handshake_failure",
),
(
FailureScenario::PartialPluginStartup,
"partial_plugin_startup",
),
];
// when / then
for (scenario, expected) in &cases {
assert_eq!(scenario.to_string(), *expected);
}
}
#[test]
fn multi_step_success_reports_correct_steps_taken() {
// given — StaleBranch has 2 steps, no simulated failure
let mut ctx = RecoveryContext::new();
let scenario = FailureScenario::StaleBranch;
// when
let result = attempt_recovery(&scenario, &mut ctx);
// then
assert_eq!(result, RecoveryResult::Recovered { steps_taken: 2 });
}
#[test]
fn mcp_handshake_recipe_uses_abort_escalation_policy() {
// given
let recipe = recipe_for(&FailureScenario::McpHandshakeFailure);
// then
assert_eq!(recipe.escalation_policy, EscalationPolicy::Abort);
assert_eq!(recipe.max_attempts, 1);
}
}