From 77fcf4e470524ef1a9343abb7fb9778f2be7876d Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 17 Apr 2026 11:13:59 -0700 Subject: [PATCH 1/5] rough first pass --- clients/sled-agent-client/src/lib.rs | 1 + common/src/api/internal/nexus.rs | 2 + dev-tools/omdb/src/bin/omdb/db.rs | 18 ++++++- nexus/db-model/src/vmm.rs | 51 ++++++++++++------- nexus/db-queries/src/db/datastore/instance.rs | 8 +++ nexus/db-queries/src/db/datastore/vmm.rs | 7 +++ nexus/db-schema/src/schema.rs | 1 + .../background/tasks/abandoned_vmm_reaper.rs | 1 + .../tasks/instance_reincarnation.rs | 1 + .../app/background/tasks/instance_watcher.rs | 5 ++ nexus/src/app/instance.rs | 1 + nexus/src/app/sagas/instance_update/mod.rs | 3 ++ nexus/tests/integration_tests/instances.rs | 2 +- schema/crdb/dbinit.sql | 13 ++++- sled-agent/src/common/instance.rs | 2 + sled-agent/src/sim/collection.rs | 1 + 16 files changed, 96 insertions(+), 21 deletions(-) diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index 0bc41e434bd..d4b4b1e832c 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -162,6 +162,7 @@ impl From state: s.state.into(), generation: s.gen_, time_updated: s.time_updated, + failure_reason: None, // TODO(eliza) } } } diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index 5139e7f347f..a7ade5d91d2 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -103,6 +103,8 @@ pub struct VmmRuntimeState { pub generation: Generation, /// Timestamp for the VMM's state. pub time_updated: DateTime, + /// A human-readable description of why this VMM is in the 'failed' state. + pub failure_reason: Option, } /// A wrapper type containing a sled's total knowledge of the state of a VMM. diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index 3c9d4baeaa7..d617c9c80d2 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -5061,7 +5061,7 @@ async fn cmd_db_instance_info( let ctx = || "listing past VMMs"; #[derive(Tabled)] #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] - struct VmmRow { + struct VmmRow<'a> { #[tabled(inline)] state: VmmStateRow, sled_id: SledUuid, @@ -5069,6 +5069,8 @@ async fn cmd_db_instance_info( time_created: chrono::DateTime, #[tabled(display_with = "datetime_opt_rfc3339_concise")] time_deleted: Option>, + #[tabled(display_with = "display_option_blank")] + failure_reason: Option<&'a str>, } let vmms = vmm_dsl::vmm .filter(vmm_dsl::instance_id.eq(id.into_untyped_uuid())) @@ -5097,6 +5099,7 @@ async fn cmd_db_instance_info( time_state_updated: _, generation, state, + ref failure_reason, } = vmm; VmmRow { state: VmmStateRow { @@ -5107,6 +5110,7 @@ async fn cmd_db_instance_info( sled_id: sled_id.into(), time_created, time_deleted, + failure_reason: failure_reason.as_deref(), } })) .with(tabled::settings::Style::empty()) @@ -7699,13 +7703,14 @@ fn prettyprint_vmm( const ID: &'static str = "ID"; const CREATED: &'static str = "created at"; const DELETED: &'static str = "deleted at"; - const UPDATED: &'static str = "updated at"; + const UPDATED: &'static str = " updated at"; const INSTANCE_ID: &'static str = "instance ID"; const SLED_ID: &'static str = "sled ID"; const SLED_SERIAL: &'static str = "sled serial"; const CPU_PLATFORM: &'static str = "CPU platform"; const ADDRESS: &'static str = "propolis address"; const STATE: &'static str = "state"; + const FAILURE_REASON: &'static str = " failed because:"; const WIDTH: usize = const_max_len(&[ ID, CREATED, @@ -7717,6 +7722,7 @@ fn prettyprint_vmm( CPU_PLATFORM, STATE, ADDRESS, + FAILURE_REASON, ]); let width = std::cmp::max(width, Some(WIDTH)).unwrap_or(WIDTH); @@ -7732,6 +7738,7 @@ fn prettyprint_vmm( state, generation, time_state_updated, + failure_reason, } = vmm; println!("{indent}{ID:>width$}: {id}"); @@ -7743,6 +7750,9 @@ fn prettyprint_vmm( println!("{indent}{DELETED:width$}: {deleted}"); } println!("{indent}{STATE:>width$}: {state}"); + if let Some(reason) = failure_reason { + println!("{indent}{FAILURE_REASON:>width$}: {reason}"); + } let g = u64::from(generation.0); println!( "{indent}{UPDATED:>width$}: {time_state_updated:?} (generation {g})" @@ -7821,6 +7831,8 @@ async fn cmd_db_vmm_list( #[tabled(inline)] state: VmmStateRow, sled: &'a str, + #[tabled(display_with = "display_option_blank")] + failure_reason: Option<&'a str>, } impl<'a> From<&'a (Vmm, Option)> for VmmRow<'a> { @@ -7837,6 +7849,7 @@ async fn cmd_db_vmm_list( time_state_updated: _, generation, state, + ref failure_reason, } = vmm; let sled = match sled { Some(sled) => sled.serial_number(), @@ -7853,6 +7866,7 @@ async fn cmd_db_vmm_list( generation: generation.0.into(), }, sled, + failure_reason: failure_reason.as_deref(), } } } diff --git a/nexus/db-model/src/vmm.rs b/nexus/db-model/src/vmm.rs index b8ba2c1c1c2..7f62c2a20c9 100644 --- a/nexus/db-model/src/vmm.rs +++ b/nexus/db-model/src/vmm.rs @@ -17,6 +17,7 @@ use crate::typed_uuid::DbTypedUuid; use crate::{SqlU16, VmmCpuPlatform}; use chrono::{DateTime, Utc}; use nexus_db_schema::schema::vmm; +use omicron_common::api::internal::nexus; use omicron_uuid_kinds::*; use serde::{Deserialize, Serialize}; use uuid::Uuid; @@ -72,6 +73,12 @@ pub struct Vmm { /// control plane if this VMM's instance didn't specify a required platform /// when it was started. pub cpu_platform: VmmCpuPlatform, + + /// A human-readable reason describing why this VMM is in the `Failed` state. + /// + /// This is not stable and is intended for debugging purposes only. It + /// should only be `Some` if the VMM's `state` is `Failed`. + pub failure_reason: Option, } impl Vmm { @@ -101,17 +108,18 @@ impl Vmm { propolis_port: SqlU16(propolis_port), state: VmmState::Creating, cpu_platform, + failure_reason: None, } } - /// Returns the runtime state of this VMM. - pub fn runtime(&self) -> VmmRuntimeState { - VmmRuntimeState { - time_state_updated: self.time_state_updated, - generation: self.generation, - state: self.state, - } - } + // /// Returns the runtime state of this VMM. + // pub fn runtime(&self) -> VmmRuntimeState { + // VmmRuntimeState { + // time_state_updated: self.time_state_updated, + // generation: self.generation, + // state: self.state, + // } + // } pub fn sled_id(&self) -> SledUuid { self.sled_id.into() @@ -143,18 +151,27 @@ pub struct VmmRuntimeState { /// The state of this VMM. If this VMM is the active VMM for a given /// instance, this state is the instance's logical state. pub state: VmmState, + + /// A human-readable reason describing why this VMM is in the `Failed` state. + /// + /// This is not stable and is intended for debugging purposes only. It + /// should only be `Some` if the VMM's `state` is `Failed`. + pub failure_reason: Option, } -impl From - for VmmRuntimeState -{ - fn from( - value: omicron_common::api::internal::nexus::VmmRuntimeState, - ) -> Self { +impl From for VmmRuntimeState { + fn from(value: nexus::VmmRuntimeState) -> Self { + let nexus::VmmRuntimeState { + state, + time_updated, + generation, + failure_reason, + } = value; Self { - state: value.state.into(), - time_state_updated: value.time_updated, - generation: value.generation.into(), + state: state.into(), + time_state_updated: time_updated, + generation: generation.into(), + failure_reason, } } } diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index eb00f9bc6ac..c7b876a82fa 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -2991,6 +2991,7 @@ mod tests { time_state_updated: Utc::now(), generation: Generation::new(), state: VmmState::Running, + failure_reason: None, }, ) .await @@ -3052,6 +3053,7 @@ mod tests { time_state_updated: Utc::now(), generation: Generation::new(), state: VmmState::Running, + failure_reason: None, }, ) .await @@ -3148,6 +3150,7 @@ mod tests { time_state_updated: Utc::now(), generation: Generation::new(), state: VmmState::Stopped, + failure_reason: None, }, ) .await @@ -3187,6 +3190,7 @@ mod tests { time_state_updated: Utc::now(), generation: Generation::new(), state: VmmState::Running, + failure_reason: None, }, ) .await @@ -3224,6 +3228,7 @@ mod tests { &VmmRuntimeState { time_state_updated: Utc::now(), generation: Generation(vmm2.generation.0.next()), + failure_reason: None, state: VmmState::Running, }, ) @@ -3288,6 +3293,7 @@ mod tests { time_state_updated: Utc::now(), generation: Generation::new(), state: VmmState::Running, + failure_reason: None, }, ) .await @@ -3323,6 +3329,7 @@ mod tests { &VmmRuntimeState { time_state_updated: Utc::now(), generation: Generation(vmm2.generation.0.next().next()), + failure_reason: None, state: VmmState::SagaUnwound, }, ) @@ -3433,6 +3440,7 @@ mod tests { time_state_updated: Utc::now(), generation: Generation::new(), state: VmmState::Running, + failure_reason: None, }, ) .await diff --git a/nexus/db-queries/src/db/datastore/vmm.rs b/nexus/db-queries/src/db/datastore/vmm.rs index 060a6409b45..09f61083877 100644 --- a/nexus/db-queries/src/db/datastore/vmm.rs +++ b/nexus/db-queries/src/db/datastore/vmm.rs @@ -476,6 +476,7 @@ mod tests { time_state_updated: Utc::now(), generation: Generation::new(), state: VmmState::Running, + failure_reason: None, }, ) .await @@ -496,6 +497,7 @@ mod tests { time_state_updated: Utc::now(), generation: Generation::new(), state: VmmState::Running, + failure_reason: None, }, ) .await @@ -531,6 +533,7 @@ mod tests { time_state_updated: Utc::now(), generation: Generation(vmm1.generation.0.next()), state: VmmState::Stopping, + failure_reason: None, }, Migrations { migration_in: None, @@ -553,6 +556,7 @@ mod tests { time_state_updated: Utc::now(), generation: Generation(vmm2.generation.0.next()), state: VmmState::Running, + failure_reason: None, }, Migrations { migration_in: Some(&vmm2_migration_in), @@ -605,6 +609,7 @@ mod tests { time_state_updated: Utc::now(), generation: Generation::new(), state: VmmState::Running, + failure_reason: None, }, ) .await @@ -639,6 +644,7 @@ mod tests { time_state_updated: Utc::now(), generation: Generation(vmm2.generation.0.next()), state: VmmState::Destroyed, + failure_reason: None, }, Migrations { migration_in: Some(&vmm2_migration_in), @@ -663,6 +669,7 @@ mod tests { time_state_updated: Utc::now(), generation: Generation(vmm3.generation.0.next()), state: VmmState::Destroyed, + failure_reason: None, }, Migrations { migration_in: Some(&vmm3_migration_in), diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index 2b4868fbe83..18b3546c21f 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -482,6 +482,7 @@ table! { propolis_port -> Int4, state -> crate::enums::VmmStateEnum, cpu_platform -> crate::enums::VmmCpuPlatformEnum, + failure_reason -> Nullable, } } joinable!(vmm -> sled (sled_id)); diff --git a/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs b/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs index 6db4c8c38b2..40d6c04b2d7 100644 --- a/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs +++ b/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs @@ -250,6 +250,7 @@ mod tests { state: VmmState::Destroyed, time_state_updated: Utc::now(), generation: Generation::new(), + failure_reason: None, }), ) .await diff --git a/nexus/src/app/background/tasks/instance_reincarnation.rs b/nexus/src/app/background/tasks/instance_reincarnation.rs index 8b497b618ef..1b67e86d402 100644 --- a/nexus/src/app/background/tasks/instance_reincarnation.rs +++ b/nexus/src/app/background/tasks/instance_reincarnation.rs @@ -458,6 +458,7 @@ mod test { time_state_updated: Utc::now(), generation: Generation::new(), state: VmmState::SagaUnwound, + failure_reason: None, }, ) .await diff --git a/nexus/src/app/background/tasks/instance_watcher.rs b/nexus/src/app/background/tasks/instance_watcher.rs index 2c7d9ae4c47..8d276a11715 100644 --- a/nexus/src/app/background/tasks/instance_watcher.rs +++ b/nexus/src/app/background/tasks/instance_watcher.rs @@ -134,6 +134,7 @@ impl InstanceWatcher { generation: vmm.generation.0.next(), state: nexus::VmmState::Failed, time_updated: chrono::Utc::now(), + failure_reason: Some("sled expunged".to_string()), }, // It's fine to synthesize `None`s here because a `None` // just means "don't update the migration state", not @@ -173,6 +174,10 @@ impl InstanceWatcher { generation: vmm.generation.0.next(), state: nexus::VmmState::Failed, time_updated: chrono::Utc::now(), + failure_reason: Some( + "VMM no longer known to sled-agent" + .to_string(), + ), }, // It's fine to synthesize `None`s here because a `None` // just means "don't update the migration state", not diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index adc18bb12da..d7061760661 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1746,6 +1746,7 @@ impl super::Nexus { state: db::model::VmmState::Failed, time_state_updated: chrono::Utc::now(), generation: db::model::Generation(vmm.generation.next()), + failure_reason: Some(reason.to_string()), }; match self.db_datastore.vmm_update_runtime(&vmm_id, &new_runtime).await diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index a78be1f77d5..c1af58d332b 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -1971,6 +1971,7 @@ mod test { time_state_updated: Utc::now(), generation: Generation(vmm.generation.0.next()), state: VmmState::Destroyed, + failure_reason: None, }, ) .await @@ -2631,6 +2632,7 @@ mod test { time_state_updated: Utc::now(), generation: Generation(src_vmm.generation.0.next()), state: vmm_state, + failure_reason: None, }; let migration = self @@ -2688,6 +2690,7 @@ mod test { time_state_updated: Utc::now(), generation: Generation(target_vmm.generation.0.next()), state: vmm_state, + failure_reason: None, }; let migration = self diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index e2d549c2f81..d1cce142602 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -1656,7 +1656,7 @@ async fn test_instance_failed_by_instance_watcher_can_be_restarted( // Wait for the instance to transition to Failed. instance_wait_for_state(client, instance_id, InstanceState::Failed).await; - + panic!("lol"); // Now, the instance should be deleteable. expect_instance_delete_ok(&client, instance_name).await; } diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index c3a3959ee06..a535777193e 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -5933,7 +5933,18 @@ CREATE TABLE IF NOT EXISTS omicron.public.vmm ( propolis_ip INET NOT NULL, propolis_port INT4 NOT NULL CHECK (propolis_port BETWEEN 0 AND 65535) DEFAULT 12400, state omicron.public.vmm_state NOT NULL, - cpu_platform omicron.public.vmm_cpu_platform NOT NULL + cpu_platform omicron.public.vmm_cpu_platform NOT NULL, + -- A human-readable description of why this VMM is in the 'failed' state. + -- + -- This is not stable and is intended for debugging purposes only. The + -- `failure_reason_iff_failed` CHECK constraint ensures that this field is + -- only set when the VMM's state is set to 'failed'. + failure_reason TEXT, + + CONSTRAINT failure_reason_iff_failed CHECK ( + (failure_reason IS NOT NULL AND state = 'failed') + OR (failure_reason IS NULL) + ) ); CREATE INDEX IF NOT EXISTS lookup_vmms_by_sled_id ON omicron.public.vmm ( diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index 160cbe21db8..be658f2cd93 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -347,6 +347,7 @@ mod test { state: VmmState::Starting, generation: Generation::new(), time_updated: now, + failure_reason: None, }; InstanceStates::new(vmm, None) @@ -375,6 +376,7 @@ mod test { state: VmmState::Migrating, generation: Generation::new(), time_updated: now, + failure_reason: None, }; InstanceStates::new(vmm, Some(Uuid::new_v4())) diff --git a/sled-agent/src/sim/collection.rs b/sled-agent/src/sim/collection.rs index 5a86d3c867a..49577d01fb6 100644 --- a/sled-agent/src/sim/collection.rs +++ b/sled-agent/src/sim/collection.rs @@ -385,6 +385,7 @@ mod test { state: VmmState::Starting, generation: Generation::new(), time_updated: Utc::now(), + failure_reason: None, }; let state = From 4717f133ff48a8e38870a99f3ef3cfd99b2c99d6 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 17 Apr 2026 11:49:12 -0700 Subject: [PATCH 2/5] more --- nexus/src/app/instance.rs | 10 ++-- nexus/tests/integration_tests/instances.rs | 2 + sled-agent/src/common/instance.rs | 55 +++++++++++++++------- sled-agent/src/instance.rs | 39 ++++++++++----- sled-agent/src/sim/instance.rs | 4 +- 5 files changed, 76 insertions(+), 34 deletions(-) diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index d7061760661..136cd445100 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1733,20 +1733,22 @@ impl super::Nexus { opctx: &OpContext, authz_instance: authz::Instance, vmm: &db::model::Vmm, - reason: &SledAgentInstanceError, + error: &SledAgentInstanceError, ) -> Result<(), Error> { let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id()); let vmm_id = PropolisUuid::from_untyped_uuid(vmm.id); error!(self.log, "marking VMM failed due to sled agent API error"; "instance_id" => %instance_id, "vmm_id" => %vmm_id, - "error" => ?reason); + "error" => ?error); let new_runtime = VmmRuntimeState { state: db::model::VmmState::Failed, time_state_updated: chrono::Utc::now(), generation: db::model::Generation(vmm.generation.next()), - failure_reason: Some(reason.to_string()), + failure_reason: Some( + "VMM no longer known to sled-agent".to_string(), + ), }; match self.db_datastore.vmm_update_runtime(&vmm_id, &new_runtime).await @@ -1755,7 +1757,7 @@ impl super::Nexus { info!(self.log, "marked VMM as Failed, preparing update saga"; "instance_id" => %instance_id, "vmm_id" => %vmm_id, - "reason" => ?reason, + "error" => ?error, ); let saga = instance_update::SagaInstanceUpdate::prepare( &instance_update::Params { diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index d1cce142602..c55af91454e 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -1599,6 +1599,8 @@ async fn test_instance_failed_after_sled_agent_forgets_vmm_can_be_stopped( instance_post(&client, instance_name, InstanceOp::Stop).await; assert_eq!(instance_next.runtime.run_state, InstanceState::Stopped); + panic!("lmao"); + // Now, the Stopped nstance should be deleteable.. expect_instance_delete_ok(client, instance_name).await; } diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index be658f2cd93..9e57dc422e9 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -94,7 +94,7 @@ pub enum ObservedMigrationStatus { } /// The information observed by the instance's Propolis state monitor. -#[derive(Clone, Copy, Debug)] +#[derive(Clone, Debug)] pub(crate) struct ObservedPropolisState { /// The state reported by Propolis's instance state monitor API. pub vmm_state: PropolisInstanceState, @@ -104,6 +104,7 @@ pub(crate) struct ObservedPropolisState { /// The approximate time at which this observation was made. pub time: DateTime, + pub failure_reason: Option, } impl ObservedPropolisState { @@ -111,6 +112,12 @@ impl ObservedPropolisState { /// state and an instance state monitor response received from /// Propolis. pub fn new(propolis_state: &InstanceStateMonitorResponse) -> Self { + let failure_reason = if propolis_state.state == PropolisApiState::Failed + { + Some("propolis reported the VM failed".to_string()) + } else { + None + }; Self { vmm_state: PropolisInstanceState(propolis_state.state), migration_in: propolis_state @@ -124,6 +131,7 @@ impl ObservedPropolisState { .as_ref() .map(ObservedMigrationState::from), time: Utc::now(), + failure_reason, } } } @@ -215,7 +223,7 @@ impl InstanceStates { /// Propolis. pub(crate) fn apply_propolis_observation( &mut self, - observed: &ObservedPropolisState, + observed: ObservedPropolisState, ) { fn transition_migration( current: &mut Option, @@ -262,10 +270,10 @@ impl InstanceStates { migration.state = MigrationState::Failed; } } - self.vmm.state = observed.vmm_state.into(); self.vmm.generation = self.vmm.generation.next(); self.vmm.time_updated = observed.time; + self.vmm.failure_reason = observed.failure_reason; // Update the instance record to reflect the result of any completed // migration. @@ -304,18 +312,22 @@ impl InstanceStates { /// may use this to force a VMM to reach this state when it knows no /// more state updates are forthcoming from Propolis (or when there might be /// updates, but the monitor has decided it no longer cares). - pub(crate) fn force_state_to_failed(&mut self) { - self.force_state_to(PropolisApiState::Failed); + pub(crate) fn force_state_to_failed(&mut self, reason: impl ToString) { + self.force_state_to(PropolisApiState::Failed, Some(reason.to_string())); } /// Forcibly transitions this VMM to the Destroyed state. This allows the /// instance runner to force the VMM to reach a terminal state if a client /// asks to stop that VMM before actually asking to start it. pub(crate) fn force_state_to_destroyed(&mut self) { - self.force_state_to(PropolisApiState::Destroyed); + self.force_state_to(PropolisApiState::Destroyed, None); } - fn force_state_to(&mut self, state: PropolisApiState) { + fn force_state_to( + &mut self, + state: PropolisApiState, + failure_reason: Option, + ) { // The callee interprets the `None` values in the migration state fields // as "no update," so this won't replace any in-progress migration data // (unless the supplied state is a terminal state, in which case any @@ -325,9 +337,10 @@ impl InstanceStates { migration_in: None, migration_out: None, time: Utc::now(), + failure_reason, }; - self.apply_propolis_observation(&fake_observed); + self.apply_propolis_observation(fake_observed); } } @@ -385,11 +398,17 @@ mod test { fn make_observed_state( propolis_state: PropolisInstanceState, ) -> ObservedPropolisState { + let failure_reason = if propolis_state.0 == PropolisApiState::Failed { + Some("(fake) propolis reported the VM failed".to_string()) + } else { + None + }; ObservedPropolisState { vmm_state: propolis_state, migration_in: None, migration_out: None, time: Utc::now(), + failure_reason, } } @@ -413,7 +432,7 @@ mod test { for state in [Observed::Destroyed, Observed::Failed] { let mut instance_state = make_instance(); instance_state - .apply_propolis_observation(&make_observed_state(state.into())); + .apply_propolis_observation(make_observed_state(state.into())); assert!(instance_state.vmm_is_halted()) } } @@ -425,7 +444,7 @@ mod test { let original_migration = instance_state.clone().migration_out.unwrap(); instance_state - .apply_propolis_observation(&make_observed_state(state.into())); + .apply_propolis_observation(make_observed_state(state.into())); let migration = instance_state .migration_out @@ -442,7 +461,7 @@ mod test { let original_migration = instance_state.clone().migration_in.unwrap(); instance_state - .apply_propolis_observation(&make_observed_state(state.into())); + .apply_propolis_observation(make_observed_state(state.into())); let migration = instance_state .migration_in @@ -467,13 +486,14 @@ mod test { }), migration_in: None, time: Utc::now(), + failure_reason: None, }; // This transition should transfer control to the target VMM without // actually marking the migration as completed. This advances the // instance's state generation. let prev = state.clone(); - state.apply_propolis_observation(&observed); + state.apply_propolis_observation(observed.clone()); assert!(!state.vmm_is_halted()); assert_state_change_has_gen_change(&prev, &state); @@ -493,7 +513,7 @@ mod test { // anymore. let prev = state.clone(); observed.vmm_state = PropolisInstanceState(Observed::Stopped); - state.apply_propolis_observation(&observed); + state.apply_propolis_observation(observed.clone()); assert!(!state.vmm_is_halted()); assert_state_change_has_gen_change(&prev, &state); @@ -514,7 +534,7 @@ mod test { let prev = state.clone(); observed.vmm_state = PropolisInstanceState(Observed::Destroyed); - state.apply_propolis_observation(&observed); + state.apply_propolis_observation(observed); assert!(state.vmm_is_halted()); assert_state_change_has_gen_change(&prev, &state); assert_eq!(state.vmm.state, VmmState::Destroyed); @@ -551,10 +571,13 @@ mod test { }), migration_out: None, time: Utc::now(), + failure_reason: Some( + "(fake) propolis reported the VM failed".to_string(), + ), }; let prev = state.clone(); - state.apply_propolis_observation(&observed); + state.apply_propolis_observation(observed); assert!(state.vmm_is_halted()); assert_state_change_has_gen_change(&prev, &state); assert_eq!(state.vmm.state, VmmState::Failed); @@ -576,7 +599,7 @@ mod test { let mut state = make_migration_target_instance(); let prev = state.clone(); - state.force_state_to_failed(); + state.force_state_to_failed("forcefully terminated"); assert_state_change_has_gen_change(&prev, &state); diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 1cedaa1faff..0ba662d1e35 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -693,7 +693,7 @@ impl InstanceRunner { "instance request channel unexpectedly closed" ); - self.fail_vmm_and_terminate().await; + self.fail_vmm_and_terminate("instance request channel closed before final state was published (possible sled-agent bug?)").await; break; } }; @@ -1021,7 +1021,7 @@ impl InstanceRunner { let InstanceMonitorMessage { update, tx } = request; let response = match update { InstanceMonitorUpdate::State(state) => { - self.observe_state(&ObservedPropolisState::new(&state)); + self.observe_state(ObservedPropolisState::new(&state)); // If Propolis still has an active VM, just publish this state // update and resume normal operation. Otherwise, go through the @@ -1040,7 +1040,10 @@ impl InstanceRunner { // There's no way to restore the zone at this point, so the // runner is now the sole driver of the VMM state machine. Drive // the VMM to Failed and go through the termination sequence. - self.fail_vmm_and_terminate().await; + self.fail_vmm_and_terminate( + "propolis zone has gone away entirely", + ) + .await; ControlFlow::Break(()) } InstanceMonitorUpdate::Error(PropolisErrorCode::NoInstance) => { @@ -1053,7 +1056,10 @@ impl InstanceRunner { // This error indicates that Propolis crashed and restarted and // lost whatever VM was previously created there. Nothing to do // but mark the VMM as failed and tear things down. - self.fail_vmm_and_terminate().await; + self.fail_vmm_and_terminate( + "propolis unexpectedly reported no instance", + ) + .await; ControlFlow::Break(()) } InstanceMonitorUpdate::Error( @@ -1084,7 +1090,7 @@ impl InstanceRunner { /// Processes a Propolis state change observed by the Propolis monitoring /// task. - fn observe_state(&mut self, state: &ObservedPropolisState) { + fn observe_state(&mut self, state: ObservedPropolisState) { info!(self.log, "observed new Propolis state"; "state" => ?state); // This shouldn't happen: the monitor can't observe anything before a @@ -2227,7 +2233,7 @@ impl InstanceRunner { // just move to Failed so that the corresponding instance // can be stopped. if self.running_state.is_none() { - self.fail_vmm_and_terminate().await; + self.fail_vmm_and_terminate("instance_ensure failed (no Propolis zone installed)").await; return Err(e); } } @@ -2241,7 +2247,7 @@ impl InstanceRunner { // this VMM should just move to Failed so that the // corresponding instance can be stopped. if self.running_state.is_none() { - self.fail_vmm_and_terminate().await; + self.fail_vmm_and_terminate("instance_ensure failed (no Propolis zone installed)").await; return Err(e); } } @@ -2303,7 +2309,10 @@ impl InstanceRunner { "code" => ?code, ); - self.fail_vmm_and_terminate().await; + self.fail_vmm_and_terminate( + "propolis denies knowledge of VM", + ) + .await; // Return the newly-installed Failed state here so it // doesn't get clobbered by the published VMM state @@ -2498,7 +2507,10 @@ impl InstanceRunner { ); } - self.fail_vmm_and_terminate().await; + self.fail_vmm_and_terminate( + "received request to terminate forcefully", + ) + .await; let result = tx .send(Ok(VmmUnregisterResponse { updated_runtime: Some(self.state.sled_instance_state()), @@ -2545,7 +2557,10 @@ impl InstanceRunner { ); } - self.fail_vmm_and_terminate().await; + self.fail_vmm_and_terminate( + "termination channel closed unexpectedly (possible propolis bug?)", + ) + .await; VmmStateOwner::Runner } } @@ -2553,8 +2568,8 @@ impl InstanceRunner { /// Forcibly moves this VMM to the Failed state, then goes through the /// runner termination sequence. - async fn fail_vmm_and_terminate(&mut self) { - self.state.force_state_to_failed(); + async fn fail_vmm_and_terminate(&mut self, reason: impl ToString) { + self.state.force_state_to_failed(reason); self.terminate().await; } diff --git a/sled-agent/src/sim/instance.rs b/sled-agent/src/sim/instance.rs index 16832b79110..dff8427429e 100644 --- a/sled-agent/src/sim/instance.rs +++ b/sled-agent/src/sim/instance.rs @@ -304,7 +304,7 @@ impl SimInstanceInner { } } - self.state.apply_propolis_observation(&ObservedPropolisState::new( + self.state.apply_propolis_observation(ObservedPropolisState::new( &self.last_response, )); @@ -389,7 +389,7 @@ impl SimInstanceInner { /// Simulates rude termination by moving the instance to the Failed state /// immediately and clearing the queue of pending state transitions. fn terminate(&mut self) -> SledVmmState { - self.state.force_state_to_failed(); + self.state.force_state_to_failed("forcefully terminated"); self.queue.clear(); self.destroyed = true; self.state.sled_instance_state() From f8b0c1169bb326c38a2d128a14d6aa507010a780 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 17 Apr 2026 14:38:51 -0700 Subject: [PATCH 3/5] remove before flight --- nexus/tests/integration_tests/instances.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index c55af91454e..e2d549c2f81 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -1599,8 +1599,6 @@ async fn test_instance_failed_after_sled_agent_forgets_vmm_can_be_stopped( instance_post(&client, instance_name, InstanceOp::Stop).await; assert_eq!(instance_next.runtime.run_state, InstanceState::Stopped); - panic!("lmao"); - // Now, the Stopped nstance should be deleteable.. expect_instance_delete_ok(client, instance_name).await; } @@ -1658,7 +1656,7 @@ async fn test_instance_failed_by_instance_watcher_can_be_restarted( // Wait for the instance to transition to Failed. instance_wait_for_state(client, instance_id, InstanceState::Failed).await; - panic!("lol"); + // Now, the instance should be deleteable. expect_instance_delete_ok(&client, instance_name).await; } From c5b3d2d68670ba8ec59c1697b3689250ff4b9d7f Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 17 Apr 2026 15:38:38 -0700 Subject: [PATCH 4/5] migration --- nexus/db-model/src/schema_versions.rs | 3 ++- schema/crdb/dbinit.sql | 2 +- schema/crdb/vmm-failure-reason/up1.sql | 2 ++ schema/crdb/vmm-failure-reason/up2.sql | 6 ++++++ schema/crdb/vmm-failure-reason/up3.sql | 5 +++++ 5 files changed, 16 insertions(+), 2 deletions(-) create mode 100644 schema/crdb/vmm-failure-reason/up1.sql create mode 100644 schema/crdb/vmm-failure-reason/up2.sql create mode 100644 schema/crdb/vmm-failure-reason/up3.sql diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 126430bb5c8..2efa3d50d8e 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock}; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: Version = Version::new(250, 0, 0); +pub const SCHEMA_VERSION: Version = Version::new(251, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -28,6 +28,7 @@ pub static KNOWN_VERSIONS: LazyLock> = LazyLock::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(251, "vmm-failure-reason"), KnownVersion::new(250, "inv-svc-enabled-not-online"), KnownVersion::new(249, "fm-support-bundle-request"), KnownVersion::new(248, "cleanup-orphaned-subnet-pool-silo-links"), diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index a535777193e..1e1dda981f8 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -8469,7 +8469,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '250.0.0', NULL) + (TRUE, NOW(), NOW(), '251.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/vmm-failure-reason/up1.sql b/schema/crdb/vmm-failure-reason/up1.sql new file mode 100644 index 00000000000..fa47ef53212 --- /dev/null +++ b/schema/crdb/vmm-failure-reason/up1.sql @@ -0,0 +1,2 @@ +ALTER TABLE omicron.public.vmm + ADD COLUMN IF NOT EXISTS failure_reason TEXT; diff --git a/schema/crdb/vmm-failure-reason/up2.sql b/schema/crdb/vmm-failure-reason/up2.sql new file mode 100644 index 00000000000..93a38e3f734 --- /dev/null +++ b/schema/crdb/vmm-failure-reason/up2.sql @@ -0,0 +1,6 @@ +SET LOCAL disallow_full_table_scans = off; + +UPDATE omicron.public.vmm + SET failure_reason = 'unknown' + WHERE state = 'failed' + AND failure_reason IS NULL; diff --git a/schema/crdb/vmm-failure-reason/up3.sql b/schema/crdb/vmm-failure-reason/up3.sql new file mode 100644 index 00000000000..dd567585d7b --- /dev/null +++ b/schema/crdb/vmm-failure-reason/up3.sql @@ -0,0 +1,5 @@ +ALTER TABLE omicron.public.vmm + ADD CONSTRAINT IF NOT EXISTS failure_reason_iff_failed CHECK ( + (failure_reason IS NOT NULL AND state = 'failed') + OR (failure_reason IS NULL) + ); From 1f089032d1b65dbef7f0234d493567bfb1afb95c Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 17 Apr 2026 15:57:57 -0700 Subject: [PATCH 5/5] oh of course there's this --- sled-agent/src/instance.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 0ba662d1e35..4a67e198a75 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -3029,6 +3029,7 @@ mod tests { state: VmmState::Starting, generation: Generation::new(), time_updated: Default::default(), + failure_reason: None, }, propolis_addr, migration_id: None,