From b14d5a8d8cce25e5933b414ca4190dd98dfcebb2 Mon Sep 17 00:00:00 2001 From: Daniil Antoshin Date: Fri, 24 Apr 2026 09:44:59 +0200 Subject: [PATCH 1/4] fix(vmop): restore pending migration reason Signed-off-by: Daniil Antoshin --- .../migration/internal/handler/lifecycle.go | 13 ++++++--- .../internal/handler/lifecycle_test.go | 27 +++++++++++++++++-- 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/images/virtualization-artifact/pkg/controller/vmop/migration/internal/handler/lifecycle.go b/images/virtualization-artifact/pkg/controller/vmop/migration/internal/handler/lifecycle.go index aeaa944471..b03ddf30c6 100644 --- a/images/virtualization-artifact/pkg/controller/vmop/migration/internal/handler/lifecycle.go +++ b/images/virtualization-artifact/pkg/controller/vmop/migration/internal/handler/lifecycle.go @@ -47,6 +47,7 @@ import ( const lifecycleHandlerName = "LifecycleHandler" const ( + progressMigrationPending int32 = 0 progressDisksPreparing int32 = 1 progressTargetScheduling int32 = 2 progressTargetPreparing int32 = 3 @@ -56,6 +57,7 @@ const ( ) const ( + messageMigrationPending = "The VirtualMachineOperation for migrating the virtual machine has been queued. Waiting for the queue to be processed and for this operation to be executed." 
messageSyncingSourceAndTarget = "Syncing source and target" messageTargetPodScheduling = "Target pod is being scheduled" messageTargetPodPreparing = "Target pod is being prepared" @@ -339,7 +341,7 @@ func (h LifecycleHandler) syncOperationComplete(ctx context.Context, vmop *v1alp } vmop.Status.Phase = v1alpha2.VMOPPhaseInProgress - if reason == vmopcondition.ReasonTargetScheduling { + if reason == vmopcondition.ReasonMigrationPending || reason == vmopcondition.ReasonTargetScheduling { vmop.Status.Phase = v1alpha2.VMOPPhasePending } progress := h.calculateMigrationProgress(vmop, mig, reason) @@ -456,7 +458,7 @@ func (h LifecycleHandler) execute(ctx context.Context, vmop *v1alpha2.VirtualMac } vmop.Status.Phase = v1alpha2.VMOPPhaseInProgress - if reason == vmopcondition.ReasonTargetScheduling { + if reason == vmopcondition.ReasonMigrationPending || reason == vmopcondition.ReasonTargetScheduling { vmop.Status.Phase = v1alpha2.VMOPPhasePending } progress := h.calculateMigrationProgress(vmop, mig, reason) @@ -573,7 +575,10 @@ func (h LifecycleHandler) getInProgressReasonAndMessage( message := messageSyncingSourceAndTarget switch mig.Status.Phase { - case virtv1.MigrationPhaseUnset, virtv1.MigrationPending, virtv1.MigrationScheduling: + case virtv1.MigrationPhaseUnset, virtv1.MigrationPending: + reason = vmopcondition.ReasonMigrationPending + message = messageMigrationPending + case virtv1.MigrationScheduling: reason = vmopcondition.ReasonTargetScheduling message = messageTargetPodScheduling case virtv1.MigrationScheduled, virtv1.MigrationPreparingTarget: @@ -616,6 +621,8 @@ func (h LifecycleHandler) calculateMigrationProgress( reason vmopcondition.ReasonCompleted, ) int32 { switch reason { + case vmopcondition.ReasonMigrationPending: + return progressMigrationPending case vmopcondition.ReasonDisksPreparing: return progressDisksPreparing case vmopcondition.ReasonTargetScheduling: diff --git 
a/images/virtualization-artifact/pkg/controller/vmop/migration/internal/handler/lifecycle_test.go b/images/virtualization-artifact/pkg/controller/vmop/migration/internal/handler/lifecycle_test.go index 827f87ea1d..30c68987d9 100644 --- a/images/virtualization-artifact/pkg/controller/vmop/migration/internal/handler/lifecycle_test.go +++ b/images/virtualization-artifact/pkg/controller/vmop/migration/internal/handler/lifecycle_test.go @@ -391,11 +391,11 @@ var _ = Describe("LifecycleHandler", func() { Expect(err).NotTo(HaveOccurred()) Expect(reason).To(Equal(expectedReason)) }, - Entry("phase unset means target scheduling", + Entry("phase unset means migration pending", virtv1.MigrationPhaseUnset, nil, nil, - vmopcondition.ReasonTargetScheduling, + vmopcondition.ReasonMigrationPending, ), Entry("scheduled means target preparing", virtv1.MigrationScheduled, @@ -453,6 +453,7 @@ var _ = Describe("LifecycleHandler", func() { Expect(h.calculateMigrationProgress(vmop, mig, reason)).To(Equal(expected)) }, + Entry("migration pending", vmopcondition.ReasonMigrationPending, nil, int32(0)), Entry("disks preparing", vmopcondition.ReasonDisksPreparing, nil, int32(1)), Entry("target scheduling", vmopcondition.ReasonTargetScheduling, nil, int32(2)), Entry("target unschedulable", vmopcondition.ReasonTargetUnschedulable, nil, int32(2)), @@ -513,6 +514,28 @@ var _ = Describe("LifecycleHandler", func() { Expect(srv.Changed().Status.Progress).To(Equal("2%")) }) + It("should set migration pending reason and zero progress before scheduling starts", func() { + vm := newVM(v1alpha2.PreferSafeMigrationPolicy) + vmop := newVMOPMigrate() + vmop.Status.Phase = v1alpha2.VMOPPhaseInProgress + + mig := newSimpleMigration(fmt.Sprintf("vmop-%s", vmop.Name), name) + mig.Status.Phase = virtv1.MigrationPending + + fakeClient, srv = setupEnvironment(vmop, vm, mig) + migrationService := service.NewMigrationService(fakeClient, featuregates.Default()) + base := genericservice.NewBaseVMOPService(fakeClient, 
recorderMock) + h := NewLifecycleHandler(fakeClient, migrationService, base, recorderMock) + + _, err := h.Handle(ctx, srv.Changed()) + Expect(err).NotTo(HaveOccurred()) + Expect(srv.Changed().Status.Phase).To(Equal(v1alpha2.VMOPPhasePending)) + Expect(srv.Changed().Status.Progress).To(Equal("0%")) + completed, found := conditions.GetCondition(vmopcondition.TypeCompleted, srv.Changed().Status.Conditions) + Expect(found).To(BeTrue()) + Expect(completed.Reason).To(Equal(vmopcondition.ReasonMigrationPending.String())) + }) + It("should set aborted reason and preserve progress for failed migration", func() { vm := newVM(v1alpha2.PreferSafeMigrationPolicy) vmop := newVMOPMigrate() From 3dccaf8ab531cb4f791e2cfc50d28c6ecda966bc Mon Sep 17 00:00:00 2001 From: Daniil Antoshin Date: Fri, 24 Apr 2026 14:19:30 +0200 Subject: [PATCH 2/4] fix(vmop): treat target scheduling as in progress Signed-off-by: Daniil Antoshin --- .../pkg/controller/vm/internal/migrating.go | 5 ++++- .../controller/vm/internal/migrating_test.go | 19 +++++++++++++++++++ .../migration/internal/handler/lifecycle.go | 4 ++-- .../internal/handler/lifecycle_test.go | 6 +++--- 4 files changed, 28 insertions(+), 6 deletions(-) diff --git a/images/virtualization-artifact/pkg/controller/vm/internal/migrating.go b/images/virtualization-artifact/pkg/controller/vm/internal/migrating.go index 5346852c77..7ea604eb01 100644 --- a/images/virtualization-artifact/pkg/controller/vm/internal/migrating.go +++ b/images/virtualization-artifact/pkg/controller/vm/internal/migrating.go @@ -162,9 +162,12 @@ func (h *MigratingHandler) syncMigrating(ctx context.Context, s state.VirtualMac completed, _ := conditions.GetCondition(vmopcondition.TypeCompleted, vmop.Status.Conditions) switch completed.Reason { - case vmopcondition.ReasonMigrationPending.String(), vmopcondition.ReasonTargetScheduling.String(): + case vmopcondition.ReasonMigrationPending.String(): cb.Message("Migration is awaiting start.") + case 
vmopcondition.ReasonTargetScheduling.String(): + cb.Message("Migration is in progress: target pod is being scheduled.") + case vmopcondition.ReasonQuotaExceeded.String(): cb.Message(fmt.Sprintf("Migration is pending: %s.", completed.Message)) diff --git a/images/virtualization-artifact/pkg/controller/vm/internal/migrating_test.go b/images/virtualization-artifact/pkg/controller/vm/internal/migrating_test.go index ecf10128e0..6eef76e17f 100644 --- a/images/virtualization-artifact/pkg/controller/vm/internal/migrating_test.go +++ b/images/virtualization-artifact/pkg/controller/vm/internal/migrating_test.go @@ -180,6 +180,25 @@ var _ = Describe("MigratingHandler", func() { Expect(cond.Message).To(Equal("Migration is awaiting start.")) }) + It("Should set active progress message when vmop is in progress with target scheduling reason", func() { + vm := newVM() + kvvmi := newKVVMI(nil) + vmop := newVMOP(v1alpha2.VMOPPhaseInProgress, vmopcondition.ReasonTargetScheduling.String(), true) + fakeClient, resource, vmState = setupEnvironment(vm, kvvmi, vmop) + + reconcile() + + newVM := &v1alpha2.VirtualMachine{} + err := fakeClient.Get(ctx, client.ObjectKeyFromObject(vm), newVM) + Expect(err).NotTo(HaveOccurred()) + + cond, exists := conditions.GetCondition(vmcondition.TypeMigrating, newVM.Status.Conditions) + Expect(exists).To(BeTrue()) + Expect(cond.Status).To(Equal(metav1.ConditionFalse)) + Expect(cond.Reason).To(Equal(vmcondition.ReasonMigratingPending.String())) + Expect(cond.Message).To(Equal("Migration is in progress: target pod is being scheduled.")) + }) + It("Should set active progress message when vmop is in progress with target preparing reason", func() { vm := newVM() kvvmi := newKVVMI(nil) diff --git a/images/virtualization-artifact/pkg/controller/vmop/migration/internal/handler/lifecycle.go b/images/virtualization-artifact/pkg/controller/vmop/migration/internal/handler/lifecycle.go index b03ddf30c6..9bb21ce859 100644 --- 
a/images/virtualization-artifact/pkg/controller/vmop/migration/internal/handler/lifecycle.go +++ b/images/virtualization-artifact/pkg/controller/vmop/migration/internal/handler/lifecycle.go @@ -341,7 +341,7 @@ func (h LifecycleHandler) syncOperationComplete(ctx context.Context, vmop *v1alp } vmop.Status.Phase = v1alpha2.VMOPPhaseInProgress - if reason == vmopcondition.ReasonMigrationPending || reason == vmopcondition.ReasonTargetScheduling { + if reason == vmopcondition.ReasonMigrationPending { vmop.Status.Phase = v1alpha2.VMOPPhasePending } progress := h.calculateMigrationProgress(vmop, mig, reason) @@ -458,7 +458,7 @@ func (h LifecycleHandler) execute(ctx context.Context, vmop *v1alpha2.VirtualMac } vmop.Status.Phase = v1alpha2.VMOPPhaseInProgress - if reason == vmopcondition.ReasonMigrationPending || reason == vmopcondition.ReasonTargetScheduling { + if reason == vmopcondition.ReasonMigrationPending { vmop.Status.Phase = v1alpha2.VMOPPhasePending } progress := h.calculateMigrationProgress(vmop, mig, reason) diff --git a/images/virtualization-artifact/pkg/controller/vmop/migration/internal/handler/lifecycle_test.go b/images/virtualization-artifact/pkg/controller/vmop/migration/internal/handler/lifecycle_test.go index 30c68987d9..e821efab49 100644 --- a/images/virtualization-artifact/pkg/controller/vmop/migration/internal/handler/lifecycle_test.go +++ b/images/virtualization-artifact/pkg/controller/vmop/migration/internal/handler/lifecycle_test.go @@ -210,7 +210,7 @@ var _ = Describe("LifecycleHandler", func() { ), ) - It("should keep migration scheduling pending after migration starts", func() { + It("should keep migration scheduling in progress after migration starts", func() { vm := newVM(v1alpha2.PreferSafeMigrationPolicy) vm.Status.Conditions = []metav1.Condition{{ Type: string(vmcondition.TypeMigrating), @@ -239,7 +239,7 @@ var _ = Describe("LifecycleHandler", func() { _, err := h.Handle(ctx, srv.Changed()) Expect(err).NotTo(HaveOccurred()) - 
Expect(srv.Changed().Status.Phase).To(Equal(v1alpha2.VMOPPhasePending)) + Expect(srv.Changed().Status.Phase).To(Equal(v1alpha2.VMOPPhaseInProgress)) Expect(srv.Changed().Status.Progress).To(Equal("2%")) completed, found := conditions.GetCondition(vmopcondition.TypeCompleted, srv.Changed().Status.Conditions) Expect(found).To(BeTrue()) @@ -510,7 +510,7 @@ var _ = Describe("LifecycleHandler", func() { _, err := h.Handle(ctx, srv.Changed()) Expect(err).NotTo(HaveOccurred()) - Expect(srv.Changed().Status.Phase).To(Equal(v1alpha2.VMOPPhasePending)) + Expect(srv.Changed().Status.Phase).To(Equal(v1alpha2.VMOPPhaseInProgress)) Expect(srv.Changed().Status.Progress).To(Equal("2%")) }) From b2c19a20dc58568fa43710eb0120db4e37149f32 Mon Sep 17 00:00:00 2001 From: Daniil Antoshin Date: Fri, 24 Apr 2026 15:42:14 +0200 Subject: [PATCH 3/4] docs(vmop): describe migration lifecycle status Signed-off-by: Daniil Antoshin --- docs/internal/vmop_migration_lifecycle.md | 237 ++++++++++++++++++++++ 1 file changed, 237 insertions(+) create mode 100644 docs/internal/vmop_migration_lifecycle.md diff --git a/docs/internal/vmop_migration_lifecycle.md b/docs/internal/vmop_migration_lifecycle.md new file mode 100644 index 0000000000..ded860a383 --- /dev/null +++ b/docs/internal/vmop_migration_lifecycle.md @@ -0,0 +1,237 @@ +# VMOP migration lifecycle, reasons, and progress + +This document describes the internal status model used by `VirtualMachineOperation` migration handling. +It is intended for controller maintainers and reviewers. + +## Scope + +The VMOP migration controller exposes migration state through: + +- `status.phase` +- `status.progress` +- `status.conditions[Completed].reason` +- `status.conditions[Completed].message` + +The controller does not mirror KubeVirt migration phases one-to-one. It converts KubeVirt phases, target pod state, migration state, and transfer metrics into user-facing VMOP reasons and progress values. 
+ +## In-progress lifecycle + +| KubeVirt migration state | VMOP Completed reason | VMOP phase | Progress | +|---|---|---|---:| +| `MigrationPhaseUnset` | `MigrationPending` | `Pending` | `0%` | +| `MigrationPending` | `MigrationPending` | `Pending` | `0%` | +| `MigrationScheduling` | `TargetScheduling` | `InProgress` | `2%` | +| target pod is unschedulable | `TargetUnschedulable` | `InProgress` | `2%` | +| `MigrationScheduled` | `TargetPreparing` | `InProgress` | `3%` | +| `MigrationPreparingTarget` | `TargetPreparing` | `InProgress` | `3%` | +| target pod has disk attach or mount errors | `TargetDiskError` | `InProgress` | `3%` | +| `MigrationTargetReady` | `Syncing` | `InProgress` | dynamic, `10..90%` | +| `MigrationWaitingForSync` | `Syncing` | `InProgress` | dynamic, `10..90%` | +| `MigrationSynchronizing` | `Syncing` | `InProgress` | dynamic, `10..90%` | +| `MigrationRunning` | `Syncing` | `InProgress` | dynamic, `10..90%` | +| sync progress stalls at maximum throttle | `NotConverging` | `InProgress` | dynamic, `10..90%` | +| `MigrationState.Completed == true` | `SourceSuspended` | `InProgress` | `91%` | +| `TargetNodeDomainReadyTimestamp != nil` | `TargetResumed` | `InProgress` | `92%` | +| `MigrationSucceeded` | `Completed` | `Completed` | `100%` | + +## Reason semantics + +| Reason | Meaning | +|---|---| +| `MigrationPending` | Migration object exists or is about to be processed, but target scheduling has not started yet. | +| `TargetScheduling` | Target pod scheduling has started. The operation is already active, so VMOP phase is `InProgress`. | +| `TargetUnschedulable` | The target pod is pending and has the Kubernetes `PodScheduled=False, Unschedulable` condition. | +| `TargetPreparing` | The target pod has been scheduled and is being prepared. | +| `TargetDiskError` | The target pod is stuck on a disk, volume, CSI, attach, or mount problem. | +| `Syncing` | Source and target are synchronizing migration data. 
| +| `NotConverging` | The sync phase is not making enough progress to converge. | +| `SourceSuspended` | Source VM has been suspended as part of the final migration handoff. | +| `TargetResumed` | Target VM has resumed on the target node. | +| `Completed` | Migration completed successfully. | +| `Aborted` | Migration was aborted. | +| `Failed` | Migration failed for an unspecified reason. | + +## Progress model + +Fixed progress values: + +| Reason | Progress | +|---|---:| +| `MigrationPending` | `0%` | +| `DisksPreparing` | `1%` | +| `TargetScheduling` | `2%` | +| `TargetUnschedulable` | `2%` | +| `TargetPreparing` | `3%` | +| `TargetDiskError` | `3%` | +| `SourceSuspended` | `91%` | +| `TargetResumed` | `92%` | +| `Completed` | `100%` | + +Dynamic progress values: + +| Reason | Progress source | +|---|---| +| `Syncing` | `migration/internal/progress.Progress.SyncProgress` | +| `NotConverging` | `migration/internal/progress.Progress.SyncProgress` | + +Fallback behavior: + +| Reason | Progress behavior | +|---|---| +| `Aborted` | Keep current `vmop.status.progress`. | +| `Failed` | Keep current `vmop.status.progress`. | +| unknown reason | Keep current `vmop.status.progress`. | + +## Sync progress details + +Sync progress is intentionally conservative and uses the `10..90%` range. + +The sync strategy has two stages: + +| Stage | Range | How it is selected | +|---|---:|---| +| bulk stage | `10..45%` | Default sync stage when no iterative migration metrics are available. | +| iterative stage | `45..90%` | Enabled only when `transferStatus.iteration > 0`. 
| + +The controller builds a progress record from: + +- `migrationState.startTimestamp` +- `migrationState.mode` +- `migrationState.transferStatus.iteration` +- `migrationState.transferStatus.autoConvergeThrottle` +- `migrationState.transferStatus.dataTotalBytes` +- `migrationState.transferStatus.dataProcessedBytes` +- `migrationState.transferStatus.dataRemainingBytes` +- `migrationState.migrationConfiguration.allowAutoConverge` +- previous `vmop.status.progress` + +If `transferStatus.iteration` is absent or zero, the migration remains in the bulk stage and progress is capped at `45%` until the lifecycle advances to `SourceSuspended`, `TargetResumed`, or `Completed`. + +This means that many fast or metric-poor migrations may visually stay below `50%` and then jump to `91%`, `92%`, or `100%`. + +## NotConverging detection + +`NotConverging` can appear in two ways. + +### In-progress detection + +During `Syncing`, the controller calls `progressStrategy.IsNotConverging(record)`. + +The strategy returns `true` only when all conditions are met: + +1. There is stored progress state for the VMOP. +2. The migration has entered the iterative stage. +3. Migration is at maximum throttle: + - if auto-converge is disabled, it is treated as maximum throttle; + - if auto-converge is enabled, `transferStatus.autoConvergeThrottle >= 99` is required. +4. The minimum observed remaining data has not improved for at least `10s`. 
+ When this happens, the VMOP stays `InProgress`, but `Completed.reason` becomes `NotConverging` and the message becomes: + +```text +Migration is not converging: data remaining is not decreasing at maximum throttle +``` + +### Terminal failed detection + +When KubeVirt reports `MigrationFailed`, `getFailedReason` also maps the failure to `NotConverging` if `migrationState.failureReason` contains: + +- `converg` +- `progress` + +If the terminal failure is otherwise generic but the previous VMOP `Completed.reason` was `NotConverging`, the controller keeps `NotConverging` as the final reason. + +## Failed migration handling + +When KubeVirt reports `MigrationFailed`, the controller sets: + +- `vmop.status.phase = Failed` +- `conditions[Completed].status = False` +- `conditions[Completed].reason = <classified reason>` +- `status.progress = <reason-dependent progress>` + +Failure reason classification order: + +| Priority | Condition | Final reason | Progress | +|---:|---|---|---:| +| 1 | `migrationState.abortRequested == true` or `abortStatus == MigrationAbortSucceeded` | `Aborted` | keep current | +| 2 | `migrationState.failureReason` contains `converg` or `progress` | `NotConverging` | dynamic, `10..90%` | +| 3 | failed condition reason or message contains `schedul` or `unschedul` | `TargetUnschedulable` | `2%` | +| 4 | failed condition reason or message contains `csi`, `attach`, `volume`, or `disk` | `TargetDiskError` | `3%` | +| 5 | no specific match | `Failed` | keep current | + +Failure message base by reason: + +| Reason | Message base | +|---|---| +| `Aborted` | `Migration aborted` | +| `NotConverging` | `Migration did not converge` | +| `TargetUnschedulable` | `Migration failed: target pod is unschedulable` | +| `TargetDiskError` | `Migration failed: target disk attach error` | +| `Failed` | `Migration failed` | + +The controller appends additional details to the base message from: + +1. `migrationState.failureReason`, if present; +2.
otherwise, the KubeVirt `VirtualMachineInstanceMigrationFailed` condition message, if present. + +## Target pod diagnostics + +While migration is in progress, target pod diagnostics can override the phase-based reason. + +### Unschedulable target pod + +If the target pod is pending and has: + +```text +PodScheduled=False, Reason=Unschedulable +``` + +then VMOP uses: + +- reason: `TargetUnschedulable` +- progress: `2%` +- message: `Target pod "<namespace>/<name>" is unschedulable` + +### Disk attach or mount error + +If the target pod is in container creation and warning events include: + +- `FailedAttachVolume` +- `FailedMount` + +then VMOP uses: + +- reason: `TargetDiskError` +- progress: `3%` +- message: `Target pod has disk attach error: <event reason>: <event message>` + +## VM condition projection + +The VM controller projects VMOP migration reasons to the VM `Migrating` condition. + +Important messages: + +| VMOP reason | VM message | +|---|---| +| `MigrationPending` | `Migration is awaiting start.` | +| `TargetScheduling` | `Migration is in progress: target pod is being scheduled.` | +| `MigrationPrepareTarget`, `TargetPreparing`, `DisksPreparing` | `Migration is in progress: target pod is being scheduled and prepared.` | +| `MigrationTargetReady`, `Syncing`, `SourceSuspended`, `TargetResumed` | `Migration is in progress: source and target are being synchronized.` | + +## Practical examples + +| Scenario | VMOP phase | Reason | Progress | +|---|---|---|---:| +| Migration object exists but KubeVirt has not started scheduling | `Pending` | `MigrationPending` | `0%` | +| Target pod scheduling starts | `InProgress` | `TargetScheduling` | `2%` | +| Target pod cannot be scheduled | `InProgress` or `Failed` | `TargetUnschedulable` | `2%` | +| Target pod has volume attach errors | `InProgress` or `Failed` | `TargetDiskError` | `3%` | +| Syncing without iteration metrics | `InProgress` | `Syncing` | `10..45%` | +| Syncing with iteration metrics | `InProgress` | `Syncing` | `45..90%` | +| Sync stalls at maximum
throttle | `InProgress` or `Failed` | `NotConverging` | dynamic | +| Migration is aborted | `Failed` | `Aborted` | keep current | +| Unknown failure | `Failed` | `Failed` | keep current | +| Source suspended during final handoff | `InProgress` | `SourceSuspended` | `91%` | +| Target resumed | `InProgress` | `TargetResumed` | `92%` | +| Migration succeeds | `Completed` | `Completed` | `100%` | From 657c831e85cbe3d15ec2db17ec8829996dd25e0d Mon Sep 17 00:00:00 2001 From: Daniil Antoshin Date: Mon, 27 Apr 2026 16:32:37 +0200 Subject: [PATCH 4/4] docs(docs): add russian vmop migration lifecycle Signed-off-by: Daniil Antoshin --- docs/internal/vmop_migration_lifecycle.ru.md | 237 +++++++++++++++++++ 1 file changed, 237 insertions(+) create mode 100644 docs/internal/vmop_migration_lifecycle.ru.md diff --git a/docs/internal/vmop_migration_lifecycle.ru.md b/docs/internal/vmop_migration_lifecycle.ru.md new file mode 100644 index 0000000000..0e3e7b0885 --- /dev/null +++ b/docs/internal/vmop_migration_lifecycle.ru.md @@ -0,0 +1,237 @@ +# Жизненный цикл, причины и прогресс миграции VMOP + +Этот документ описывает внутреннюю модель статусов, используемую при обработке миграции `VirtualMachineOperation`. +Документ предназначен для сопровождающих контроллеров и ревьюеров. + +## Область применения + +Контроллер миграции VMOP показывает состояние миграции через: + +- `status.phase` +- `status.progress` +- `status.conditions[Completed].reason` +- `status.conditions[Completed].message` + +Контроллер не копирует фазы миграции KubeVirt один к одному. Он преобразует фазы KubeVirt, состояние целевого pod, состояние миграции и метрики передачи в пользовательские причины VMOP и значения прогресса. 
+ +## Жизненный цикл выполнения + +| Состояние миграции KubeVirt | Причина VMOP Completed | Фаза VMOP | Прогресс | +|---|---|---|---:| +| `MigrationPhaseUnset` | `MigrationPending` | `Pending` | `0%` | +| `MigrationPending` | `MigrationPending` | `Pending` | `0%` | +| `MigrationScheduling` | `TargetScheduling` | `InProgress` | `2%` | +| целевой pod не может быть запланирован | `TargetUnschedulable` | `InProgress` | `2%` | +| `MigrationScheduled` | `TargetPreparing` | `InProgress` | `3%` | +| `MigrationPreparingTarget` | `TargetPreparing` | `InProgress` | `3%` | +| у целевого pod есть ошибки подключения или монтирования дисков | `TargetDiskError` | `InProgress` | `3%` | +| `MigrationTargetReady` | `Syncing` | `InProgress` | динамический, `10..90%` | +| `MigrationWaitingForSync` | `Syncing` | `InProgress` | динамический, `10..90%` | +| `MigrationSynchronizing` | `Syncing` | `InProgress` | динамический, `10..90%` | +| `MigrationRunning` | `Syncing` | `InProgress` | динамический, `10..90%` | +| прогресс синхронизации останавливается при максимальном throttling | `NotConverging` | `InProgress` | динамический, `10..90%` | +| `MigrationState.Completed == true` | `SourceSuspended` | `InProgress` | `91%` | +| `TargetNodeDomainReadyTimestamp != nil` | `TargetResumed` | `InProgress` | `92%` | +| `MigrationSucceeded` | `Completed` | `Completed` | `100%` | + +## Семантика причин + +| Причина | Значение | +|---|---| +| `MigrationPending` | Объект миграции существует или скоро будет обработан, но планирование целевого pod еще не началось. | +| `TargetScheduling` | Планирование целевого pod началось. Операция уже активна, поэтому фаза VMOP — `InProgress`. | +| `TargetUnschedulable` | Целевой pod находится в состоянии pending и имеет Kubernetes-условие `PodScheduled=False, Unschedulable`. | +| `TargetPreparing` | Целевой pod был запланирован и подготавливается. | +| `TargetDiskError` | Целевой pod заблокирован из-за проблемы с диском, volume, CSI, подключением или монтированием. 
| +| `Syncing` | Источник и цель синхронизируют данные миграции. | +| `NotConverging` | Фаза синхронизации не показывает достаточного прогресса для сходимости. | +| `SourceSuspended` | Исходная VM была приостановлена в рамках финальной передачи миграции. | +| `TargetResumed` | Целевая VM возобновила работу на целевом узле. | +| `Completed` | Миграция успешно завершена. | +| `Aborted` | Миграция была прервана. | +| `Failed` | Миграция завершилась ошибкой без уточненной причины. | + +## Модель прогресса + +Фиксированные значения прогресса: + +| Причина | Прогресс | +|---|---:| +| `MigrationPending` | `0%` | +| `DisksPreparing` | `1%` | +| `TargetScheduling` | `2%` | +| `TargetUnschedulable` | `2%` | +| `TargetPreparing` | `3%` | +| `TargetDiskError` | `3%` | +| `SourceSuspended` | `91%` | +| `TargetResumed` | `92%` | +| `Completed` | `100%` | + +Динамические значения прогресса: + +| Причина | Источник прогресса | +|---|---| +| `Syncing` | `migration/internal/progress.Progress.SyncProgress` | +| `NotConverging` | `migration/internal/progress.Progress.SyncProgress` | + +Поведение при fallback: + +| Причина | Поведение прогресса | +|---|---| +| `Aborted` | Сохранить текущее значение `vmop.status.progress`. | +| `Failed` | Сохранить текущее значение `vmop.status.progress`. | +| неизвестная причина | Сохранить текущее значение `vmop.status.progress`. | + +## Детали прогресса синхронизации + +Прогресс синхронизации намеренно консервативен и использует диапазон `10..90%`. + +Стратегия синхронизации состоит из двух стадий: + +| Стадия | Диапазон | Как выбирается | +|---|---:|---| +| bulk-стадия | `10..45%` | Стадия синхронизации по умолчанию, когда метрики итеративной миграции недоступны. | +| итеративная стадия | `45..90%` | Включается только при `transferStatus.iteration > 0`. 
| + +Контроллер строит запись прогресса из: + +- `migrationState.startTimestamp` +- `migrationState.mode` +- `migrationState.transferStatus.iteration` +- `migrationState.transferStatus.autoConvergeThrottle` +- `migrationState.transferStatus.dataTotalBytes` +- `migrationState.transferStatus.dataProcessedBytes` +- `migrationState.transferStatus.dataRemainingBytes` +- `migrationState.migrationConfiguration.allowAutoConverge` +- предыдущего значения `vmop.status.progress` + +Если `transferStatus.iteration` отсутствует или равен нулю, миграция остается в bulk-стадии, а прогресс ограничивается `45%`, пока жизненный цикл не перейдет к `SourceSuspended`, `TargetResumed` или `Completed`. + +Это означает, что многие быстрые миграции или миграции с неполными метриками могут визуально оставаться ниже `50%`, а затем перейти сразу к `91%`, `92%` или `100%`. + +## Обнаружение NotConverging + +`NotConverging` может появиться двумя способами. + +### Обнаружение во время выполнения + +Во время `Syncing` контроллер вызывает `progressStrategy.IsNotConverging(record)`. + +Стратегия возвращает `true` только при выполнении всех условий: + +1. Для VMOP есть сохраненное состояние прогресса. +2. Миграция вошла в итеративную стадию. +3. Миграция находится на максимальном throttling: + - если auto-converge отключен, это считается максимальным throttling; + - если auto-converge включен, требуется `transferStatus.autoConvergeThrottle >= 99`. +4. Минимальное наблюдаемое количество оставшихся данных не улучшалось как минимум `10s`. 
+ +Когда это происходит, VMOP остается в фазе `InProgress`, но `Completed.reason` становится `NotConverging`, а сообщение становится: + +```text +Migration is not converging: data remaining is not decreasing at maximum throttle +``` + +### Обнаружение терминальной ошибки + +Когда KubeVirt сообщает `MigrationFailed`, `getFailedReason` также сопоставляет ошибку с `NotConverging`, если `migrationState.failureReason` содержит: + +- `converg` +- `progress` + +Если терминальная ошибка в остальном общая, но предыдущий `Completed.reason` у VMOP был `NotConverging`, контроллер сохраняет `NotConverging` как финальную причину. + +## Обработка неуспешной миграции + +Когда KubeVirt сообщает `MigrationFailed`, контроллер устанавливает: + +- `vmop.status.phase = Failed` +- `conditions[Completed].status = False` +- `conditions[Completed].reason = <классифицированная причина>` +- `status.progress = <прогресс, зависящий от причины>` + +Порядок классификации причины ошибки: + +| Приоритет | Условие | Финальная причина | Прогресс | +|---:|---|---|---:| +| 1 | `migrationState.abortRequested == true` или `abortStatus == MigrationAbortSucceeded` | `Aborted` | сохранить текущий | +| 2 | `migrationState.failureReason` содержит `converg` или `progress` | `NotConverging` | динамический, `10..90%` | +| 3 | причина или сообщение failed-условия содержит `schedul` или `unschedul` | `TargetUnschedulable` | `2%` | +| 4 | причина или сообщение failed-условия содержит `csi`, `attach`, `volume` или `disk` | `TargetDiskError` | `3%` | +| 5 | нет специфичного совпадения | `Failed` | сохранить текущий | + +Базовое сообщение ошибки по причине: + +| Причина | Базовое сообщение | +|---|---| +| `Aborted` | `Migration aborted` | +| `NotConverging` | `Migration did not converge` | +| `TargetUnschedulable` | `Migration failed: target pod is unschedulable` | +| `TargetDiskError` | `Migration failed: target disk attach error` | +| `Failed` | `Migration failed` | + +Контроллер добавляет к базовому сообщению 
дополнительные детали из: + +1. `migrationState.failureReason`, если он есть; +2. иначе — из сообщения условия KubeVirt `VirtualMachineInstanceMigrationFailed`, если оно есть. + +## Диагностика целевого pod + +Пока миграция выполняется, диагностика целевого pod может переопределить причину, основанную на фазе. + +### Целевой pod не может быть запланирован + +Если целевой pod находится в pending и имеет: + +```text +PodScheduled=False, Reason=Unschedulable +``` + +то VMOP использует: + +- причина: `TargetUnschedulable` +- прогресс: `2%` +- сообщение: `Target pod "<namespace>/<name>" is unschedulable` + +### Ошибка подключения или монтирования диска + +Если целевой pod находится в состоянии создания контейнера, а warning events содержат: + +- `FailedAttachVolume` +- `FailedMount` + +то VMOP использует: + +- причина: `TargetDiskError` +- прогресс: `3%` +- сообщение: `Target pod has disk attach error: <event reason>: <event message>` + +## Проекция условия VM + +Контроллер VM проецирует причины миграции VMOP в условие VM `Migrating`. + +Важные сообщения: + +| Причина VMOP | Сообщение VM | +|---|---| +| `MigrationPending` | `Migration is awaiting start.` | +| `TargetScheduling` | `Migration is in progress: target pod is being scheduled.` | +| `MigrationPrepareTarget`, `TargetPreparing`, `DisksPreparing` | `Migration is in progress: target pod is being scheduled and prepared.` | +| `MigrationTargetReady`, `Syncing`, `SourceSuspended`, `TargetResumed` | `Migration is in progress: source and target are being synchronized.` | + +## Практические примеры + +| Сценарий | Фаза VMOP | Причина | Прогресс | +|---|---|---|---:| +| Объект миграции существует, но KubeVirt еще не начал планирование | `Pending` | `MigrationPending` | `0%` | +| Планирование целевого pod началось | `InProgress` | `TargetScheduling` | `2%` | +| Целевой pod не может быть запланирован | `InProgress` или `Failed` | `TargetUnschedulable` | `2%` | +| У целевого pod есть ошибки подключения volume | `InProgress` или `Failed` | `TargetDiskError` | `3%` | +|
Синхронизация без метрик итераций | `InProgress` | `Syncing` | `10..45%` | +| Синхронизация с метриками итераций | `InProgress` | `Syncing` | `45..90%` | +| Синхронизация останавливается при максимальном throttling | `InProgress` или `Failed` | `NotConverging` | динамический | +| Миграция прервана | `Failed` | `Aborted` | сохранить текущий | +| Неизвестная ошибка | `Failed` | `Failed` | сохранить текущий | +| Источник приостановлен во время финальной передачи | `InProgress` | `SourceSuspended` | `91%` | +| Цель возобновлена | `InProgress` | `TargetResumed` | `92%` | +| Миграция успешно завершена | `Completed` | `Completed` | `100%` |