Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion ocaml/xapi/xapi_vm_lifecycle.ml
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ let check_op_for_feature ~__context ~vmr:_ ~vmmr ~vmgmr ~op ~ref ~strict =
let some_err e = Some (e, [Ref.string_of ref]) in
let lack_feature feature = not (has_feature ~vmgmr ~feature) in
match op with
| (`suspend | `checkpoint | `pool_migrate | `migrate_send) when is_live -> (
| (`pool_migrate | `migrate_send) when is_live -> (

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I understand the split but don't we have the same problem of false negatives here?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually the migration will do the suspend also.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With this, migrations are still blocked in xapi, while suspend is allowed.

There's already an assert_can_save in VM_migrate in xenopsd, should it not be moved together with suspend and checkpoint here?

match get_feature ~vmgmr ~feature:"data-cant-suspend-reason" with
| Some reason ->
Some (Api_errors.vm_non_suspendable, [Ref.string_of ref; reason])
Expand All @@ -215,6 +215,18 @@ let check_op_for_feature ~__context ~vmr:_ ~vmmr ~vmgmr ~op ~ref ~strict =
| None ->
None
)
| (`suspend | `checkpoint) when is_live ->
(* Cannot gate suspend/checkpoint on the cached data-cant-suspend-reason:
that xenstore key is set by the QMP event thread whenever a QMP event
arrives and Query_migratable returns an error (e.g. NVMe has transient
in-flight I/O). By the time the actual suspend executes, the device may
be idle. xenopsd performs a fresh Query_migratable check via
assert_can_save before committing to the save sequence, which is the
authoritative check. *)
Comment on lines +219 to +225

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems to me that originally the xapi uses the data-cant-suspend-reason in guest metrics to gate for suspend, checkpoint, pool_migrate and migrate_send. Meanwhile, the migration (both pool_migrate and migrate_send) implementation in xenopsd uses B.VM.assert_can_save to gate again.

This change works only for the value in xenstore being cleaned up between xapi gate and xenopsd gate? If this is true, I would think a retry on client side is fine.

if (not implicit_support) && strict && lack_feature "feature-suspend" then
some_err Api_errors.vm_lacks_feature
else
None
| _ when implicit_support ->
None
| `clean_shutdown
Expand Down
7 changes: 7 additions & 0 deletions ocaml/xenopsd/lib/xenops_server.ml
Original file line number Diff line number Diff line change
Expand Up @@ -2307,6 +2307,13 @@ and perform_exn ?result (op : operation) (t : Xenops_task.task_handle) : unit =
VM_DB.signal id
| VM_suspend (id, _data) ->
debug "VM.suspend %s" id ;
(* Do a fresh Query_migratable before committing to the suspend sequence.
The stale data/cant_suspend_reason in xenstore (written by the QMP event
thread on transient NVMe in-flight I/O) may have already been cleared by
the time we reach here; this authoritative check ensures we only fail if
the device is genuinely non-migratable at suspend time. *)
let module B = (val get_backend () : S) in
B.VM.assert_can_save (VM_DB.read_exn id) ;
perform_atomics (atomics_of_operation op) t ;
VM_DB.signal id
| VM_restore_vifs id ->
Expand Down
4 changes: 4 additions & 0 deletions ocaml/xenopsd/lib/xenops_server_simulator.ml
Original file line number Diff line number Diff line change
Expand Up @@ -598,12 +598,16 @@ module VM = struct

let wait_shutdown _ _vm _reason _timeout = true

let assert_can_save _vm = ()

let save _ _cb vm flags data vgpu_data _pre_suspend_callback =
with_lock m (save_nolock vm flags data vgpu_data)

let restore _ _cb vm vbds vifs data vgpu_data extras =
with_lock m (restore_nolock vm vbds vifs data vgpu_data extras)

let resume _ _vm = ()

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do you add this?


let s3suspend _ _vm = ()

let s3resume _ _vm = ()
Expand Down
Loading