Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 43 additions & 1 deletion internal/scheduling/reservations/failover/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,28 @@ func (c *FailoverReservationController) ReconcilePeriodic(ctx context.Context) (
allHypervisors = append(allHypervisors, hv.Name)
}

// Build a set of VM UUIDs currently active on any known hypervisor.
// This is used as a safeguard against removing failover allocations when the
// VM source (postgres) is missing data but the VM is still alive on a
// hypervisor (e.g. after a postgres data loss / restore). Without this
// cross-check a wiped postgres would cause every failover reservation to be
// emptied and then deleted.
vmsOnHypervisor := make(map[string]string)
for _, hv := range hypervisorList.Items {
for _, inst := range hv.Status.Instances {
if !inst.Active {
continue
}
if _, exists := vmsOnHypervisor[inst.ID]; exists {
// VM appears on multiple hypervisors (transient during live
// migration). Keep the first occurrence; the safeguard only
// needs to know it exists somewhere.
continue
}
vmsOnHypervisor[inst.ID] = hv.Name
}
}

// 2. Get all VMs that might need failover reservations
vms, err := c.VMSource.ListVMsOnHypervisors(ctx, &hypervisorList, c.Config.TrustHypervisorLocation)
if err != nil {
Expand All @@ -289,7 +311,7 @@ func (c *FailoverReservationController) ReconcilePeriodic(ctx context.Context) (
logger.V(1).Info("found failover reservations", "count", len(failoverReservations))

// 3. Remove VMs from reservations if they are no longer valid
failoverReservations, reservationsToUpdate := reconcileRemoveInvalidVMFromReservations(ctx, vms, failoverReservations)
failoverReservations, reservationsToUpdate := reconcileRemoveInvalidVMFromReservations(ctx, vms, vmsOnHypervisor, failoverReservations)

for _, res := range reservationsToUpdate {
if err := c.patchReservationStatus(ctx, res); err != nil {
Expand Down Expand Up @@ -381,9 +403,17 @@ func (c *FailoverReservationController) ReconcilePeriodic(ctx context.Context) (
// - The VM has moved to a different host
// Returns the updated list of reservations (with modifications applied in-memory).
// The caller is responsible for persisting any changes to the cluster.
//
// vmsOnHypervisor maps VM UUID -> hypervisor name for every VM currently active
// on any known hypervisor (sourced from hv1.HypervisorList.Status.Instances).
// It is used as a safeguard: if a VM is missing from the postgres-derived vms
// slice but is still present on a hypervisor, we keep the allocation. This
// protects failover reservations from being wiped by transient/total postgres
// data loss.
func reconcileRemoveInvalidVMFromReservations(
ctx context.Context,
vms []VM,
vmsOnHypervisor map[string]string,
failoverReservations []v1alpha1.Reservation,
) (updatedReservations []v1alpha1.Reservation, reservationsToUpdate []*v1alpha1.Reservation) {

Expand All @@ -404,6 +434,18 @@ func reconcileRemoveInvalidVMFromReservations(
for vmUUID, allocatedHypervisor := range allocations {
vmCurrentHypervisor, vmExists := vmToHypervisor[vmUUID]
if !vmExists {
// Safeguard: if the VM is missing from the VM source (e.g.
// postgres) but is still reported active on a hypervisor by
// the hypervisor operator CRD, keep the allocation. This
// prevents a postgres data loss from cascading into a mass
// deletion of failover reservations.
if hv, stillOnHV := vmsOnHypervisor[vmUUID]; stillOnHV {
logger.Info("keeping VM allocation despite missing from VM source: still active on hypervisor",
"vmUUID", vmUUID, "reservation", res.Name,
"allocatedHypervisor", allocatedHypervisor, "hypervisor", hv)
updatedAllocations[vmUUID] = allocatedHypervisor
continue
}
logger.Info("removing VM from reservation allocations because VM no longer exists",
"vmUUID", vmUUID, "reservation", res.Name)
needsUpdate = true
Expand Down
109 changes: 105 additions & 4 deletions internal/scheduling/reservations/failover/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -577,6 +577,7 @@ func TestReconcileRemoveInvalidVMFromReservations(t *testing.T) {
tests := []struct {
name string
vms []VM
vmsOnHypervisor map[string]string
reservations []v1alpha1.Reservation
expectedUpdatedCount int // number of reservations in updatedReservations
expectedToUpdateCount int // number of reservations that need cluster update
Expand Down Expand Up @@ -604,7 +605,7 @@ func TestReconcileRemoveInvalidVMFromReservations(t *testing.T) {
name: "VM no longer exists - remove from allocations",
vms: []VM{
newTestVM("vm-1", "host1", "flavor1"),
// vm-2 no longer exists
// vm-2 no longer exists (and not on any hypervisor)
},
reservations: []v1alpha1.Reservation{
newTestReservation("res-1", "host3", map[string]string{
Expand Down Expand Up @@ -641,7 +642,7 @@ func TestReconcileRemoveInvalidVMFromReservations(t *testing.T) {
vms: []VM{
newTestVM("vm-1", "host1", "flavor1"),
newTestVM("vm-2", "host2", "flavor1"),
// vm-3 no longer exists
// vm-3 no longer exists (and not on any hypervisor)
},
reservations: []v1alpha1.Reservation{
newTestReservation("res-1", "host3", map[string]string{
Expand All @@ -662,7 +663,7 @@ func TestReconcileRemoveInvalidVMFromReservations(t *testing.T) {
{
name: "all VMs removed from reservation - empty allocations",
vms: []VM{
// no VMs exist
// no VMs exist (and not on any hypervisor)
},
reservations: []v1alpha1.Reservation{
newTestReservation("res-1", "host3", map[string]string{
Expand Down Expand Up @@ -705,7 +706,7 @@ func TestReconcileRemoveInvalidVMFromReservations(t *testing.T) {
vms: []VM{
newTestVM("vm-1", "host1", "flavor1"), // valid
newTestVM("vm-2", "host5", "flavor1"), // moved from host2 to host5
// vm-3 deleted
// vm-3 deleted (and not on any hypervisor)
newTestVM("vm-4", "host4", "flavor1"), // valid
},
reservations: []v1alpha1.Reservation{
Expand All @@ -725,6 +726,105 @@ func TestReconcileRemoveInvalidVMFromReservations(t *testing.T) {
"res-2": {"vm-4": "host4"},
},
},
// ====================================================================
// Safeguard: VM missing from VM source but still on a hypervisor
// (e.g. postgres data loss). Allocations must be preserved.
// ====================================================================
{
name: "safeguard: VM missing from VM source but still on hypervisor - keep allocation",
vms: []VM{
newTestVM("vm-1", "host1", "flavor1"),
// vm-2 missing from VM source (postgres lost data)
},
vmsOnHypervisor: map[string]string{
"vm-1": "host1",
"vm-2": "host2", // still alive on hypervisor
},
reservations: []v1alpha1.Reservation{
newTestReservation("res-1", "host3", map[string]string{
"vm-1": "host1",
"vm-2": "host2",
}),
},
expectedUpdatedCount: 1,
expectedToUpdateCount: 0, // safeguard prevents the update
expectedAllocationsPerRes: map[string]map[string]string{
"res-1": {"vm-1": "host1", "vm-2": "host2"},
},
},
{
name: "safeguard: VM source completely empty but VMs still on hypervisors - keep all allocations",
vms: []VM{
// VM source returns nothing (postgres wiped)
},
vmsOnHypervisor: map[string]string{
"vm-1": "host1",
"vm-2": "host2",
"vm-3": "host3",
},
reservations: []v1alpha1.Reservation{
newTestReservation("res-1", "host4", map[string]string{
"vm-1": "host1",
"vm-2": "host2",
}),
newTestReservation("res-2", "host5", map[string]string{
"vm-3": "host3",
}),
},
expectedUpdatedCount: 2,
expectedToUpdateCount: 0, // safeguard prevents both updates
expectedAllocationsPerRes: map[string]map[string]string{
"res-1": {"vm-1": "host1", "vm-2": "host2"},
"res-2": {"vm-3": "host3"},
},
},
{
name: "safeguard: only some missing VMs are still on hypervisor - remove only fully gone ones",
vms: []VM{
newTestVM("vm-1", "host1", "flavor1"),
// vm-2 missing from postgres but on hypervisor
// vm-3 missing from both
},
vmsOnHypervisor: map[string]string{
"vm-1": "host1",
"vm-2": "host2",
// vm-3 not on any hypervisor
},
reservations: []v1alpha1.Reservation{
newTestReservation("res-1", "host4", map[string]string{
"vm-1": "host1",
"vm-2": "host2", // safeguarded
"vm-3": "host3", // truly gone - remove
}),
},
expectedUpdatedCount: 1,
expectedToUpdateCount: 1,
expectedAllocationsPerRes: map[string]map[string]string{
"res-1": {"vm-1": "host1", "vm-2": "host2"},
},
},
{
name: "safeguard does not protect against hypervisor mismatch (VM is in postgres on different host)",
vms: []VM{
newTestVM("vm-1", "host1", "flavor1"),
newTestVM("vm-2", "host5", "flavor1"), // moved to host5 in postgres
},
vmsOnHypervisor: map[string]string{
"vm-1": "host1",
"vm-2": "host5",
},
reservations: []v1alpha1.Reservation{
newTestReservation("res-1", "host3", map[string]string{
"vm-1": "host1",
"vm-2": "host2", // host mismatch - should still be removed
}),
},
expectedUpdatedCount: 1,
expectedToUpdateCount: 1,
expectedAllocationsPerRes: map[string]map[string]string{
"res-1": {"vm-1": "host1"},
},
},
}

for _, tt := range tests {
Expand All @@ -733,6 +833,7 @@ func TestReconcileRemoveInvalidVMFromReservations(t *testing.T) {
updatedReservations, reservationsToUpdate := reconcileRemoveInvalidVMFromReservations(
ctx,
tt.vms,
tt.vmsOnHypervisor,
tt.reservations,
)

Expand Down