From 4db7dfca006415d7621f8b92ed9ca1ee87497fca Mon Sep 17 00:00:00 2001 From: thiyyakat Date: Wed, 24 Jun 2026 10:32:19 +0530 Subject: [PATCH 01/19] Add gate to ensure preservation code is only run for machines with preservation state. --- .../provider/machinecontroller/machine.go | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/pkg/util/provider/machinecontroller/machine.go b/pkg/util/provider/machinecontroller/machine.go index fc05c38c76..19fa87de13 100644 --- a/pkg/util/provider/machinecontroller/machine.go +++ b/pkg/util/provider/machinecontroller/machine.go @@ -746,6 +746,11 @@ func (c *controller) isCreationProcessing(machine *v1alpha1.Machine) bool { // manageMachinePreservation manages machine preservation based on the preserve annotation values on the node and machine objects. func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1alpha1.Machine) (retry machineutils.RetryPeriod, err error) { + var preservationBound bool + preservationBound, err = c.isMachinePreservationBound(ctx, machine) + if !preservationBound { + return + } machineAnnotationsSynced := false clone := machine.DeepCopy() defer func() { @@ -841,3 +846,32 @@ func getEffectivePreservationAnnotations(nodeAnnotationValue string, machineAnno clonedMachineAnnotations[machineutils.LastAppliedNodePreserveValueAnnotationKey] = nodeAnnotationValue return nodeAnnotationValue, clonedMachineAnnotations } + +func (c *controller) isMachinePreservationBound(ctx context.Context, machine *v1alpha1.Machine) (bool, error) { + var err error + var machineAnnotated, nodeAnnotated bool + var laNodeAV string + var node *corev1.Node + + if machine.Annotations != nil { + _, machineAnnotated = machine.Annotations[machineutils.PreserveMachineAnnotationKey] + laNodeAV = machine.Annotations[machineutils.LastAppliedNodePreserveValueAnnotationKey] + } + + nodeName := machine.Labels[v1alpha1.NodeLabelKey] + if nodeName != "" { + node, err = c.nodeLister.Get(nodeName) + if err != nil { + if apierrors.IsNotFound(err) { + err = nil + } + } else { + _, nodeAnnotated = node.Annotations[machineutils.PreserveMachineAnnotationKey] + } + } + // if machine has no preservation state, the machine is not preservation-bound + if machine.Status.CurrentStatus.PreserveExpiryTime == nil && !nodeAnnotated && !machineAnnotated && laNodeAV == "" { + return false, err + } + return true, err +} From 60af6199e606073384b4b898cd1ea352246a9887 Mon Sep 17 00:00:00 2001 From: thiyyakat Date: Wed, 24 Jun 2026 10:33:52 +0530 Subject: [PATCH 02/19] Uncordon node only if Machine is preservation-bound, and in Running phase. Ignore node not found errors. --- pkg/util/provider/machinecontroller/machine.go | 2 +- pkg/util/provider/machinecontroller/node.go | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/pkg/util/provider/machinecontroller/machine.go b/pkg/util/provider/machinecontroller/machine.go index 19fa87de13..fb522e871e 100644 --- a/pkg/util/provider/machinecontroller/machine.go +++ b/pkg/util/provider/machinecontroller/machine.go @@ -826,7 +826,7 @@ func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1a } // This is to handle the case where a preserved machine recovers from Failed to Running // in which case, pods should be allowed to be scheduled onto the node - if machineutils.IsMachineActive(clone) && nodeName != "" { + if clone.Status.CurrentStatus.Phase == v1alpha1.MachineRunning && nodeName != "" { err = c.uncordonNodeIfCordoned(ctx, nodeName) } return diff --git a/pkg/util/provider/machinecontroller/node.go b/pkg/util/provider/machinecontroller/node.go index 5a2f77ff94..ed54d5890e 100644 --- a/pkg/util/provider/machinecontroller/node.go +++ b/pkg/util/provider/machinecontroller/node.go @@ -355,6 +355,9 @@ func (c *controller) getNodePreserveAnnotationValue(nodeName string) (string, er func (c *controller) uncordonNodeIfCordoned(ctx context.Context, nodeName string) error { node, err := c.nodeLister.Get(nodeName) if err != nil { + if apierrors.IsNotFound(err) { + return nil + } return err } if !node.Spec.Unschedulable { From cea40c2da4e2babefa51aabb63278119c3f2b85b Mon Sep 17 00:00:00 2001 From: thiyyakat Date: Wed, 24 Jun 2026 11:01:11 +0530 Subject: [PATCH 03/19] Return updated machine object from preservation functions to prevent conflicts. --- .../provider/machinecontroller/machine.go | 21 ++++---- .../machinecontroller/machine_util.go | 52 +++++++++---------- .../machinecontroller/machine_util_test.go | 2 +- 3 files changed, 36 insertions(+), 39 deletions(-) diff --git a/pkg/util/provider/machinecontroller/machine.go b/pkg/util/provider/machinecontroller/machine.go index fb522e871e..7145c732c0 100644 --- a/pkg/util/provider/machinecontroller/machine.go +++ b/pkg/util/provider/machinecontroller/machine.go @@ -751,13 +751,10 @@ func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1a if !preservationBound { return } - machineAnnotationsSynced := false clone := machine.DeepCopy() defer func() { - // This needs to be done for cases when machine is neither preserved nor un-preserved (e.g. when preserve changes from now to when-failed on a Failed machine), - // but the LastAppliedNodePreserveValueAnnotation needs to be synced to the machine object. // We compare annotation value in the clone with the original machine object to see if an update is required - if err == nil && !machineAnnotationsSynced && clone.Annotations[machineutils.LastAppliedNodePreserveValueAnnotationKey] != machine.Annotations[machineutils.LastAppliedNodePreserveValueAnnotationKey] { + if err == nil && clone.Annotations[machineutils.LastAppliedNodePreserveValueAnnotationKey] != machine.Annotations[machineutils.LastAppliedNodePreserveValueAnnotationKey] { _, err = c.controlMachineClient.Machines(clone.Namespace).Update(ctx, clone, metav1.UpdateOptions{}) if err != nil { klog.Errorf("error updating LastAppliedNodePreserveValueAnnotation value on machine %q: %v", machine.Name, err) @@ -794,31 +791,31 @@ func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1a } switch effectivePreserveValue { case "", machineutils.PreserveMachineAnnotationValueFalse: - machineAnnotationsSynced, err = c.stopPreservationIfActive(ctx, clone, false) + clone, err = c.stopPreservationIfActive(ctx, clone, false) case machineutils.PreserveMachineAnnotationValueWhenFailed: // on timing out, remove preserve annotation to prevent incorrect re-preservation if machineutils.IsMachinePreservationExpired(clone) { - machineAnnotationsSynced, err = c.stopPreservationIfActive(ctx, clone, true) + clone, err = c.stopPreservationIfActive(ctx, clone, true) } else if !machineutils.IsMachineFailed(clone) { - machineAnnotationsSynced, err = c.stopPreservationIfActive(ctx, clone, false) + clone, err = c.stopPreservationIfActive(ctx, clone, false) } else { - err = c.preserveMachine(ctx, clone, effectivePreserveValue) + clone, err = c.preserveMachine(ctx, clone, effectivePreserveValue) } case machineutils.PreserveMachineAnnotationValueNow: if machineutils.IsMachinePreservationExpired(clone) { // on timing out, remove preserve annotation to prevent incorrect re-preservation - machineAnnotationsSynced, err = c.stopPreservationIfActive(ctx, clone, true) + clone, err = c.stopPreservationIfActive(ctx, clone, true) } else { - err = c.preserveMachine(ctx, clone, effectivePreserveValue) + clone, err = c.preserveMachine(ctx, clone, effectivePreserveValue) } case machineutils.PreserveMachineAnnotationValuePreservedByMCM: if !machineutils.IsMachineFailed(clone) || machineutils.IsMachinePreservationExpired(clone) { // To prevent incorrect re-preservation of a recovered, previously auto-preserved machine on future failures // (since the autoPreserveFailedMachineCount maintained by the machineSetController, may have changed), // in addition to stopping preservation, we also remove the preservation annotation on the machine. - machineAnnotationsSynced, err = c.stopPreservationIfActive(ctx, clone, true) + clone, err = c.stopPreservationIfActive(ctx, clone, true) } else { - err = c.preserveMachine(ctx, clone, effectivePreserveValue) + clone, err = c.preserveMachine(ctx, clone, effectivePreserveValue) } } if err != nil { diff --git a/pkg/util/provider/machinecontroller/machine_util.go b/pkg/util/provider/machinecontroller/machine_util.go index 7c08640b5c..8cd15805cb 100644 --- a/pkg/util/provider/machinecontroller/machine_util.go +++ b/pkg/util/provider/machinecontroller/machine_util.go @@ -2333,7 +2333,7 @@ Utility Functions for Machine Preservation */ // preserveMachine contains logic to start the preservation of a machine and node. -func (c *controller) preserveMachine(ctx context.Context, machine *v1alpha1.Machine, preserveValue string) error { +func (c *controller) preserveMachine(ctx context.Context, machine *v1alpha1.Machine, preserveValue string) (*v1alpha1.Machine, error) { var err error nodeName := machine.Labels[v1alpha1.NodeLabelKey] if machine.Status.CurrentStatus.PreserveExpiryTime == nil { @@ -2341,29 +2341,29 @@ func (c *controller) preserveMachine(ctx context.Context, machine *v1alpha1.Mach // Step 1: Add preserveExpiryTime to machine status machine, err = c.setPreserveExpiryTimeOnMachine(ctx, machine) if err != nil { - return err + return nil, err } } if nodeName == "" { // Machine has no backing node, preservation is complete klog.V(2).Infof("Machine %q without backing node is preserved successfully till %v.", machine.Name, machine.Status.CurrentStatus.PreserveExpiryTime) - return nil + return machine, nil } // Machine has a backing node node, err := c.nodeLister.Get(nodeName) if err != nil { klog.Errorf("error trying to get node %q of machine %q: %v. Retrying.", nodeName, machine.Name, err) - return err + return nil, err } existingNodePreservedCondition := nodeops.GetCondition(node, v1alpha1.NodePreserved) // checks if preservation is already complete if existingNodePreservedCondition != nil && existingNodePreservedCondition.Status == v1.ConditionTrue { - return nil + return machine, nil } // Step 2: Add annotations to prevent scale down of node by CA updatedNode, err := c.addCAScaleDownDisabledAnnotationOnNode(ctx, node) if err != nil { - return err + return nil, err } var drainErr error if shouldPreservedNodeBeDrained(existingNodePreservedCondition, machine.Status.CurrentStatus.Phase) { @@ -2376,41 +2376,41 @@ func (c *controller) preserveMachine(ctx context.Context, machine *v1alpha1.Mach _, err = nodeops.AddOrUpdateConditionsOnNode(ctx, c.targetCoreClient, updatedNode.Name, *newCond) if drainErr != nil { klog.Errorf("error draining preserved node %q for machine %q : %v", nodeName, machine.Name, drainErr) - return drainErr + return nil, drainErr } if err != nil { klog.Errorf("error trying to update node preserved condition for node %q of machine %q : %v", nodeName, machine.Name, err) - return err + return nil, err } } - klog.V(2).Infof("Machine %q and backing node preserved successfully till %v.", machine.Name, machine.Status.CurrentStatus.PreserveExpiryTime) - return nil + klog.V(2).Infof("Machine %q and backing node %q preserved successfully till %v.", machine.Name, nodeName, machine.Status.CurrentStatus.PreserveExpiryTime) + return machine, nil } // stopPreservationIfActive stops the preservation of the machine and node, if preserved, and returns true if machine object has been updated -func (c *controller) stopPreservationIfActive(ctx context.Context, machine *v1alpha1.Machine, removePreservationAnnotations bool) (bool, error) { +func (c *controller) stopPreservationIfActive(ctx context.Context, machine *v1alpha1.Machine, removePreservationAnnotations bool) (*v1alpha1.Machine, error) { + var err error // removal of preserveExpiryTime is the last step of stopping preservation // therefore, if preserveExpiryTime is not set, machine is not preserved nodeName := machine.Labels[v1alpha1.NodeLabelKey] if machine.Status.CurrentStatus.PreserveExpiryTime == nil { - return false, nil + return machine, nil } // if there is no backing node if nodeName == "" { // remove annotation from machine if needed if removePreservationAnnotations { - var err error machine, err = c.removePreserveAnnotationOnMachine(ctx, machine) if err != nil { - return false, err + return nil, err } } - machine, err := c.clearMachinePreserveExpiryTime(ctx, machine) + machine, err = c.clearMachinePreserveExpiryTime(ctx, machine) if err != nil { - return false, err + return nil, err } klog.V(2).Infof("Preservation of machine %q with no backing node has stopped.", machine.Name) - return true, nil + return machine, nil } // Machine has a backing node @@ -2421,15 +2421,15 @@ func (c *controller) stopPreservationIfActive(ctx context.Context, machine *v1al if apierrors.IsNotFound(err) { // Node not found, proceed to clear preserveExpiryTime on machine klog.Warningf("Node %q of machine %q not found. Proceeding to clear PreserveExpiryTime on machine.", nodeName, machine.Name) - _, err := c.clearMachinePreserveExpiryTime(ctx, machine) + machine, err = c.clearMachinePreserveExpiryTime(ctx, machine) if err != nil { - return false, err + return nil, err } klog.V(2).Infof("Preservation of machine %q has stopped.", machine.Name) - return true, nil + return machine, nil } klog.Errorf("error trying to get node %q of machine %q: %v. Retrying.", nodeName, machine.Name, err) - return false, err + return nil, err } // prepare NodeCondition to set preservation as stopped preservedConditionFalse := v1.NodeCondition{ @@ -2441,27 +2441,27 @@ func (c *controller) stopPreservationIfActive(ctx context.Context, machine *v1al // Step 1: change node condition to reflect that preservation has stopped updatedNode, err := nodeops.AddOrUpdateConditionsOnNode(ctx, c.targetCoreClient, node.Name, preservedConditionFalse) if err != nil { - return false, err + return nil, err } // Step 2: remove annotations from node err = c.removePreservationRelatedAnnotationsOnNode(ctx, updatedNode, removePreservationAnnotations) if err != nil { - return false, err + return nil, err } // Step 3: remove annotation from machine if needed if removePreservationAnnotations { machine, err = c.removePreserveAnnotationOnMachine(ctx, machine) if err != nil { - return false, err + return nil, err } } // Step 4: update machine status to set preserve expiry time to nil machine, err = c.clearMachinePreserveExpiryTime(ctx, machine) if err != nil { - return false, err + return nil, err } klog.V(2).Infof("Preservation of machine %q with backing node %q has stopped.", machine.Name, nodeName) - return true, nil + return machine, nil } // setPreserveExpiryTimeOnMachine sets the PreserveExpiryTime on the machine object's Status.CurrentStatus to now + preserve timeout diff --git a/pkg/util/provider/machinecontroller/machine_util_test.go b/pkg/util/provider/machinecontroller/machine_util_test.go index c180ddc44d..4813551f41 100644 --- a/pkg/util/provider/machinecontroller/machine_util_test.go +++ b/pkg/util/provider/machinecontroller/machine_util_test.go @@ -4024,7 +4024,7 @@ var _ = Describe("machine_util", func() { c, trackers := createController(stop, testNamespace, controlMachineObjects, nil, targetCoreObjects, nil, false) defer trackers.Stop() waitForCacheSync(stop, c) - err := c.preserveMachine(context.TODO(), machine, tc.setup.preserveValue) + _, err := c.preserveMachine(context.TODO(), machine, tc.setup.preserveValue) if tc.expect.err == nil { Expect(err).To(BeNil()) } else { From de02bfd29197dd94d290953cf02c63cc2b6c8db8 Mon Sep 17 00:00:00 2001 From: thiyyakat Date: Wed, 24 Jun 2026 12:02:45 +0530 Subject: [PATCH 04/19] Handle IsNotFound error in the defer function. --- pkg/util/provider/machinecontroller/machine.go | 5 +++++ pkg/util/provider/machinecontroller/machine_test.go | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pkg/util/provider/machinecontroller/machine.go b/pkg/util/provider/machinecontroller/machine.go index 7145c732c0..f219736dbd 100644 --- a/pkg/util/provider/machinecontroller/machine.go +++ b/pkg/util/provider/machinecontroller/machine.go @@ -761,6 +761,11 @@ func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1a } } if err != nil { + if apierrors.IsNotFound(err) { + klog.Warningf("Error during preservation flow:%v", err.Error()) + retry = machineutils.LongRetry + err = nil + } if apierrors.IsConflict(err) { retry = machineutils.ConflictRetry } else { diff --git a/pkg/util/provider/machinecontroller/machine_test.go b/pkg/util/provider/machinecontroller/machine_test.go index 15f98f5098..9e9045a082 100644 --- a/pkg/util/provider/machinecontroller/machine_test.go +++ b/pkg/util/provider/machinecontroller/machine_test.go @@ -4392,7 +4392,7 @@ var _ = Describe("machine", func() { nodeCondition: nil, machineAnnotationValue: machineutils.PreserveMachineAnnotationValueNow, retry: machineutils.ShortRetry, - err: fmt.Errorf("node %q not found", "invalid"), + err: nil, }, }), Entry("when auto-preserved machine moves to Running, should stop preservation and remove auto-preserve annotation", testCase{ From a3b7133c25573345860abc8ca89d2ca052801c5e Mon Sep 17 00:00:00 2001 From: thiyyakat Date: Wed, 24 Jun 2026 13:00:38 +0530 Subject: [PATCH 05/19] Fix tests and defer function. --- .../provider/machinecontroller/machine.go | 20 +++++++++++-------- .../machinecontroller/machine_test.go | 6 +++--- .../machinecontroller/machine_util.go | 10 ++++++++-- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/pkg/util/provider/machinecontroller/machine.go b/pkg/util/provider/machinecontroller/machine.go index f219736dbd..e96ed138d5 100644 --- a/pkg/util/provider/machinecontroller/machine.go +++ b/pkg/util/provider/machinecontroller/machine.go @@ -752,9 +752,10 @@ func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1a return } clone := machine.DeepCopy() + var removeAnnotations bool defer func() { // We compare annotation value in the clone with the original machine object to see if an update is required - if err == nil && clone.Annotations[machineutils.LastAppliedNodePreserveValueAnnotationKey] != machine.Annotations[machineutils.LastAppliedNodePreserveValueAnnotationKey] { + if err == nil && !removeAnnotations && clone.Annotations[machineutils.LastAppliedNodePreserveValueAnnotationKey] != machine.Annotations[machineutils.LastAppliedNodePreserveValueAnnotationKey] { _, err = c.controlMachineClient.Machines(clone.Namespace).Update(ctx, clone, metav1.UpdateOptions{}) if err != nil { klog.Errorf("error updating LastAppliedNodePreserveValueAnnotation value on machine %q: %v", machine.Name, err) @@ -765,8 +766,7 @@ func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1a klog.Warningf("Error during preservation flow:%v", err.Error()) retry = machineutils.LongRetry err = nil - } - if apierrors.IsConflict(err) { + } else if apierrors.IsConflict(err) { retry = machineutils.ConflictRetry } else { retry = machineutils.ShortRetry @@ -796,20 +796,23 @@ func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1a } switch effectivePreserveValue { case "", machineutils.PreserveMachineAnnotationValueFalse: - clone, err = c.stopPreservationIfActive(ctx, clone, false) + + clone, err = c.stopPreservationIfActive(ctx, clone, removeAnnotations) case machineutils.PreserveMachineAnnotationValueWhenFailed: // on timing out, remove preserve annotation to prevent incorrect re-preservation if machineutils.IsMachinePreservationExpired(clone) { - clone, err = c.stopPreservationIfActive(ctx, clone, true) + removeAnnotations = true + clone, err = c.stopPreservationIfActive(ctx, clone, removeAnnotations) } else if !machineutils.IsMachineFailed(clone) { - clone, err = c.stopPreservationIfActive(ctx, clone, false) + clone, err = c.stopPreservationIfActive(ctx, clone, removeAnnotations) } else { clone, err = c.preserveMachine(ctx, clone, effectivePreserveValue) } case machineutils.PreserveMachineAnnotationValueNow: if machineutils.IsMachinePreservationExpired(clone) { // on timing out, remove preserve annotation to prevent incorrect re-preservation - clone, err = c.stopPreservationIfActive(ctx, clone, true) + removeAnnotations = true + clone, err = c.stopPreservationIfActive(ctx, clone, removeAnnotations) } else { clone, err = c.preserveMachine(ctx, clone, effectivePreserveValue) } @@ -818,7 +821,8 @@ func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1a // To prevent incorrect re-preservation of a recovered, previously auto-preserved machine on future failures // (since the autoPreserveFailedMachineCount maintained by the machineSetController, may have changed), // in addition to stopping preservation, we also remove the preservation annotation on the machine. - clone, err = c.stopPreservationIfActive(ctx, clone, true) + removeAnnotations = true + clone, err = c.stopPreservationIfActive(ctx, clone, removeAnnotations) } else { clone, err = c.preserveMachine(ctx, clone, effectivePreserveValue) } diff --git a/pkg/util/provider/machinecontroller/machine_test.go b/pkg/util/provider/machinecontroller/machine_test.go index 9e9045a082..bb53ea4325 100644 --- a/pkg/util/provider/machinecontroller/machine_test.go +++ b/pkg/util/provider/machinecontroller/machine_test.go @@ -4180,7 +4180,7 @@ var _ = Describe("machine", func() { } else { Expect(updatedMachine.Status.CurrentStatus.PreserveExpiryTime.IsZero()).To(BeTrue()) } - if tc.setup.nodeName != "" { + if tc.setup.nodeName != "" && tc.setup.nodeName != "invalid" { updatedNode, err := c.targetCoreClient.CoreV1().Nodes().Get(context.TODO(), tc.setup.nodeName, metav1.GetOptions{}) Expect(err).ToNot(HaveOccurred()) found := false @@ -4388,10 +4388,10 @@ var _ = Describe("machine", func() { machinePhase: v1alpha1.MachineUnknown, }, expect: expect{ - preserveExpiryTimeIsSet: false, + preserveExpiryTimeIsSet: true, nodeCondition: nil, machineAnnotationValue: machineutils.PreserveMachineAnnotationValueNow, - retry: machineutils.ShortRetry, + retry: machineutils.LongRetry, err: nil, }, }), diff --git a/pkg/util/provider/machinecontroller/machine_util.go b/pkg/util/provider/machinecontroller/machine_util.go index 8cd15805cb..a8f3099a52 100644 --- a/pkg/util/provider/machinecontroller/machine_util.go +++ b/pkg/util/provider/machinecontroller/machine_util.go @@ -2419,8 +2419,14 @@ func (c *controller) stopPreservationIfActive(ctx context.Context, machine *v1al // if node is not found and error is simply returned, then preservation will never be stopped on machine // therefore, this error is handled specifically if apierrors.IsNotFound(err) { - // Node not found, proceed to clear preserveExpiryTime on machine - klog.Warningf("Node %q of machine %q not found. Proceeding to clear PreserveExpiryTime on machine.", nodeName, machine.Name) + klog.Warningf("Node %q of machine %q not found. Proceeding to stop preservation on machine.", nodeName, machine.Name) + // Node not found, proceed to delete annotations and clear preserveExpiryTime on machine + if removePreservationAnnotations { + machine, err = c.removePreserveAnnotationOnMachine(ctx, machine) + if err != nil { + return nil, err + } + } machine, err = c.clearMachinePreserveExpiryTime(ctx, machine) if err != nil { return nil, err From daaa433f92f9a7ec6e61628d9b33e95911ea3926 Mon Sep 17 00:00:00 2001 From: thiyyakat Date: Wed, 24 Jun 2026 13:36:32 +0530 Subject: [PATCH 06/19] Add tests and code to ensure the flow for non-preservation bound machines is not modified by `manageMachinePreservation` --- .../provider/machinecontroller/machine.go | 3 +- .../machinecontroller/machine_test.go | 171 ++++++++++++++++++ 2 files changed, 173 insertions(+), 1 deletion(-) diff --git a/pkg/util/provider/machinecontroller/machine.go b/pkg/util/provider/machinecontroller/machine.go index e96ed138d5..6eee345de3 100644 --- a/pkg/util/provider/machinecontroller/machine.go +++ b/pkg/util/provider/machinecontroller/machine.go @@ -749,7 +749,8 @@ func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1a var preservationBound bool preservationBound, err = c.isMachinePreservationBound(ctx, machine) if !preservationBound { - return + // we clear the error here to prevent preservation logic from interfering with non-preservation-bound machines + return machineutils.LongRetry, nil } clone := machine.DeepCopy() var removeAnnotations bool diff --git a/pkg/util/provider/machinecontroller/machine_test.go b/pkg/util/provider/machinecontroller/machine_test.go index bb53ea4325..7893c15391 100644 --- a/pkg/util/provider/machinecontroller/machine_test.go +++ b/pkg/util/provider/machinecontroller/machine_test.go @@ -4449,5 +4449,176 @@ var _ = Describe("machine", func() { }, }), ) + + DescribeTable("when machine is not preservation-bound, should return early without modifying the machine", + func(nodeUnschedulable bool, includeNode bool) { + stop := make(chan struct{}) + defer close(stop) + + machine := &v1alpha1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: testNamespace, + Name: "m1", + Labels: map[string]string{v1alpha1.NodeLabelKey: "node-1"}, + }, + Status: v1alpha1.MachineStatus{ + CurrentStatus: v1alpha1.CurrentStatus{Phase: v1alpha1.MachineRunning}, + }, + } + + var targetCoreObjects []runtime.Object + if includeNode { + targetCoreObjects = append(targetCoreObjects, &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "node-1"}, + Spec: corev1.NodeSpec{Unschedulable: nodeUnschedulable}, + }) + } + + c, trackers := createController(stop, testNamespace, []runtime.Object{machine}, nil, targetCoreObjects, nil, false) + defer trackers.Stop() + waitForCacheSync(stop, c) + + retry, err := c.manageMachinePreservation(context.TODO(), machine) + + Expect(err).ToNot(HaveOccurred()) + Expect(retry).To(Equal(machineutils.LongRetry)) + + updatedMachine, err := c.controlMachineClient.Machines(testNamespace).Get(context.TODO(), machine.Name, metav1.GetOptions{}) + Expect(err).ToNot(HaveOccurred()) + Expect(updatedMachine.Annotations).To(Equal(machine.Annotations)) + Expect(updatedMachine.Labels).To(Equal(machine.Labels)) + Expect(updatedMachine.Spec).To(Equal(machine.Spec)) + Expect(updatedMachine.Status).To(Equal(machine.Status)) + + if includeNode { + updatedNode, err := c.targetCoreClient.CoreV1().Nodes().Get(context.TODO(), "node-1", metav1.GetOptions{}) + Expect(err).ToNot(HaveOccurred()) + Expect(updatedNode.Spec.Unschedulable).To(Equal(nodeUnschedulable)) + } + }, + Entry("node is cordoned", true, true), + Entry("node is uncordoned", false, true), + Entry("node retrieval fails (node not found)", false, false), + ) + }) + + Describe("#isMachinePreservationBound", func() { + type setup struct { + machineAnnotations map[string]string + preserveExpiryTime *metav1.Time + nodeName string + nodeAnnotations map[string]string + includeNode bool + } + type expect struct { + bound bool + err error + } + type testCase struct { + setup setup + expect expect + } + + DescribeTable("isMachinePreservationBound scenarios", + func(tc testCase) { + stop := make(chan struct{}) + defer close(stop) + + machine := &v1alpha1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: testNamespace, + Name: "m1", + Labels: map[string]string{v1alpha1.NodeLabelKey: tc.setup.nodeName}, + Annotations: tc.setup.machineAnnotations, + }, + Status: v1alpha1.MachineStatus{ + CurrentStatus: v1alpha1.CurrentStatus{ + PreserveExpiryTime: tc.setup.preserveExpiryTime, + }, + }, + } + + var targetCoreObjects []runtime.Object + if tc.setup.includeNode { + targetCoreObjects = append(targetCoreObjects, &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: tc.setup.nodeName, + Annotations: tc.setup.nodeAnnotations, + }, + }) + } + + c, trackers := createController(stop, testNamespace, []runtime.Object{machine}, nil, targetCoreObjects, nil, false) + defer trackers.Stop() + waitForCacheSync(stop, c) + + bound, err := c.isMachinePreservationBound(context.TODO(), machine) + if tc.expect.err != nil { + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(Equal(tc.expect.err.Error())) + } else { + Expect(err).ToNot(HaveOccurred()) + } + Expect(bound).To(Equal(tc.expect.bound)) + }, + Entry("machine has no annotations, no preserveExpiryTime, and node has no preservation annotation", testCase{ + setup: setup{ + nodeName: "node-1", + includeNode: true, + }, + expect: expect{bound: false}, + }), + Entry("machine has no node label and no other preservation markers", testCase{ + setup: setup{ + nodeName: "", + }, + expect: expect{bound: false}, + }), + Entry("machine has preserve annotation set", testCase{ + setup: setup{ + nodeName: "node-1", + machineAnnotations: map[string]string{ + machineutils.PreserveMachineAnnotationKey: machineutils.PreserveMachineAnnotationValueNow, + }, + includeNode: true, + }, + expect: expect{bound: true}, + }), + Entry("machine has a non-nil preserveExpiryTime", testCase{ + setup: setup{ + nodeName: "node-1", + preserveExpiryTime: &metav1.Time{Time: metav1.Now().Add(1 * time.Hour)}, + includeNode: true, + }, + expect: expect{bound: true}, + }), + Entry("machine's node has a preservation annotation", testCase{ + setup: setup{ + nodeName: "node-1", + nodeAnnotations: map[string]string{ + machineutils.PreserveMachineAnnotationKey: machineutils.PreserveMachineAnnotationValueNow, + }, + includeNode: true, + }, + expect: expect{bound: true}, + }), + Entry("machine has last-applied node preserve value annotation", testCase{ + setup: setup{ + nodeName: "node-1", + machineAnnotations: map[string]string{ + machineutils.LastAppliedNodePreserveValueAnnotationKey: machineutils.PreserveMachineAnnotationValueNow, + }, + includeNode: true, + }, + expect: expect{bound: true}, + }), + Entry("node is not found and machine has no preservation markers", testCase{ + setup: setup{ + nodeName: "node-1", + includeNode: false, + }, + expect: expect{bound: false}, + }), + ) }) }) From 5f143c5092803152a084bcf40a9befd286228dfc Mon Sep 17 00:00:00 2001 From: thiyyakat Date: Wed, 24 Jun 2026 13:52:05 +0530 Subject: [PATCH 07/19] Add tests to cover uncordoning of machine when machines recover to Running --- .../machinecontroller/machine_test.go | 82 +++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/pkg/util/provider/machinecontroller/machine_test.go b/pkg/util/provider/machinecontroller/machine_test.go index 7893c15391..bbf500ebb7 100644 --- a/pkg/util/provider/machinecontroller/machine_test.go +++ b/pkg/util/provider/machinecontroller/machine_test.go @@ -4101,6 +4101,7 @@ var _ = Describe("machine", func() { nodeName string machinePhase v1alpha1.MachinePhase preserveExpiryTime *metav1.Time + nodeUnschedulable bool } type expect struct { retry machineutils.RetryPeriod @@ -4109,6 +4110,7 @@ var _ = Describe("machine", func() { machineAnnotationValue string laNodePreserveValue string err error + nodeUnschedulable *bool } type testCase struct { setup setup @@ -4154,6 +4156,9 @@ var _ = Describe("machine", func() { machineutils.PreserveMachineAnnotationKey: tc.setup.nodeAnnotationValue, }, }, + Spec: corev1.NodeSpec{ + Unschedulable: tc.setup.nodeUnschedulable, + }, Status: corev1.NodeStatus{ Conditions: []corev1.NodeCondition{}, }, @@ -4198,6 +4203,9 @@ var _ = Describe("machine", func() { } else { Expect(found).To(BeFalse()) } + if tc.expect.nodeUnschedulable != nil { + Expect(updatedNode.Spec.Unschedulable).To(Equal(*tc.expect.nodeUnschedulable)) + } } Expect(updatedMachine.Annotations[machineutils.PreserveMachineAnnotationKey]).To(Equal(tc.expect.machineAnnotationValue)) Expect(updatedMachine.Annotations[machineutils.LastAppliedNodePreserveValueAnnotationKey]).To(Equal(tc.expect.laNodePreserveValue)) @@ -4448,6 +4456,80 @@ var _ = Describe("machine", func() { err: nil, }, }), + Entry("when preserve=when-failed machine with cordoned node recovers to Running, should stop preservation and uncordon node", func() testCase { + uncordoned := false + return testCase{ + setup: setup{ + machineAnnotationValue: machineutils.PreserveMachineAnnotationValueWhenFailed, + nodeName: "node-1", + machinePhase: v1alpha1.MachineRunning, + preserveExpiryTime: &metav1.Time{Time: metav1.Now().Add(1 * time.Hour)}, + nodeUnschedulable: true, + }, + expect: expect{ + preserveExpiryTimeIsSet: false, + nodeCondition: &corev1.NodeCondition{Type: v1alpha1.NodePreserved, Status: corev1.ConditionFalse}, + machineAnnotationValue: "", + retry: machineutils.LongRetry, + nodeUnschedulable: &uncordoned, + }, + } + }()), + Entry("when preserve=when-failed machine with cordoned node has expired preservation while Running, should stop preservation and uncordon node", func() testCase { + uncordoned := false + return testCase{ + setup: setup{ + machineAnnotationValue: machineutils.PreserveMachineAnnotationValueWhenFailed, + nodeName: "node-1", + machinePhase: v1alpha1.MachineRunning, + preserveExpiryTime: &metav1.Time{Time: metav1.Now().Add(-1 * time.Minute)}, + nodeUnschedulable: true, + }, + expect: expect{ + preserveExpiryTimeIsSet: false, + nodeCondition: &corev1.NodeCondition{Type: v1alpha1.NodePreserved, Status: corev1.ConditionFalse}, + machineAnnotationValue: "", + retry: machineutils.LongRetry, + nodeUnschedulable: &uncordoned, + }, + } + }()), + Entry("when preserve=when-failed machine is Failed with active preservation and cordoned node, node should stay cordoned", func() testCase { + cordoned := true + return testCase{ + setup: setup{ + nodeAnnotationValue: machineutils.PreserveMachineAnnotationValueWhenFailed, + nodeName: "node-1", + machinePhase: v1alpha1.MachineFailed, + nodeUnschedulable: true, + }, + expect: expect{ + laNodePreserveValue: machineutils.PreserveMachineAnnotationValueWhenFailed, + preserveExpiryTimeIsSet: true, + nodeCondition: &corev1.NodeCondition{Type: v1alpha1.NodePreserved, Status: corev1.ConditionTrue}, + retry: machineutils.LongRetry, + nodeUnschedulable: &cordoned, + }, + } + }()), + Entry("when preserve=now machine is Failed with active preservation and cordoned node, node should stay cordoned", func() testCase { + cordoned := true + return testCase{ + setup: setup{ + machineAnnotationValue: machineutils.PreserveMachineAnnotationValueNow, + nodeName: "node-1", + machinePhase: v1alpha1.MachineFailed, + nodeUnschedulable: true, + }, + expect: expect{ + preserveExpiryTimeIsSet: true, + nodeCondition: &corev1.NodeCondition{Type: v1alpha1.NodePreserved, Status: corev1.ConditionTrue}, + machineAnnotationValue: machineutils.PreserveMachineAnnotationValueNow, + retry: machineutils.LongRetry, + nodeUnschedulable: &cordoned, + }, + } + }()), ) DescribeTable("when machine is not preservation-bound, should return early without modifying the machine", From 4188e837bf2fb22ba320741615888237cf06fd30 Mon Sep 17 00:00:00 2001 From: thiyyakat Date: Wed, 24 Jun 2026 14:16:50 +0530 Subject: [PATCH 08/19] Uncordon machines in `stopMachinePreservationIfActive` to allow retry. --- pkg/util/provider/machinecontroller/machine_test.go | 2 +- pkg/util/provider/machinecontroller/machine_util.go | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pkg/util/provider/machinecontroller/machine_test.go b/pkg/util/provider/machinecontroller/machine_test.go index bbf500ebb7..898a33d7ba 100644 --- a/pkg/util/provider/machinecontroller/machine_test.go +++ b/pkg/util/provider/machinecontroller/machine_test.go @@ -4469,7 +4469,7 @@ var _ = Describe("machine", func() { expect: expect{ preserveExpiryTimeIsSet: false, nodeCondition: &corev1.NodeCondition{Type: v1alpha1.NodePreserved, Status: corev1.ConditionFalse}, - machineAnnotationValue: "", + machineAnnotationValue: machineutils.PreserveMachineAnnotationValueWhenFailed, retry: machineutils.LongRetry, nodeUnschedulable: &uncordoned, }, diff --git a/pkg/util/provider/machinecontroller/machine_util.go b/pkg/util/provider/machinecontroller/machine_util.go index a8f3099a52..a3f37a845f 100644 --- a/pkg/util/provider/machinecontroller/machine_util.go +++ b/pkg/util/provider/machinecontroller/machine_util.go @@ -2461,7 +2461,13 @@ func (c *controller) stopPreservationIfActive(ctx context.Context, machine *v1al return nil, err } } - // Step 4: update machine status to set preserve expiry time to nil + // Step 4: uncordon the node if the machine has recovered to Running. + if machine.Status.CurrentStatus.Phase == v1alpha1.MachineRunning { + if err = c.uncordonNodeIfCordoned(ctx, nodeName); err != nil { + return nil, err + } + } + // Step 5: update machine status to set preserve expiry time to nil machine, err = c.clearMachinePreserveExpiryTime(ctx, machine) if err != nil { return nil, err From 5a57dd43cda15b84381021e0f625b418217a21f4 Mon Sep 17 00:00:00 2001 From: thiyyakat Date: Wed, 24 Jun 2026 14:43:02 +0530 Subject: [PATCH 09/19] Introduce struct preserveStateInfo to persist preservation state across function calls. --- pkg/util/provider/machinecontroller/machine.go | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pkg/util/provider/machinecontroller/machine.go b/pkg/util/provider/machinecontroller/machine.go index 6eee345de3..f6d02c6579 100644 --- a/pkg/util/provider/machinecontroller/machine.go +++ b/pkg/util/provider/machinecontroller/machine.go @@ -744,6 +744,19 @@ func (c *controller) isCreationProcessing(machine *v1alpha1.Machine) bool { Machine Preservation operations */ +// preserveStateInfo encapsulates the preservation annotation values found +// on the machine and node objects, along with the effective preservation value for the machine +// and the last applied node preserve value by MCM. +type preserveStateInfo struct { + nodeAnnotated bool + machineAnnotated bool + nodeValue string + machineValue string + lastAppliedNodeValue string + PreserveExpiryTimeSet bool + effectiveValue string +} + // manageMachinePreservation manages machine preservation based on the preserve annotation values on the node and machine objects. func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1alpha1.Machine) (retry machineutils.RetryPeriod, err error) { var preservationBound bool From 0d7c29b26343ad85c9e3904656bb69669ad35c03 Mon Sep 17 00:00:00 2001 From: thiyyakat Date: Wed, 24 Jun 2026 15:27:39 +0530 Subject: [PATCH 10/19] Remove "" as a valid preservation value --- pkg/controller/machineset.go | 2 +- pkg/util/provider/machineutils/utils.go | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/pkg/controller/machineset.go b/pkg/controller/machineset.go index ebdff675ec..2425cadf49 100644 --- a/pkg/controller/machineset.go +++ b/pkg/controller/machineset.go @@ -949,7 +949,7 @@ func (c *controller) manageAutoPreservationOfFailedMachines(ctx context.Context, var others []*v1alpha1.Machine for _, m := range machines { // check if machine is already annotated for preservation, if yes, skip. Machine controller will take care of the rest. - if machineutils.IsMachineFailed(m) && !machineutils.PreventAutoPreserveAnnotationValues.Has(m.Annotations[machineutils.PreserveMachineAnnotationKey]) { + if machineutils.IsMachineFailed(m) && !machineutils.AllowedPreserveAnnotationValues.Has(m.Annotations[machineutils.PreserveMachineAnnotationKey]) { autoPreservationCandidates = append(autoPreservationCandidates, m) } else { others = append(others, m) diff --git a/pkg/util/provider/machineutils/utils.go b/pkg/util/provider/machineutils/utils.go index b56da7cc23..4d536a3d82 100644 --- a/pkg/util/provider/machineutils/utils.go +++ b/pkg/util/provider/machineutils/utils.go @@ -7,6 +7,8 @@ package machineutils import ( "context" + "time" + "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1" v1alpha1client "github.com/gardener/machine-controller-manager/pkg/client/clientset/versioned/typed/machine/v1alpha1" v1alpha1listers "github.com/gardener/machine-controller-manager/pkg/client/listers/machine/v1alpha1" @@ -16,7 +18,6 @@ import ( "k8s.io/apimachinery/pkg/util/sets" "k8s.io/client-go/util/retry" "k8s.io/klog/v2" - "time" ) const ( @@ -120,11 +121,7 @@ const ( ) // AllowedPreserveAnnotationValues contains the allowed values for the preserve annotation -var AllowedPreserveAnnotationValues = sets.New(PreserveMachineAnnotationValueNow, PreserveMachineAnnotationValueWhenFailed, PreserveMachineAnnotationValuePreservedByMCM, PreserveMachineAnnotationValueFalse, "") - -// PreventAutoPreserveAnnotationValues contains the values to check if a machine is already annotated for preservation, -// in which case it should not be auto-preserved. -var PreventAutoPreserveAnnotationValues = sets.New(PreserveMachineAnnotationValueNow, PreserveMachineAnnotationValueWhenFailed, PreserveMachineAnnotationValuePreservedByMCM, PreserveMachineAnnotationValueFalse) +var AllowedPreserveAnnotationValues = sets.New(PreserveMachineAnnotationValueNow, PreserveMachineAnnotationValueWhenFailed, PreserveMachineAnnotationValuePreservedByMCM, PreserveMachineAnnotationValueFalse) // RetryPeriod is an alias for specifying the retry period type RetryPeriod time.Duration From 5e258e0cb4774d558f8d55b62201e24bb70a88fc Mon Sep 17 00:00:00 2001 From: thiyyakat Date: Wed, 24 Jun 2026 15:30:56 +0530 Subject: [PATCH 11/19] Add helper functions and simplify defer --- .../provider/machinecontroller/machine.go | 149 +++++++++++------- 1 file changed, 94 insertions(+), 55 deletions(-) diff --git a/pkg/util/provider/machinecontroller/machine.go b/pkg/util/provider/machinecontroller/machine.go index f6d02c6579..5d78980f61 100644 --- a/pkg/util/provider/machinecontroller/machine.go +++ b/pkg/util/provider/machinecontroller/machine.go @@ -754,27 +754,11 @@ type preserveStateInfo struct { machineValue string lastAppliedNodeValue string PreserveExpiryTimeSet bool - effectiveValue string } // manageMachinePreservation manages machine preservation based on the preserve annotation values on the node and machine objects. func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1alpha1.Machine) (retry machineutils.RetryPeriod, err error) { - var preservationBound bool - preservationBound, err = c.isMachinePreservationBound(ctx, machine) - if !preservationBound { - // we clear the error here to prevent preservation logic from interfering with non-preservation-bound machines - return machineutils.LongRetry, nil - } - clone := machine.DeepCopy() - var removeAnnotations bool defer func() { - // We compare annotation value in the clone with the original machine object to see if an update is required - if err == nil && !removeAnnotations && clone.Annotations[machineutils.LastAppliedNodePreserveValueAnnotationKey] != machine.Annotations[machineutils.LastAppliedNodePreserveValueAnnotationKey] { - _, err = c.controlMachineClient.Machines(clone.Namespace).Update(ctx, clone, metav1.UpdateOptions{}) - if err != nil { - klog.Errorf("error updating LastAppliedNodePreserveValueAnnotation value on machine %q: %v", machine.Name, err) - } - } if err != nil { if apierrors.IsNotFound(err) { klog.Warningf("Error during preservation flow:%v", err.Error()) @@ -789,25 +773,30 @@ func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1a retry = machineutils.LongRetry } }() - - nodeName := clone.Labels[v1alpha1.NodeLabelKey] - nodeAnnotationValue := "" - if nodeName != "" { - nodeAnnotationValue, err = c.getNodePreserveAnnotationValue(nodeName) - if err != nil { - return - } + nodeName := machine.Labels[v1alpha1.NodeLabelKey] + preserveInfo, err := c.getPreserveStateInfo(machine) + if preserveInfo.machineAnnotated && !machineutils.AllowedPreserveAnnotationValues.Has(preserveInfo.machineValue) { + klog.Warningf("Preserve annotation value %q on machine %q is invalid", preserveInfo.machineValue, machine.Name) + return } - var effectivePreserveValue string - effectivePreserveValue, clone.Annotations = getEffectivePreservationAnnotations(nodeAnnotationValue, clone.Annotations) - if !machineutils.AllowedPreserveAnnotationValues.Has(effectivePreserveValue) { - if effectivePreserveValue == nodeAnnotationValue { - klog.Warningf("Preserve annotation value %q on node %q is invalid", effectivePreserveValue, nodeName) - } else { - klog.Warningf("Preserve annotation value %q on machine %q is invalid", effectivePreserveValue, clone.Name) - } + if preserveInfo.nodeAnnotated && !machineutils.AllowedPreserveAnnotationValues.Has(preserveInfo.nodeValue) { + klog.Warningf("Preserve annotation value %q on node %q backing machine %q is invalid", preserveInfo.nodeValue, nodeName, machine.Name) + return + } + preservationBound := c.isMachinePreservationBound(preserveInfo) + if !preservationBound { + // we clear the error here to prevent preservation logic from interfering with non-preservation-bound machines + err = nil + return + } else if err != nil { return } + + var effectivePreserveValue string + effectivePreserveValue = getEffectivePreservationAnnotations(preserveInfo) + + var removeAnnotations, preservationStopped bool + clone := machine.DeepCopy() switch effectivePreserveValue { case "", machineutils.PreserveMachineAnnotationValueFalse: @@ -849,49 +838,99 @@ func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1a if clone.Status.CurrentStatus.Phase == v1alpha1.MachineRunning && nodeName != "" { err = c.uncordonNodeIfCordoned(ctx, nodeName) } + + if shouldAnnotationsBeUpdatedOnMachine(removeAnnotations, preserveInfo) { + err = c.updatePreserveAnnotationOnMachine(ctx, preserveInfo.nodeValue, clone) + } return } -// getEffectivePreservationAnnotations returns the effective preservation value, and updates the machine Annotations related to preservation -func getEffectivePreservationAnnotations(nodeAnnotationValue string, machineAnnotations map[string]string) (string, map[string]string) { +// getEffectivePreservationAnnotations returns the effective preservation value. +// +// If there is no active node annotation AND no previously-applied node annotation, +// enforce machine's preserve annotation. +// Otherwise, the node annotation takes precedence (even if now empty/removed). +// +// This is required to handle the following scenario: +// +// T1: Node and Machine both have the same annotation with the same value. (MCM is up and running). +// T2 (T2 > T1): MCM went down. +// T3 (T3 > T2): Node annotation was removed. +// T4 (T4 > T3): MCM came back up. +// At T4 it sees a Node with no preserve annotation but a Machine with a preserve annotation. +// It continues to preserve the machine. +func getEffectivePreservationAnnotations(info *preserveStateInfo) string { // If there is no active node annotation AND no previously-applied node annotation, // enforce machine's preserve annotation. // Otherwise, the node annotation takes precedence (even if now empty/removed). - if nodeAnnotationValue == "" && machineAnnotations[machineutils.LastAppliedNodePreserveValueAnnotationKey] == "" { - return machineAnnotations[machineutils.PreserveMachineAnnotationKey], machineAnnotations - } - clonedMachineAnnotations := make(map[string]string) - maps.Copy(clonedMachineAnnotations, machineAnnotations) - delete(clonedMachineAnnotations, machineutils.PreserveMachineAnnotationKey) - clonedMachineAnnotations[machineutils.LastAppliedNodePreserveValueAnnotationKey] = nodeAnnotationValue - return nodeAnnotationValue, clonedMachineAnnotations + if info.nodeValue == "" && info.lastAppliedNodeValue == "" { + return info.machineValue + } + return info.nodeValue } func (c *controller) isMachinePreservationBound(ctx context.Context, machine *v1alpha1.Machine) (bool, error) { - var err error - var machineAnnotated, nodeAnnotated bool - var laNodeAV string - var node *corev1.Node +func (c *controller) isMachinePreservationBound(info *preserveStateInfo) bool { + // if machine has no preservation state, the machine is not preservation-bound + if !info.PreserveExpiryTimeSet && !info.nodeAnnotated && !info.machineAnnotated && info.lastAppliedNodeValue == "" { + return false + } + return true +} +func (c *controller) getPreserveStateInfo(machine *v1alpha1.Machine) (*preserveStateInfo, error) { + var info preserveStateInfo + var err error if machine.Annotations != nil { - _, machineAnnotated = machine.Annotations[machineutils.PreserveMachineAnnotationKey] - laNodeAV = machine.Annotations[machineutils.LastAppliedNodePreserveValueAnnotationKey] + info.machineValue, info.machineAnnotated = machine.Annotations[machineutils.PreserveMachineAnnotationKey] + info.lastAppliedNodeValue = machine.Annotations[machineutils.LastAppliedNodePreserveValueAnnotationKey] + } + if machine.Status.CurrentStatus.PreserveExpiryTime != nil { + info.PreserveExpiryTimeSet = true } - nodeName := machine.Labels[v1alpha1.NodeLabelKey] if nodeName != "" { + var node *corev1.Node node, err = c.nodeLister.Get(nodeName) if err != nil { if apierrors.IsNotFound(err) { err = nil } - } else { - _, nodeAnnotated = node.Annotations[machineutils.PreserveMachineAnnotationKey] + return &info, err } + info.nodeValue, info.nodeAnnotated = node.Annotations[machineutils.PreserveMachineAnnotationKey] } - // if machine has no preservation state, the machine is not preservation-bound - if machine.Status.CurrentStatus.PreserveExpiryTime == nil && !nodeAnnotated && !machineAnnotated && laNodeAV == "" { - return false, err + return &info, err +} + +// shouldAnnotationsBeUpdatedOnMachine returns true when the machine's annotation tracking needs +// to be synced after a preservation action. +func shouldAnnotationsBeUpdatedOnMachine(removeAnnotations bool, preserveInfo *preserveStateInfo) bool { + // annotations were already removed by stopPreservationIfActive — nothing left to sync + if removeAnnotations { + return false + } + // node annotation is not in control — machine annotation prevails, no sync needed + if !preserveInfo.nodeAnnotated && preserveInfo.lastAppliedNodeValue == "" { + return false + } + // node value is unchanged and machine has no annotation to clear — nothing has changed + if preserveInfo.nodeValue == preserveInfo.lastAppliedNodeValue && !preserveInfo.machineAnnotated { + return false + } + return true +} + +// updatePreserveAnnotationOnMachine clears the machine's PreserveMachineAnnotationKey and sets +// LastAppliedNodePreserveValueAnnotationKey to nodeValue. +func (c *controller) updatePreserveAnnotationOnMachine(ctx context.Context, nodeValue string, machine *v1alpha1.Machine) error { + clone := machine.DeepCopy() + if clone.Annotations == nil { + clone.Annotations = make(map[string]string) + } else { + delete(clone.Annotations, machineutils.PreserveMachineAnnotationKey) } - return true, err + clone.Annotations[machineutils.LastAppliedNodePreserveValueAnnotationKey] = nodeValue + _, err := c.controlMachineClient.Machines(clone.Namespace).Update(ctx, clone, metav1.UpdateOptions{}) + return err } From 0c147c9bc18c2246e2013369dd6e7510a444d083 Mon Sep 17 00:00:00 2001 From: thiyyakat Date: Wed, 24 Jun 2026 15:34:17 +0530 Subject: [PATCH 12/19] Ensure machine preservation was not stopped before uncordoning node in manageMachinePreservation: This call is made to uncordon nodes that are preserved with preserve=now and the machine has recovered to Running. Preservation is not stopped in this case. --- .../provider/machinecontroller/machine.go | 27 ++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/pkg/util/provider/machinecontroller/machine.go b/pkg/util/provider/machinecontroller/machine.go index 5d78980f61..35023df982 100644 --- a/pkg/util/provider/machinecontroller/machine.go +++ b/pkg/util/provider/machinecontroller/machine.go @@ -799,14 +799,16 @@ func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1a clone := machine.DeepCopy() switch effectivePreserveValue { case "", machineutils.PreserveMachineAnnotationValueFalse: - + preservationStopped = true clone, err = c.stopPreservationIfActive(ctx, clone, removeAnnotations) case machineutils.PreserveMachineAnnotationValueWhenFailed: // on timing out, remove preserve annotation to prevent incorrect re-preservation if machineutils.IsMachinePreservationExpired(clone) { removeAnnotations = true + preservationStopped = true clone, err = c.stopPreservationIfActive(ctx, clone, removeAnnotations) } else if !machineutils.IsMachineFailed(clone) { + preservationStopped = true clone, err = c.stopPreservationIfActive(ctx, clone, removeAnnotations) } else { clone, err = c.preserveMachine(ctx, clone, effectivePreserveValue) @@ -815,6 +817,7 @@ func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1a if machineutils.IsMachinePreservationExpired(clone) { // on timing out, remove preserve annotation to prevent incorrect re-preservation removeAnnotations = true + preservationStopped = true clone, err = c.stopPreservationIfActive(ctx, clone, removeAnnotations) } else { clone, err = c.preserveMachine(ctx, clone, effectivePreserveValue) @@ -825,6 +828,7 @@ func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1a // (since the autoPreserveFailedMachineCount maintained by the machineSetController, may have changed), // in addition to stopping preservation, we also remove the preservation annotation on the machine. removeAnnotations = true + preservationStopped = true clone, err = c.stopPreservationIfActive(ctx, clone, removeAnnotations) } else { clone, err = c.preserveMachine(ctx, clone, effectivePreserveValue) @@ -833,10 +837,22 @@ func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1a if err != nil { return } - // This is to handle the case where a preserved machine recovers from Failed to Running - // in which case, pods should be allowed to be scheduled onto the node - if clone.Status.CurrentStatus.Phase == v1alpha1.MachineRunning && nodeName != "" { - err = c.uncordonNodeIfCordoned(ctx, nodeName) + // For the preserveMachine path (preservation still active), uncordon the node if the machine + // has recovered to Running. The stopPreservationIfActive path handles uncordon internally. + if !preservationStopped && clone.Status.CurrentStatus.PreserveExpiryTime != nil && clone.Status.CurrentStatus.Phase == v1alpha1.MachineRunning && nodeName != "" { + var node *corev1.Node + node, err = c.nodeLister.Get(nodeName) + if err != nil { + if apierrors.IsNotFound(err) { + err = nil + return + } + return + } + err = c.uncordonNodeIfCordoned(ctx, node) + if err != nil { + return + } } if shouldAnnotationsBeUpdatedOnMachine(removeAnnotations, preserveInfo) { @@ -869,7 +885,6 @@ func getEffectivePreservationAnnotations(info *preserveStateInfo) string { return info.nodeValue } -func (c *controller) isMachinePreservationBound(ctx context.Context, machine *v1alpha1.Machine) (bool, error) { func (c *controller) isMachinePreservationBound(info *preserveStateInfo) bool { // if machine has no preservation state, the machine is not preservation-bound if !info.PreserveExpiryTimeSet && !info.nodeAnnotated && !info.machineAnnotated && info.lastAppliedNodeValue == "" { From e0271195269eee629856b12995b4c466b68e685b Mon Sep 17 00:00:00 2001 From: thiyyakat Date: Wed, 24 Jun 2026 15:35:07 +0530 Subject: [PATCH 13/19] Minor code changes and test refactoring to ensure conflict errors are reduced when possible --- .../provider/machinecontroller/machine.go | 11 ++- .../machinecontroller/machine_test.go | 95 +++++++++---------- .../machinecontroller/machine_util.go | 12 +-- .../machinecontroller/machine_util_test.go | 20 +--- pkg/util/provider/machinecontroller/node.go | 37 ++------ 5 files changed, 68 insertions(+), 107 deletions(-) diff --git a/pkg/util/provider/machinecontroller/machine.go b/pkg/util/provider/machinecontroller/machine.go index 35023df982..0bec72ecd3 100644 --- a/pkg/util/provider/machinecontroller/machine.go +++ b/pkg/util/provider/machinecontroller/machine.go @@ -775,12 +775,15 @@ func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1a }() nodeName := machine.Labels[v1alpha1.NodeLabelKey] preserveInfo, err := c.getPreserveStateInfo(machine) + if err != nil { + return + } if preserveInfo.machineAnnotated && !machineutils.AllowedPreserveAnnotationValues.Has(preserveInfo.machineValue) { - klog.Warningf("Preserve annotation value %q on machine %q is invalid", preserveInfo.machineValue, machine.Name) + klog.Warningf("Preserve annotation %q=%q on machine %q is invalid", machineutils.PreserveMachineAnnotationKey, preserveInfo.machineValue, machine.Name) return } if preserveInfo.nodeAnnotated && !machineutils.AllowedPreserveAnnotationValues.Has(preserveInfo.nodeValue) { - klog.Warningf("Preserve annotation value %q on node %q backing machine %q is invalid", preserveInfo.nodeValue, nodeName, machine.Name) + klog.Warningf("Preserve annotation %q=%q on node %q backing machine %q is invalid", machineutils.PreserveMachineAnnotationKey, preserveInfo.nodeValue, nodeName, machine.Name) return } preservationBound := c.isMachinePreservationBound(preserveInfo) @@ -792,8 +795,7 @@ func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1a return } - var effectivePreserveValue string - effectivePreserveValue = getEffectivePreservationAnnotations(preserveInfo) + effectivePreserveValue := getEffectivePreservationAnnotations(preserveInfo) var removeAnnotations, preservationStopped bool clone := machine.DeepCopy() @@ -909,6 +911,7 @@ func (c *controller) getPreserveStateInfo(machine *v1alpha1.Machine) (*preserveS node, err = c.nodeLister.Get(nodeName) if err != nil { if apierrors.IsNotFound(err) { + klog.Warningf("Couldn't find node %q for machine %q", nodeName, machine.Name) err = nil } return &info, err diff --git a/pkg/util/provider/machinecontroller/machine_test.go b/pkg/util/provider/machinecontroller/machine_test.go index 898a33d7ba..51543bce61 100644 --- a/pkg/util/provider/machinecontroller/machine_test.go +++ b/pkg/util/provider/machinecontroller/machine_test.go @@ -3972,11 +3972,13 @@ var _ = Describe("machine", func() { DescribeTable("getEffectivePreservationAnnotations scenarios", func(tc testCase) { - - preserveValue, updatedMachineAnnotations := getEffectivePreservationAnnotations(tc.setup.nodeAnnotationValue, tc.setup.machineAnnotations) + info := &preserveStateInfo{ + nodeValue: tc.setup.nodeAnnotationValue, + machineValue: tc.setup.machineAnnotations[machineutils.PreserveMachineAnnotationKey], + lastAppliedNodeValue: tc.setup.machineAnnotations[machineutils.LastAppliedNodePreserveValueAnnotationKey], + } + preserveValue := getEffectivePreservationAnnotations(info) Expect(preserveValue).To(Equal(tc.expect.effectivePreserveValue)) - Expect(updatedMachineAnnotations[machineutils.PreserveMachineAnnotationKey]).To(Equal(tc.expect.machineAnnotations[machineutils.PreserveMachineAnnotationKey])) - Expect(updatedMachineAnnotations[machineutils.LastAppliedNodePreserveValueAnnotationKey]).To(Equal(tc.expect.machineAnnotations[machineutils.LastAppliedNodePreserveValueAnnotationKey])) }, Entry("when node is not annotated and laNodeAnnotationValue is empty, should return machine's annotation value and empty string", testCase{ setup: setup{ @@ -4096,7 +4098,9 @@ var _ = Describe("machine", func() { Describe("#manageMachinePreservation", func() { type setup struct { machineAnnotationValue string + machineAnnotated bool nodeAnnotationValue string + nodeAnnotated bool laNodePreserveValue string nodeName string machinePhase v1alpha1.MachinePhase @@ -4127,17 +4131,19 @@ var _ = Describe("machine", func() { var targetCoreObjects []runtime.Object // Build machine + machineAnnotations := map[string]string{} + if tc.setup.machineAnnotated { + machineAnnotations[machineutils.PreserveMachineAnnotationKey] = tc.setup.machineAnnotationValue + } + if tc.setup.laNodePreserveValue != "" { + machineAnnotations[machineutils.LastAppliedNodePreserveValueAnnotationKey] = tc.setup.laNodePreserveValue + } machine := &v1alpha1.Machine{ ObjectMeta: metav1.ObjectMeta{ - Namespace: testNamespace, - Name: "m1", - Labels: map[string]string{ - v1alpha1.NodeLabelKey: tc.setup.nodeName, - }, - Annotations: map[string]string{ - machineutils.PreserveMachineAnnotationKey: tc.setup.machineAnnotationValue, - machineutils.LastAppliedNodePreserveValueAnnotationKey: tc.setup.laNodePreserveValue, - }, + Namespace: testNamespace, + Name: "m1", + Labels: map[string]string{v1alpha1.NodeLabelKey: tc.setup.nodeName}, + Annotations: machineAnnotations, }, Status: v1alpha1.MachineStatus{ CurrentStatus: v1alpha1.CurrentStatus{ Phase: tc.setup.machinePhase, @@ -4149,12 +4155,14 @@ var _ = Describe("machine", func() { controlMachineObjects = append(controlMachineObjects, machine) if tc.setup.nodeName != "" && tc.setup.nodeName != "invalid" { + nodeAnnotations := map[string]string{} + if tc.setup.nodeAnnotated { + nodeAnnotations[machineutils.PreserveMachineAnnotationKey] = tc.setup.nodeAnnotationValue + } node := &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ - Name: tc.setup.nodeName, - Annotations: map[string]string{ - machineutils.PreserveMachineAnnotationKey: tc.setup.nodeAnnotationValue, - }, + Name: tc.setup.nodeName, + Annotations: nodeAnnotations, }, Spec: corev1.NodeSpec{ Unschedulable: tc.setup.nodeUnschedulable, @@ -4223,6 +4231,7 @@ var _ = Describe("machine", func() { Entry("when preserve annotation 'now' is added to node of Running machine, should successfully start preservation", testCase{ setup: setup{ nodeAnnotationValue: machineutils.PreserveMachineAnnotationValueNow, + nodeAnnotated: true, nodeName: "node-1", machinePhase: v1alpha1.MachineRunning, }, @@ -4238,6 +4247,7 @@ var _ = Describe("machine", func() { Entry("when preserve annotation 'when-failed' added to node of Running machine, should not start preservation", testCase{ setup: setup{ nodeAnnotationValue: machineutils.PreserveMachineAnnotationValueWhenFailed, + nodeAnnotated: true, nodeName: "node-1", machinePhase: v1alpha1.MachineRunning, }, @@ -4251,6 +4261,7 @@ var _ = Describe("machine", func() { Entry("when node of Failed machine has annotation `when-failed`, should start preservation", testCase{ setup: setup{ nodeAnnotationValue: machineutils.PreserveMachineAnnotationValueWhenFailed, + nodeAnnotated: true, nodeName: "node-1", machinePhase: v1alpha1.MachineFailed, }, @@ -4266,6 +4277,7 @@ var _ = Describe("machine", func() { Entry("when node of preserved machine is annotated with preserve value 'false', should stop preservation", testCase{ setup: setup{ nodeAnnotationValue: machineutils.PreserveMachineAnnotationValueFalse, + nodeAnnotated: true, nodeName: "node-1", machinePhase: v1alpha1.MachineRunning, preserveExpiryTime: &metav1.Time{Time: metav1.Now().Add(1 * time.Hour)}, @@ -4280,6 +4292,7 @@ var _ = Describe("machine", func() { Entry("when machine is annotated for auto-preservation by MCM, should start preservation", testCase{ setup: setup{ machineAnnotationValue: machineutils.PreserveMachineAnnotationValuePreservedByMCM, + machineAnnotated: true, nodeName: "node-1", machinePhase: v1alpha1.MachineFailed, }, @@ -4292,22 +4305,10 @@ var _ = Describe("machine", func() { machineAnnotationValue: machineutils.PreserveMachineAnnotationValuePreservedByMCM, }, }), - Entry("when node is annotated and preservation times out, should stop preservation", testCase{ - setup: setup{ - nodeAnnotationValue: machineutils.PreserveMachineAnnotationValueNow, - nodeName: "node-1", - machinePhase: v1alpha1.MachineRunning, - preserveExpiryTime: &metav1.Time{Time: metav1.Now().Add(-1 * time.Minute)}, - }, - expect: expect{ - preserveExpiryTimeIsSet: false, - nodeCondition: &corev1.NodeCondition{Type: v1alpha1.NodePreserved, Status: corev1.ConditionFalse}, - retry: machineutils.LongRetry, - }, - }), Entry("when machine is annotated with preserve=now and preservation times out, should stop preservation, and remove annotation", testCase{ setup: setup{ machineAnnotationValue: machineutils.PreserveMachineAnnotationValueNow, + machineAnnotated: true, nodeName: "node-1", machinePhase: v1alpha1.MachineRunning, preserveExpiryTime: &metav1.Time{Time: metav1.Now().Add(-1 * time.Minute)}, @@ -4321,6 +4322,7 @@ var _ = Describe("machine", func() { Entry("when machine is annotated with preserve=when-failed and preservation times out, should stop preservation, and remove annotation", testCase{ setup: setup{ machineAnnotationValue: machineutils.PreserveMachineAnnotationValueWhenFailed, + machineAnnotated: true, nodeName: "node-1", machinePhase: v1alpha1.MachineFailed, preserveExpiryTime: &metav1.Time{Time: metav1.Now().Add(-1 * time.Minute)}, @@ -4331,23 +4333,10 @@ var _ = Describe("machine", func() { retry: machineutils.LongRetry, }, }), - Entry("when invalid preserve annotation is added on node of un-preserved machine, should do nothing ", testCase{ - setup: setup{ - nodeAnnotationValue: "invalidValue", - nodeName: "node-1", - machinePhase: v1alpha1.MachineRunning, - }, - expect: expect{ - laNodePreserveValue: "invalidValue", - preserveExpiryTimeIsSet: false, - nodeCondition: nil, - retry: machineutils.LongRetry, - err: nil, - }, - }), Entry("when invalid preserve annotation is added on machine of un-preserved machine, and node is not annotated should do nothing ", testCase{ setup: setup{ machineAnnotationValue: "invalidValue", + machineAnnotated: true, nodeName: "node-1", machinePhase: v1alpha1.MachineRunning, }, @@ -4362,7 +4351,7 @@ var _ = Describe("machine", func() { Entry("when a machine is annotated with preserve=now, but has no backing node, should start preservation", testCase{ setup: setup{ machineAnnotationValue: machineutils.PreserveMachineAnnotationValueNow, - nodeAnnotationValue: "", + machineAnnotated: true, nodeName: "", machinePhase: v1alpha1.MachineUnknown, }, @@ -4377,6 +4366,7 @@ var _ = Describe("machine", func() { Entry("when preservation times out for a machine annotated with preserve=now, but has no backing node, should stop preservation", testCase{ setup: setup{ machineAnnotationValue: machineutils.PreserveMachineAnnotationValueNow, + machineAnnotated: true, nodeName: "", machinePhase: v1alpha1.MachineUnknown, preserveExpiryTime: &metav1.Time{Time: metav1.Now().Add(-1 * time.Minute)}, @@ -4391,7 +4381,7 @@ var _ = Describe("machine", func() { Entry("when a machine has a backing node, but node retrieval fails", testCase{ setup: setup{ machineAnnotationValue: machineutils.PreserveMachineAnnotationValueNow, - nodeAnnotationValue: "", + machineAnnotated: true, nodeName: "invalid", machinePhase: v1alpha1.MachineUnknown, }, @@ -4406,6 +4396,7 @@ var _ = Describe("machine", func() { Entry("when auto-preserved machine moves to Running, should stop preservation and remove auto-preserve annotation", testCase{ setup: setup{ machineAnnotationValue: machineutils.PreserveMachineAnnotationValuePreservedByMCM, + machineAnnotated: true, nodeName: "node-1", machinePhase: v1alpha1.MachineRunning, preserveExpiryTime: &metav1.Time{Time: metav1.Now().Add(1 * time.Hour)}, @@ -4421,8 +4412,9 @@ var _ = Describe("machine", func() { Entry("when node is annotated with 'now' and machine is annotated with 'when-failed', should start preservation and remove preserve annotation from machine", testCase{ setup: setup{ nodeAnnotationValue: machineutils.PreserveMachineAnnotationValueNow, + nodeAnnotated: true, machineAnnotationValue: machineutils.PreserveMachineAnnotationValueWhenFailed, - laNodePreserveValue: "", + machineAnnotated: true, nodeName: "node-1", machinePhase: v1alpha1.MachineRunning, }, @@ -4440,9 +4432,9 @@ var _ = Describe("machine", func() { // case possible when MCM goes down and node annotation value is cleared and machine is annotated Entry("when node and machine are found to be annotated with \"\", and 'now', respectively and last applied node preserve value is 'now', should stop preservation", testCase{ setup: setup{ - nodeAnnotationValue: "", laNodePreserveValue: "now", machineAnnotationValue: "now", + machineAnnotated: true, nodeName: "node-1", machinePhase: v1alpha1.MachineRunning, preserveExpiryTime: &metav1.Time{Time: metav1.Now().Add(1 * time.Hour)}, @@ -4461,6 +4453,7 @@ var _ = Describe("machine", func() { return testCase{ setup: setup{ machineAnnotationValue: machineutils.PreserveMachineAnnotationValueWhenFailed, + machineAnnotated: true, nodeName: "node-1", machinePhase: v1alpha1.MachineRunning, preserveExpiryTime: &metav1.Time{Time: metav1.Now().Add(1 * time.Hour)}, @@ -4480,6 +4473,7 @@ var _ = Describe("machine", func() { return testCase{ setup: setup{ machineAnnotationValue: machineutils.PreserveMachineAnnotationValueWhenFailed, + machineAnnotated: true, nodeName: "node-1", machinePhase: v1alpha1.MachineRunning, preserveExpiryTime: &metav1.Time{Time: metav1.Now().Add(-1 * time.Minute)}, @@ -4499,6 +4493,7 @@ var _ = Describe("machine", func() { return testCase{ setup: setup{ nodeAnnotationValue: machineutils.PreserveMachineAnnotationValueWhenFailed, + nodeAnnotated: true, nodeName: "node-1", machinePhase: v1alpha1.MachineFailed, nodeUnschedulable: true, @@ -4517,6 +4512,7 @@ var _ = Describe("machine", func() { return testCase{ setup: setup{ machineAnnotationValue: machineutils.PreserveMachineAnnotationValueNow, + machineAnnotated: true, nodeName: "node-1", machinePhase: v1alpha1.MachineFailed, nodeUnschedulable: true, @@ -4634,13 +4630,14 @@ var _ = Describe("machine", func() { defer trackers.Stop() waitForCacheSync(stop, c) - bound, err := c.isMachinePreservationBound(context.TODO(), machine) + preserveInfo, err := c.getPreserveStateInfo(machine) if tc.expect.err != nil { Expect(err).To(HaveOccurred()) Expect(err.Error()).To(Equal(tc.expect.err.Error())) } else { Expect(err).ToNot(HaveOccurred()) } + bound := c.isMachinePreservationBound(preserveInfo) Expect(bound).To(Equal(tc.expect.bound)) }, Entry("machine has no annotations, no preserveExpiryTime, and node has no preservation annotation", testCase{ diff --git a/pkg/util/provider/machinecontroller/machine_util.go b/pkg/util/provider/machinecontroller/machine_util.go index a3f37a845f..2d2d390e81 100644 --- a/pkg/util/provider/machinecontroller/machine_util.go +++ b/pkg/util/provider/machinecontroller/machine_util.go @@ -2421,11 +2421,9 @@ func (c *controller) stopPreservationIfActive(ctx context.Context, machine *v1al if apierrors.IsNotFound(err) { klog.Warningf("Node %q of machine %q not found. Proceeding to stop preservation on machine.", nodeName, machine.Name) // Node not found, proceed to delete annotations and clear preserveExpiryTime on machine - if removePreservationAnnotations { - machine, err = c.removePreserveAnnotationOnMachine(ctx, machine) - if err != nil { - return nil, err - } + machine, err = c.removePreserveAnnotationOnMachine(ctx, machine) + if err != nil { + return nil, err } machine, err = c.clearMachinePreserveExpiryTime(ctx, machine) if err != nil { @@ -2450,7 +2448,7 @@ func (c *controller) stopPreservationIfActive(ctx context.Context, machine *v1al return nil, err } // Step 2: remove annotations from node - err = c.removePreservationRelatedAnnotationsOnNode(ctx, updatedNode, removePreservationAnnotations) + updatedNode, err = c.removePreservationRelatedAnnotationsOnNode(ctx, updatedNode, removePreservationAnnotations) if err != nil { return nil, err } @@ -2463,7 +2461,7 @@ func (c *controller) stopPreservationIfActive(ctx context.Context, machine *v1al } // Step 4: uncordon the node if the machine has recovered to Running. if machine.Status.CurrentStatus.Phase == v1alpha1.MachineRunning { - if err = c.uncordonNodeIfCordoned(ctx, nodeName); err != nil { + if err = c.uncordonNodeIfCordoned(ctx, updatedNode); err != nil { return nil, err } } diff --git a/pkg/util/provider/machinecontroller/machine_util_test.go b/pkg/util/provider/machinecontroller/machine_util_test.go index 4813551f41..f33ac90eee 100644 --- a/pkg/util/provider/machinecontroller/machine_util_test.go +++ b/pkg/util/provider/machinecontroller/machine_util_test.go @@ -4097,24 +4097,6 @@ var _ = Describe("machine_util", func() { }, }, }), - Entry("when preserve=now, the machine has Failed, and the preservation is incomplete after step 1 - adding preserveExpiryTime", &testCase{ - setup: setup{ - machinePhase: machinev1.MachineFailed, - nodeName: "node-1", - preserveValue: machineutils.PreserveMachineAnnotationValueNow, - }, - expect: expect{ - err: nil, - isPreserveExpiryTimeSet: true, - isCAAnnotationPresent: true, - preserveNodeCondition: corev1.NodeCondition{ - Type: machinev1.NodePreserved, - Status: corev1.ConditionTrue, - Reason: machinev1.PreservedByUser, - Message: machinev1.PreservedNodeDrainSuccessful, - }, - }, - }), Entry("when preserve=now, the machine has Failed, and the preservation is incomplete at step 2 - adding CA annotations", &testCase{ setup: setup{ machinePhase: machinev1.MachineFailed, @@ -4624,7 +4606,7 @@ var _ = Describe("machine_util", func() { c, trackers := createController(stop, testNamespace, nil, nil, targetCoreObjects, nil, false) defer trackers.Stop() waitForCacheSync(stop, c) - err := c.removePreservationRelatedAnnotationsOnNode(context.TODO(), node, tc.setup.removePreserveAnnotation) + _, err := c.removePreservationRelatedAnnotationsOnNode(context.TODO(), node, tc.setup.removePreserveAnnotation) waitForCacheSync(stop, c) updatedNode, getErr := c.targetCoreClient.CoreV1().Nodes().Get(context.TODO(), node.Name, metav1.GetOptions{}) Expect(getErr).To(BeNil()) diff --git a/pkg/util/provider/machinecontroller/node.go b/pkg/util/provider/machinecontroller/node.go index ed54d5890e..5cbafdba33 100644 --- a/pkg/util/provider/machinecontroller/node.go +++ b/pkg/util/provider/machinecontroller/node.go @@ -340,43 +340,24 @@ func addedOrRemovedEssentialTaints(oldNode, node *corev1.Node, taintKeys []strin return false } -func (c *controller) getNodePreserveAnnotationValue(nodeName string) (string, error) { - node, err := c.nodeLister.Get(nodeName) - if err != nil { - if apierrors.IsNotFound(err) { - return "", nil - } - klog.Errorf("error fetching node %q: %v", nodeName, err) - return "", err - } - return node.Annotations[machineutils.PreserveMachineAnnotationKey], nil -} - -func (c *controller) uncordonNodeIfCordoned(ctx context.Context, nodeName string) error { - node, err := c.nodeLister.Get(nodeName) - if err != nil { - if apierrors.IsNotFound(err) { - return nil - } - return err - } +func (c *controller) uncordonNodeIfCordoned(ctx context.Context, node *corev1.Node) error { if !node.Spec.Unschedulable { return nil } nodeClone := node.DeepCopy() nodeClone.Spec.Unschedulable = false - _, err = c.targetCoreClient.CoreV1().Nodes().Update(ctx, nodeClone, metav1.UpdateOptions{}) + _, err := c.targetCoreClient.CoreV1().Nodes().Update(ctx, nodeClone, metav1.UpdateOptions{}) if err != nil { - klog.Errorf("error uncordoning node %q: %v", nodeName, err) + klog.Errorf("error uncordoning node %q: %v", node.Name, err) } return err } // removePreservationRelatedAnnotationsOnNode removes the cluster-autoscaler annotation that disables scale down of preserved node -func (c *controller) removePreservationRelatedAnnotationsOnNode(ctx context.Context, node *corev1.Node, removePreserveAnnotation bool) error { +func (c *controller) removePreservationRelatedAnnotationsOnNode(ctx context.Context, node *corev1.Node, removePreserveAnnotation bool) (*corev1.Node, error) { // Check if annotation already absent if node.Annotations == nil { - return nil + return node, nil } updateRequired := false nodeCopy := node.DeepCopy() @@ -392,14 +373,14 @@ func (c *controller) removePreservationRelatedAnnotationsOnNode(ctx context.Cont updateRequired = true } if !updateRequired { - return nil + return node, nil } - _, err := c.targetCoreClient.CoreV1().Nodes().Update(ctx, nodeCopy, metav1.UpdateOptions{}) + updatedNode, err := c.targetCoreClient.CoreV1().Nodes().Update(ctx, nodeCopy, metav1.UpdateOptions{}) if err != nil { klog.Errorf("node UPDATE failed for node %q. Retrying, error: %s", node.Name, err) - return err + return nil, err } - return nil + return updatedNode, nil } // addCAScaleDownDisabledAnnotationOnNode adds the cluster-autoscaler annotation to disable scale down of preserved node From 200ce00ac429dea837992b1ab47851f961a6d0f2 Mon Sep 17 00:00:00 2001 From: thiyyakat Date: Wed, 24 Jun 2026 16:26:49 +0530 Subject: [PATCH 14/19] Address review comment: Rename `PreserveMachineAnnotationValuePreservedByMCM` to `PreserveMachineAnnotationValueAutoPreserved` --- pkg/controller/controller_utils.go | 2 +- pkg/controller/deployment_machineset_util.go | 2 +- pkg/controller/machineset.go | 8 ++++---- pkg/controller/machineset_test.go | 8 ++++---- pkg/util/provider/machinecontroller/machine.go | 2 +- pkg/util/provider/machinecontroller/machine_test.go | 6 +++--- pkg/util/provider/machinecontroller/machine_util.go | 2 +- pkg/util/provider/machinecontroller/machine_util_test.go | 4 ++-- pkg/util/provider/machineutils/utils.go | 6 +++--- 9 files changed, 20 insertions(+), 20 deletions(-) diff --git a/pkg/controller/controller_utils.go b/pkg/controller/controller_utils.go index 4bfda891f9..ba4fe92953 100644 --- a/pkg/controller/controller_utils.go +++ b/pkg/controller/controller_utils.go @@ -747,7 +747,7 @@ func (s ActiveMachines) Less(i, j int) bool { } if isPreservedI { // both machines are preserved, in which case deprioritize explicitly preserved machines over auto-preserved isAutoPreserved := func(m *v1alpha1.Machine) bool { - return m.Annotations[machineutils.PreserveMachineAnnotationKey] == machineutils.PreserveMachineAnnotationValuePreservedByMCM + return m.Annotations[machineutils.PreserveMachineAnnotationKey] == machineutils.PreserveMachineAnnotationValueAutoPreserved } isAutoPreservedI := isAutoPreserved(s[i]) isAutoPreservedJ := isAutoPreserved(s[j]) diff --git a/pkg/controller/deployment_machineset_util.go b/pkg/controller/deployment_machineset_util.go index ec80dceb80..b198f29f61 100644 --- a/pkg/controller/deployment_machineset_util.go +++ b/pkg/controller/deployment_machineset_util.go @@ -138,7 +138,7 @@ func calculateMachineSetStatus(is *v1alpha1.MachineSet, filteredMachines []*v1al } // Count number of failed machines annotated with PreserveMachineAnnotationValuePreservedByMCM // Cannot combine with above if block in case auto-preservation is not complete yet - if machine.Annotations[machineutils.PreserveMachineAnnotationKey] == machineutils.PreserveMachineAnnotationValuePreservedByMCM { + if machine.Annotations[machineutils.PreserveMachineAnnotationKey] == machineutils.PreserveMachineAnnotationValueAutoPreserved { autoPreserveFailedMachineCount++ } } diff --git a/pkg/controller/machineset.go b/pkg/controller/machineset.go index 2425cadf49..0a5a82ef71 100644 --- a/pkg/controller/machineset.go +++ b/pkg/controller/machineset.go @@ -920,7 +920,7 @@ func (c *controller) shouldFailedMachineBeTerminated(machine *v1alpha1.Machine) return true } switch preserveValue { - case machineutils.PreserveMachineAnnotationValueWhenFailed, machineutils.PreserveMachineAnnotationValueNow, machineutils.PreserveMachineAnnotationValuePreservedByMCM: // this is in case preservation process is not complete yet + case machineutils.PreserveMachineAnnotationValueWhenFailed, machineutils.PreserveMachineAnnotationValueNow, machineutils.PreserveMachineAnnotationValueAutoPreserved: // this is in case preservation process is not complete yet return false case machineutils.PreserveMachineAnnotationValueFalse: return true @@ -978,7 +978,7 @@ func (c *controller) manageAutoPreservationOfFailedMachines(ctx context.Context, func (c *controller) stopAutoPreservationForMachines(ctx context.Context, machines []*v1alpha1.Machine, numToStop int) int { var autoPreservedMachines []*v1alpha1.Machine for _, m := range machines { - if m.Annotations[machineutils.PreserveMachineAnnotationKey] == machineutils.PreserveMachineAnnotationValuePreservedByMCM { + if m.Annotations[machineutils.PreserveMachineAnnotationKey] == machineutils.PreserveMachineAnnotationValueAutoPreserved { autoPreservedMachines = append(autoPreservedMachines, m) } } @@ -998,7 +998,7 @@ func (c *controller) stopAutoPreservationForMachines(ctx context.Context, machin klog.V(2).Infof("Removing auto-preservation annotation from machine %q as AutoPreserveFailedMachineMax is breached", m.Name) updatedMachine, err := machineutils.UpdateMachineWithRetries(ctx, c.controlMachineClient.Machines(m.Namespace), c.machineLister, m.Namespace, m.Name, removeAutoPreserveAnnotationFromMachine) if err != nil { - klog.Warningf("Error removing %q=%q annotation from machine %q: %v.", machineutils.PreserveMachineAnnotationKey, machineutils.PreserveMachineAnnotationValuePreservedByMCM, m.Name, err) + klog.Warningf("Error removing %q=%q annotation from machine %q: %v.", machineutils.PreserveMachineAnnotationKey, machineutils.PreserveMachineAnnotationValueAutoPreserved, m.Name, err) continue } autoPreservedMachines[index] = updatedMachine @@ -1011,7 +1011,7 @@ func addAutoPreserveAnnotationOnMachine(machineToUpdate *v1alpha1.Machine) error if machineToUpdate.Annotations == nil { machineToUpdate.Annotations = make(map[string]string) } - machineToUpdate.Annotations[machineutils.PreserveMachineAnnotationKey] = machineutils.PreserveMachineAnnotationValuePreservedByMCM + machineToUpdate.Annotations[machineutils.PreserveMachineAnnotationKey] = machineutils.PreserveMachineAnnotationValueAutoPreserved return nil } diff --git a/pkg/controller/machineset_test.go b/pkg/controller/machineset_test.go index fe4b89a371..176cd6ca6f 100644 --- a/pkg/controller/machineset_test.go +++ b/pkg/controller/machineset_test.go @@ -2163,10 +2163,10 @@ var _ = Describe("machineset", func() { updatedMachine3, _ := c.controlMachineClient.Machines(testNamespace).Get(context.TODO(), testMachine3.Name, metav1.GetOptions{}) updatedMachine4, _ := c.controlMachineClient.Machines(testNamespace).Get(context.TODO(), testMachine4.Name, metav1.GetOptions{}) preservedCount := 0 - if updatedMachine1.Annotations != nil && updatedMachine1.Annotations[machineutils.PreserveMachineAnnotationKey] == machineutils.PreserveMachineAnnotationValuePreservedByMCM { + if updatedMachine1.Annotations != nil && updatedMachine1.Annotations[machineutils.PreserveMachineAnnotationKey] == machineutils.PreserveMachineAnnotationValueAutoPreserved { preservedCount++ } - if updatedMachine2.Annotations != nil && updatedMachine2.Annotations[machineutils.PreserveMachineAnnotationKey] == machineutils.PreserveMachineAnnotationValuePreservedByMCM { + if updatedMachine2.Annotations != nil && updatedMachine2.Annotations[machineutils.PreserveMachineAnnotationKey] == machineutils.PreserveMachineAnnotationValueAutoPreserved { preservedCount++ } // Running machine should not be auto-preserved in any of the cases @@ -2176,7 +2176,7 @@ var _ = Describe("machineset", func() { for _, m := range tc.setup.additionalMachines { updatedMachine, _ := c.controlMachineClient.Machines(testNamespace).Get(context.TODO(), m.Name, metav1.GetOptions{}) - if updatedMachine.Annotations[machineutils.PreserveMachineAnnotationKey] == machineutils.PreserveMachineAnnotationValuePreservedByMCM { + if updatedMachine.Annotations[machineutils.PreserveMachineAnnotationKey] == machineutils.PreserveMachineAnnotationValueAutoPreserved { preservedCount++ } } @@ -2237,7 +2237,7 @@ var _ = Describe("machineset", func() { Name: "machine-5", Namespace: testNamespace, Annotations: map[string]string{ - machineutils.PreserveMachineAnnotationKey: machineutils.PreserveMachineAnnotationValuePreservedByMCM, + machineutils.PreserveMachineAnnotationKey: machineutils.PreserveMachineAnnotationValueAutoPreserved, }, }, Status: machinev1.MachineStatus{ diff --git a/pkg/util/provider/machinecontroller/machine.go b/pkg/util/provider/machinecontroller/machine.go index 0bec72ecd3..112ab4e95a 100644 --- a/pkg/util/provider/machinecontroller/machine.go +++ b/pkg/util/provider/machinecontroller/machine.go @@ -824,7 +824,7 @@ func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1a } else { clone, err = c.preserveMachine(ctx, clone, effectivePreserveValue) } - case machineutils.PreserveMachineAnnotationValuePreservedByMCM: + case machineutils.PreserveMachineAnnotationValueAutoPreserved: if !machineutils.IsMachineFailed(clone) || machineutils.IsMachinePreservationExpired(clone) { // To prevent incorrect re-preservation of a recovered, previously auto-preserved machine on future failures // (since the autoPreserveFailedMachineCount maintained by the machineSetController, may have changed), diff --git a/pkg/util/provider/machinecontroller/machine_test.go b/pkg/util/provider/machinecontroller/machine_test.go index 51543bce61..920f602ba9 100644 --- a/pkg/util/provider/machinecontroller/machine_test.go +++ b/pkg/util/provider/machinecontroller/machine_test.go @@ -4291,7 +4291,7 @@ var _ = Describe("machine", func() { }), Entry("when machine is annotated for auto-preservation by MCM, should start preservation", testCase{ setup: setup{ - machineAnnotationValue: machineutils.PreserveMachineAnnotationValuePreservedByMCM, + machineAnnotationValue: machineutils.PreserveMachineAnnotationValueAutoPreserved, machineAnnotated: true, nodeName: "node-1", machinePhase: v1alpha1.MachineFailed, @@ -4302,7 +4302,7 @@ var _ = Describe("machine", func() { Type: v1alpha1.NodePreserved, Status: corev1.ConditionTrue}, retry: machineutils.LongRetry, - machineAnnotationValue: machineutils.PreserveMachineAnnotationValuePreservedByMCM, + machineAnnotationValue: machineutils.PreserveMachineAnnotationValueAutoPreserved, }, }), Entry("when machine is annotated with preserve=now and preservation times out, should stop preservation, and remove annotation", testCase{ @@ -4395,7 +4395,7 @@ var _ = Describe("machine", func() { }), Entry("when auto-preserved machine moves to Running, should stop preservation and remove auto-preserve annotation", testCase{ setup: setup{ - machineAnnotationValue: machineutils.PreserveMachineAnnotationValuePreservedByMCM, + machineAnnotationValue: machineutils.PreserveMachineAnnotationValueAutoPreserved, machineAnnotated: true, nodeName: "node-1", machinePhase: v1alpha1.MachineRunning, diff --git a/pkg/util/provider/machinecontroller/machine_util.go b/pkg/util/provider/machinecontroller/machine_util.go index 2d2d390e81..1b89b63d07 100644 --- a/pkg/util/provider/machinecontroller/machine_util.go +++ b/pkg/util/provider/machinecontroller/machine_util.go @@ -2525,7 +2525,7 @@ func computeNewNodePreservedCondition(currentStatus v1alpha1.CurrentStatus, pres newNodePreservedCondition.Message = fmt.Sprintf("%s %v.", preserveExpiryMessageSuffix, currentStatus.PreserveExpiryTime) needsUpdate = true } - if preserveValue == machineutils.PreserveMachineAnnotationValuePreservedByMCM { + if preserveValue == machineutils.PreserveMachineAnnotationValueAutoPreserved { newNodePreservedCondition.Reason = v1alpha1.PreservedByMCM } else { newNodePreservedCondition.Reason = v1alpha1.PreservedByUser diff --git a/pkg/util/provider/machinecontroller/machine_util_test.go b/pkg/util/provider/machinecontroller/machine_util_test.go index f33ac90eee..7e58a56b1a 100644 --- a/pkg/util/provider/machinecontroller/machine_util_test.go +++ b/pkg/util/provider/machinecontroller/machine_util_test.go @@ -4163,7 +4163,7 @@ var _ = Describe("machine_util", func() { setup: setup{ machinePhase: machinev1.MachineFailed, nodeName: "node-1", - preserveValue: machineutils.PreserveMachineAnnotationValuePreservedByMCM, + preserveValue: machineutils.PreserveMachineAnnotationValueAutoPreserved, }, expect: expect{ err: nil, @@ -4424,7 +4424,7 @@ var _ = Describe("machine_util", func() { LastUpdateTime: metav1.Now(), PreserveExpiryTime: preserveExpiryTime, }, - preserveValue: machineutils.PreserveMachineAnnotationValuePreservedByMCM, + preserveValue: machineutils.PreserveMachineAnnotationValueAutoPreserved, drainErr: nil, existingNodeCondition: nil, }, diff --git a/pkg/util/provider/machineutils/utils.go b/pkg/util/provider/machineutils/utils.go index 4d536a3d82..a3b5b8c7eb 100644 --- a/pkg/util/provider/machineutils/utils.go +++ b/pkg/util/provider/machineutils/utils.go @@ -109,10 +109,10 @@ const ( // a Machine be preserved if and when it enters Failed phase PreserveMachineAnnotationValueWhenFailed = "when-failed" - // PreserveMachineAnnotationValuePreservedByMCM is the annotation value used by the machineset controller to + // PreserveMachineAnnotationValueAutoPreserved is the annotation value used by the machineset controller to // indicate to the machine controller that the machine must be auto-preserved. // The AutoPreserveFailedMachineMax, set on the MCD, is enforced based on the number of machines annotated with this value. - PreserveMachineAnnotationValuePreservedByMCM = "auto-preserved" + PreserveMachineAnnotationValueAutoPreserved = "auto-preserved" // PreserveMachineAnnotationValueFalse is the annotation value used to // 1) indicate to MCM that a machine must not be auto-preserved on failure @@ -121,7 +121,7 @@ const ( ) // AllowedPreserveAnnotationValues contains the allowed values for the preserve annotation -var AllowedPreserveAnnotationValues = sets.New(PreserveMachineAnnotationValueNow, PreserveMachineAnnotationValueWhenFailed, PreserveMachineAnnotationValuePreservedByMCM, PreserveMachineAnnotationValueFalse) +var AllowedPreserveAnnotationValues = sets.New(PreserveMachineAnnotationValueNow, PreserveMachineAnnotationValueWhenFailed, PreserveMachineAnnotationValueAutoPreserved, PreserveMachineAnnotationValueFalse) // RetryPeriod is an alias for specifying the retry period type RetryPeriod time.Duration From 08b6cd376bc236a7b9457613d973c441fb0d4a5d Mon Sep 17 00:00:00 2001 From: thiyyakat Date: Wed, 24 Jun 2026 16:30:37 +0530 Subject: [PATCH 15/19] Correct docstring for `PreserveMachineAnnotationValueFalse` --- pkg/util/provider/machineutils/utils.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/util/provider/machineutils/utils.go b/pkg/util/provider/machineutils/utils.go index a3b5b8c7eb..1c0f99af3d 100644 --- a/pkg/util/provider/machineutils/utils.go +++ b/pkg/util/provider/machineutils/utils.go @@ -116,7 +116,7 @@ const ( // PreserveMachineAnnotationValueFalse is the annotation value used to // 1) indicate to MCM that a machine must not be auto-preserved on failure - // and, 2) to stop auto-preservation of a machine that is already auto-preserved by MCM. + // and, 2) to stop preservation of a machine that is already preserved by MCM. PreserveMachineAnnotationValueFalse = "false" ) From cf6d15fe683d504b7501cff09ae554d04ad7f764 Mon Sep 17 00:00:00 2001 From: thiyyakat Date: Wed, 24 Jun 2026 16:39:45 +0530 Subject: [PATCH 16/19] Log node name in `uncordonNodeIfCordoned`. --- pkg/util/provider/machinecontroller/node.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pkg/util/provider/machinecontroller/node.go b/pkg/util/provider/machinecontroller/node.go index 5cbafdba33..174805c7c1 100644 --- a/pkg/util/provider/machinecontroller/node.go +++ b/pkg/util/provider/machinecontroller/node.go @@ -349,8 +349,10 @@ func (c *controller) uncordonNodeIfCordoned(ctx context.Context, node *corev1.No _, err := c.targetCoreClient.CoreV1().Nodes().Update(ctx, nodeClone, metav1.UpdateOptions{}) if err != nil { klog.Errorf("error uncordoning node %q: %v", node.Name, err) + return err } - return err + klog.Infof("Successfully uncordoned node %q", node.Name) + return nil } // removePreservationRelatedAnnotationsOnNode removes the cluster-autoscaler annotation that disables scale down of preserved node From cb105e790a1155672570a97a19413b59b45b2b66 Mon Sep 17 00:00:00 2001 From: thiyyakat Date: Thu, 25 Jun 2026 18:37:35 +0530 Subject: [PATCH 17/19] Address review comments - nits - if not found, machine annotation value must be enforced - if machine annotation value is invalid, log and continue assuming it does not exist - gate `removePreserveAnnotationOnMachine` when node is not found --- pkg/controller/deployment_machineset_util.go | 2 +- .../provider/machinecontroller/machine.go | 54 +-- .../machinecontroller/machine_test.go | 331 +++++++++++++++++- .../machinecontroller/machine_util.go | 9 +- 4 files changed, 366 insertions(+), 30 deletions(-) diff --git a/pkg/controller/deployment_machineset_util.go b/pkg/controller/deployment_machineset_util.go index b198f29f61..0b2294391d 100644 --- a/pkg/controller/deployment_machineset_util.go +++ b/pkg/controller/deployment_machineset_util.go @@ -136,7 +136,7 @@ func calculateMachineSetStatus(is *v1alpha1.MachineSet, filteredMachines []*v1al } failedMachines = append(failedMachines, machineSummary) } - // Count number of failed machines annotated with PreserveMachineAnnotationValuePreservedByMCM + // Count number of failed machines annotated with PreserveAnnotationValueAutoPreserved // Cannot combine with above if block in case auto-preservation is not complete yet if machine.Annotations[machineutils.PreserveMachineAnnotationKey] == machineutils.PreserveMachineAnnotationValueAutoPreserved { autoPreserveFailedMachineCount++ diff --git a/pkg/util/provider/machinecontroller/machine.go b/pkg/util/provider/machinecontroller/machine.go index 112ab4e95a..c38926bb2f 100644 --- a/pkg/util/provider/machinecontroller/machine.go +++ b/pkg/util/provider/machinecontroller/machine.go @@ -753,7 +753,7 @@ type preserveStateInfo struct { nodeValue string machineValue string lastAppliedNodeValue string - PreserveExpiryTimeSet bool + preserveExpiryTimeSet bool } // manageMachinePreservation manages machine preservation based on the preserve annotation values on the node and machine objects. @@ -761,7 +761,7 @@ func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1a defer func() { if err != nil { if apierrors.IsNotFound(err) { - klog.Warningf("Error during preservation flow:%v", err.Error()) + klog.Warningf("Error during preservation flow: %v", err.Error()) retry = machineutils.LongRetry err = nil } else if apierrors.IsConflict(err) { @@ -774,13 +774,16 @@ func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1a } }() nodeName := machine.Labels[v1alpha1.NodeLabelKey] - preserveInfo, err := c.getPreserveStateInfo(machine) - if err != nil { + // We buffer the error returned here until we can tell if the machine is preservation-bound. + preserveInfo, getErr := c.getPreserveStateInfo(machine) + if preserveInfo == nil { // machine object not annotated with preservation state and error fetching node object return } if preserveInfo.machineAnnotated && !machineutils.AllowedPreserveAnnotationValues.Has(preserveInfo.machineValue) { + // If machine is annotated incorrectly, log and proceed as though machine is not annotated. klog.Warningf("Preserve annotation %q=%q on machine %q is invalid", machineutils.PreserveMachineAnnotationKey, preserveInfo.machineValue, machine.Name) - return + preserveInfo.machineAnnotated = false + preserveInfo.machineValue = "" } if preserveInfo.nodeAnnotated && !machineutils.AllowedPreserveAnnotationValues.Has(preserveInfo.nodeValue) { klog.Warningf("Preserve annotation %q=%q on node %q backing machine %q is invalid", machineutils.PreserveMachineAnnotationKey, preserveInfo.nodeValue, nodeName, machine.Name) @@ -788,14 +791,20 @@ func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1a } preservationBound := c.isMachinePreservationBound(preserveInfo) if !preservationBound { - // we clear the error here to prevent preservation logic from interfering with non-preservation-bound machines + // We clear the error here to prevent preservation logic from interfering with non-preservation-bound machines. err = nil return - } else if err != nil { - return + } else if getErr != nil { + if !apierrors.IsNotFound(getErr) { + err = getErr + return + } + klog.Warningf("Couldn't find node %q for machine %q", nodeName, machine.Name) + err = nil } - - effectivePreserveValue := getEffectivePreservationAnnotations(preserveInfo) + // Note: when the backing node cannot be found, we assume the machine's annotation value needs to be enforced to enable + // preservation of the machine object. + effectivePreserveValue := getEffectivePreservationAnnotations(preserveInfo, getErr) var removeAnnotations, preservationStopped bool clone := machine.DeepCopy() @@ -869,7 +878,7 @@ func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1a // enforce machine's preserve annotation. // Otherwise, the node annotation takes precedence (even if now empty/removed). // -// This is required to handle the following scenario: +// lastAppliedNodeValue is required to handle the following scenario: // // T1: Node and Machine both have the same annotation with the same value. (MCM is up and running). // T2 (T2 > T1): MCM went down. @@ -877,7 +886,12 @@ func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1a // T4 (T4 > T3): MCM came back up. // At T4 it sees a Node with no preserve annotation but a Machine with a preserve annotation. // It continues to preserve the machine. -func getEffectivePreservationAnnotations(info *preserveStateInfo) string { +func getEffectivePreservationAnnotations(info *preserveStateInfo, getPreserveStateErr error) string { + // If the node cannot be found, nodeValue is "". + // In this case, we want the machine's annotation value to be enforced. + if apierrors.IsNotFound(getPreserveStateErr) { + return info.machineValue + } // If there is no active node annotation AND no previously-applied node annotation, // enforce machine's preserve annotation. // Otherwise, the node annotation takes precedence (even if now empty/removed). @@ -889,7 +903,7 @@ func getEffectivePreservationAnnotations(info *preserveStateInfo) string { func (c *controller) isMachinePreservationBound(info *preserveStateInfo) bool { // if machine has no preservation state, the machine is not preservation-bound - if !info.PreserveExpiryTimeSet && !info.nodeAnnotated && !info.machineAnnotated && info.lastAppliedNodeValue == "" { + if !info.preserveExpiryTimeSet && !info.nodeAnnotated && !info.machineAnnotated && info.lastAppliedNodeValue == "" { return false } return true @@ -897,28 +911,22 @@ func (c *controller) isMachinePreservationBound(info *preserveStateInfo) bool { func (c *controller) getPreserveStateInfo(machine *v1alpha1.Machine) (*preserveStateInfo, error) { var info preserveStateInfo - var err error if machine.Annotations != nil { info.machineValue, info.machineAnnotated = machine.Annotations[machineutils.PreserveMachineAnnotationKey] info.lastAppliedNodeValue = machine.Annotations[machineutils.LastAppliedNodePreserveValueAnnotationKey] } if machine.Status.CurrentStatus.PreserveExpiryTime != nil { - info.PreserveExpiryTimeSet = true + info.preserveExpiryTimeSet = true } nodeName := machine.Labels[v1alpha1.NodeLabelKey] if nodeName != "" { - var node *corev1.Node - node, err = c.nodeLister.Get(nodeName) + node, err := c.nodeLister.Get(nodeName) if err != nil { - if apierrors.IsNotFound(err) { - klog.Warningf("Couldn't find node %q for machine %q", nodeName, machine.Name) - err = nil - } return &info, err } info.nodeValue, info.nodeAnnotated = node.Annotations[machineutils.PreserveMachineAnnotationKey] } - return &info, err + return &info, nil } // shouldAnnotationsBeUpdatedOnMachine returns true when the machine's annotation tracking needs @@ -940,7 +948,7 @@ func shouldAnnotationsBeUpdatedOnMachine(removeAnnotations bool, preserveInfo *p } // updatePreserveAnnotationOnMachine clears the machine's PreserveMachineAnnotationKey and sets -// LastAppliedNodePreserveValueAnnotationKey to nodeValue. +// [machineutils.LastAppliedNodePreserveValueAnnotationKey] to nodeValue. func (c *controller) updatePreserveAnnotationOnMachine(ctx context.Context, nodeValue string, machine *v1alpha1.Machine) error { clone := machine.DeepCopy() if clone.Annotations == nil { diff --git a/pkg/util/provider/machinecontroller/machine_test.go b/pkg/util/provider/machinecontroller/machine_test.go index 920f602ba9..29678ed945 100644 --- a/pkg/util/provider/machinecontroller/machine_test.go +++ b/pkg/util/provider/machinecontroller/machine_test.go @@ -3977,7 +3977,7 @@ var _ = Describe("machine", func() { machineValue: tc.setup.machineAnnotations[machineutils.PreserveMachineAnnotationKey], lastAppliedNodeValue: tc.setup.machineAnnotations[machineutils.LastAppliedNodePreserveValueAnnotationKey], } - preserveValue := getEffectivePreservationAnnotations(info) + preserveValue := getEffectivePreservationAnnotations(info, nil) Expect(preserveValue).To(Equal(tc.expect.effectivePreserveValue)) }, Entry("when node is not annotated and laNodeAnnotationValue is empty, should return machine's annotation value and empty string", testCase{ @@ -4696,7 +4696,334 @@ var _ = Describe("machine", func() { nodeName: "node-1", includeNode: false, }, - expect: expect{bound: false}, + expect: expect{ + bound: false, + err: fmt.Errorf("node %q not found", "node-1"), + }, + }), + ) + }) + + Describe("#getPreserveStateInfo", func() { + type setup struct { + machineAnnotations map[string]string + preserveExpiryTime *metav1.Time + nodeName string + nodeAnnotations map[string]string + includeNode bool + } + type expect struct { + machineAnnotated bool + machineValue string + nodeAnnotated bool + nodeValue string + lastAppliedNodeValue string + preserveExpiryTimeSet bool + err error + } + type testCase struct { + setup setup + expect expect + } + + DescribeTable("getPreserveStateInfo scenarios", + func(tc testCase) { + stop := make(chan struct{}) + defer close(stop) + + machine := &v1alpha1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: testNamespace, + Name: "m1", + Labels: map[string]string{v1alpha1.NodeLabelKey: tc.setup.nodeName}, + Annotations: tc.setup.machineAnnotations, + }, + Status: v1alpha1.MachineStatus{ + CurrentStatus: v1alpha1.CurrentStatus{ + PreserveExpiryTime: tc.setup.preserveExpiryTime, + }, + }, + } + + var targetCoreObjects []runtime.Object + if tc.setup.includeNode { + targetCoreObjects = append(targetCoreObjects, &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: tc.setup.nodeName, + Annotations: tc.setup.nodeAnnotations, + }, + }) + } + + c, trackers := createController(stop, testNamespace, []runtime.Object{machine}, nil, targetCoreObjects, nil, false) + defer trackers.Stop() + waitForCacheSync(stop, c) + + info, err := c.getPreserveStateInfo(machine) + if tc.expect.err != nil { + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring(tc.expect.err.Error())) + } else { + Expect(err).ToNot(HaveOccurred()) + } + Expect(info.machineAnnotated).To(Equal(tc.expect.machineAnnotated)) + Expect(info.machineValue).To(Equal(tc.expect.machineValue)) + Expect(info.nodeAnnotated).To(Equal(tc.expect.nodeAnnotated)) + Expect(info.nodeValue).To(Equal(tc.expect.nodeValue)) + Expect(info.lastAppliedNodeValue).To(Equal(tc.expect.lastAppliedNodeValue)) + Expect(info.preserveExpiryTimeSet).To(Equal(tc.expect.preserveExpiryTimeSet)) + }, + Entry("machine has no annotations and no node label", testCase{ + setup: setup{}, + expect: expect{}, + }), + Entry("machine has preserve annotation and node is not annotated", testCase{ + setup: setup{ + nodeName: "node-1", + includeNode: true, + machineAnnotations: map[string]string{ + machineutils.PreserveMachineAnnotationKey: machineutils.PreserveMachineAnnotationValueNow, + }, + }, + expect: expect{ + machineAnnotated: true, + machineValue: machineutils.PreserveMachineAnnotationValueNow, + }, + }), + Entry("machine has last-applied node preserve value annotation but no preserve annotations on node or machine", testCase{ + setup: setup{ + nodeName: "node-1", + includeNode: true, + machineAnnotations: map[string]string{ + machineutils.LastAppliedNodePreserveValueAnnotationKey: machineutils.PreserveMachineAnnotationValueNow, + }, + }, + expect: expect{ + lastAppliedNodeValue: machineutils.PreserveMachineAnnotationValueNow, + }, + }), + Entry("machine has a non-nil preserveExpiryTime", testCase{ + setup: setup{ + nodeName: "node-1", + includeNode: true, + preserveExpiryTime: &metav1.Time{Time: metav1.Now().Add(1 * time.Hour)}, + }, + expect: expect{ + preserveExpiryTimeSet: true, + }, + }), + Entry("node has preserve annotation", testCase{ + setup: setup{ + nodeName: "node-1", + includeNode: true, + nodeAnnotations: map[string]string{ + machineutils.PreserveMachineAnnotationKey: machineutils.PreserveMachineAnnotationValueWhenFailed, + }, + }, + expect: expect{ + nodeAnnotated: true, + nodeValue: machineutils.PreserveMachineAnnotationValueWhenFailed, + }, + }), + Entry("node not found returns error and partial info", testCase{ + setup: setup{ + nodeName: "node-1", + includeNode: false, + machineAnnotations: map[string]string{ + machineutils.PreserveMachineAnnotationKey: machineutils.PreserveMachineAnnotationValueNow, + }, + }, + expect: expect{ + machineAnnotated: true, + machineValue: machineutils.PreserveMachineAnnotationValueNow, + err: fmt.Errorf("node-1"), + }, + }), + Entry("both machine and node have preserve annotations", testCase{ + setup: setup{ + nodeName: "node-1", + includeNode: true, + machineAnnotations: map[string]string{ + machineutils.PreserveMachineAnnotationKey: machineutils.PreserveMachineAnnotationValueNow, + machineutils.LastAppliedNodePreserveValueAnnotationKey: machineutils.PreserveMachineAnnotationValueWhenFailed, + }, + nodeAnnotations: map[string]string{ + machineutils.PreserveMachineAnnotationKey: machineutils.PreserveMachineAnnotationValueAutoPreserved, + }, + }, + expect: expect{ + machineAnnotated: true, + machineValue: machineutils.PreserveMachineAnnotationValueNow, + lastAppliedNodeValue: machineutils.PreserveMachineAnnotationValueWhenFailed, + nodeAnnotated: true, + nodeValue: machineutils.PreserveMachineAnnotationValueAutoPreserved, + }, + }), + ) + }) + + Describe("#shouldAnnotationsBeUpdatedOnMachine", func() { + type testCase struct { + removeAnnotations bool + preserveInfo *preserveStateInfo + expect bool + } + + DescribeTable("shouldAnnotationsBeUpdatedOnMachine scenarios", + func(tc testCase) { + Expect(shouldAnnotationsBeUpdatedOnMachine(tc.removeAnnotations, tc.preserveInfo)).To(Equal(tc.expect)) + }, + Entry("removeAnnotations=true: always returns false", testCase{ + removeAnnotations: true, + preserveInfo: &preserveStateInfo{nodeAnnotated: true, nodeValue: machineutils.PreserveMachineAnnotationValueNow}, + expect: false, + }), + Entry("node not annotated and no lastAppliedNodeValue: returns false", testCase{ + removeAnnotations: false, + preserveInfo: &preserveStateInfo{nodeAnnotated: false, lastAppliedNodeValue: ""}, + expect: false, + }), + Entry("node value unchanged and machine not annotated: returns false", testCase{ + removeAnnotations: false, + preserveInfo: &preserveStateInfo{ + nodeAnnotated: true, + nodeValue: machineutils.PreserveMachineAnnotationValueNow, + lastAppliedNodeValue: machineutils.PreserveMachineAnnotationValueNow, + machineAnnotated: false, + }, + expect: false, + }), + Entry("node value changed: returns true", testCase{ + removeAnnotations: false, + preserveInfo: &preserveStateInfo{ + nodeAnnotated: true, + nodeValue: machineutils.PreserveMachineAnnotationValueNow, + lastAppliedNodeValue: machineutils.PreserveMachineAnnotationValueWhenFailed, + machineAnnotated: false, + }, + expect: true, + }), + Entry("node value unchanged but machine is annotated: returns true", testCase{ + removeAnnotations: false, + preserveInfo: &preserveStateInfo{ + nodeAnnotated: true, + nodeValue: machineutils.PreserveMachineAnnotationValueNow, + lastAppliedNodeValue: machineutils.PreserveMachineAnnotationValueNow, + machineAnnotated: true, + }, + expect: true, + }), + Entry("lastAppliedNodeValue set without node annotation: returns true (machine annotation present)", testCase{ + removeAnnotations: false, + preserveInfo: &preserveStateInfo{ + nodeAnnotated: false, + nodeValue: "", + lastAppliedNodeValue: machineutils.PreserveMachineAnnotationValueNow, + machineAnnotated: true, + }, + expect: true, + }), + Entry("lastAppliedNodeValue set without node annotation and machine not annotated: returns true (node value drifted from last-applied)", testCase{ + removeAnnotations: false, + preserveInfo: &preserveStateInfo{ + nodeAnnotated: false, + nodeValue: "", + lastAppliedNodeValue: machineutils.PreserveMachineAnnotationValueNow, + machineAnnotated: false, + }, + expect: true, + }), + ) + }) + + Describe("#updatePreserveAnnotationOnMachine", func() { + type setup struct { + machineAnnotations map[string]string + } + type expect struct { + lastAppliedNodeValue string + preserveAnnotationGone bool + err error + } + type testCase struct { + setup setup + nodeValue string + expect expect + } + + DescribeTable("updatePreserveAnnotationOnMachine scenarios", + func(tc testCase) { + stop := make(chan struct{}) + defer close(stop) + + machine := &v1alpha1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: testNamespace, + Name: "m1", + Annotations: tc.setup.machineAnnotations, + }, + } + + c, trackers := createController(stop, testNamespace, []runtime.Object{machine}, nil, nil, nil, false) + defer trackers.Stop() + waitForCacheSync(stop, c) + + err := c.updatePreserveAnnotationOnMachine(context.TODO(), tc.nodeValue, machine) + if tc.expect.err != nil { + Expect(err).To(HaveOccurred()) + } else { + Expect(err).ToNot(HaveOccurred()) + updated, getErr := c.controlMachineClient.Machines(testNamespace).Get(context.TODO(), machine.Name, metav1.GetOptions{}) + Expect(getErr).ToNot(HaveOccurred()) + Expect(updated.Annotations[machineutils.LastAppliedNodePreserveValueAnnotationKey]).To(Equal(tc.expect.lastAppliedNodeValue)) + _, hasPreserve := updated.Annotations[machineutils.PreserveMachineAnnotationKey] + Expect(hasPreserve).To(Equal(!tc.expect.preserveAnnotationGone)) + } + }, + Entry("machine has no annotations and nodeValue='now': should set last-applied='now' and leave no preserve annotation", testCase{ + setup: setup{machineAnnotations: nil}, + nodeValue: machineutils.PreserveMachineAnnotationValueNow, + expect: expect{ + lastAppliedNodeValue: machineutils.PreserveMachineAnnotationValueNow, + preserveAnnotationGone: true, + }, + }), + Entry("machine has preserve='now' and nodeValue='now': should clear preserve annotation and set last-applied='now'", testCase{ + setup: setup{ + machineAnnotations: map[string]string{ + machineutils.PreserveMachineAnnotationKey: machineutils.PreserveMachineAnnotationValueNow, + }, + }, + nodeValue: machineutils.PreserveMachineAnnotationValueNow, + expect: expect{ + lastAppliedNodeValue: machineutils.PreserveMachineAnnotationValueNow, + preserveAnnotationGone: true, + }, + }), + Entry("machine has preserve='now' and last-applied='now' and nodeValue='': should clear preserve annotation and set last-applied=''", testCase{ + setup: setup{ + machineAnnotations: map[string]string{ + machineutils.PreserveMachineAnnotationKey: machineutils.PreserveMachineAnnotationValueNow, + machineutils.LastAppliedNodePreserveValueAnnotationKey: machineutils.PreserveMachineAnnotationValueNow, + }, + }, + nodeValue: "", + expect: expect{ + lastAppliedNodeValue: "", + preserveAnnotationGone: true, + }, + }), + Entry("machine has last-applied='now' and nodeValue='when-failed': should clear preserve annotation and overwrite last-applied='when-failed'", testCase{ + setup: setup{ + machineAnnotations: map[string]string{ + machineutils.LastAppliedNodePreserveValueAnnotationKey: machineutils.PreserveMachineAnnotationValueNow, + }, + }, + nodeValue: machineutils.PreserveMachineAnnotationValueWhenFailed, + expect: expect{ + lastAppliedNodeValue: machineutils.PreserveMachineAnnotationValueWhenFailed, + preserveAnnotationGone: true, + }, }), ) }) diff --git a/pkg/util/provider/machinecontroller/machine_util.go b/pkg/util/provider/machinecontroller/machine_util.go index 1b89b63d07..595f57df5f 100644 --- a/pkg/util/provider/machinecontroller/machine_util.go +++ b/pkg/util/provider/machinecontroller/machine_util.go @@ -2420,10 +2420,11 @@ func (c *controller) stopPreservationIfActive(ctx context.Context, machine *v1al // therefore, this error is handled specifically if apierrors.IsNotFound(err) { klog.Warningf("Node %q of machine %q not found. Proceeding to stop preservation on machine.", nodeName, machine.Name) - // Node not found, proceed to delete annotations and clear preserveExpiryTime on machine - machine, err = c.removePreserveAnnotationOnMachine(ctx, machine) - if err != nil { - return nil, err + if removePreservationAnnotations { + machine, err = c.removePreserveAnnotationOnMachine(ctx, machine) + if err != nil { + return nil, err + } } machine, err = c.clearMachinePreserveExpiryTime(ctx, machine) if err != nil { From 91b947694df96f6ab89e8b8e129d175a5efa870e Mon Sep 17 00:00:00 2001 From: thiyyakat Date: Fri, 26 Jun 2026 08:43:22 +0530 Subject: [PATCH 18/19] Address review comments - part 2 --- pkg/util/provider/machinecontroller/machine.go | 7 ++----- pkg/util/provider/machinecontroller/machine_test.go | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/pkg/util/provider/machinecontroller/machine.go b/pkg/util/provider/machinecontroller/machine.go index c38926bb2f..0e81deebfa 100644 --- a/pkg/util/provider/machinecontroller/machine.go +++ b/pkg/util/provider/machinecontroller/machine.go @@ -776,9 +776,6 @@ func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1a nodeName := machine.Labels[v1alpha1.NodeLabelKey] // We buffer the error returned here until we can tell if the machine is preservation-bound. preserveInfo, getErr := c.getPreserveStateInfo(machine) - if preserveInfo == nil { // machine object not annotated with preservation state and error fetching node object - return - } if preserveInfo.machineAnnotated && !machineutils.AllowedPreserveAnnotationValues.Has(preserveInfo.machineValue) { // If machine is annotated incorrectly, log and proceed as though machine is not annotated. klog.Warningf("Preserve annotation %q=%q on machine %q is invalid", machineutils.PreserveMachineAnnotationKey, preserveInfo.machineValue, machine.Name) @@ -789,7 +786,7 @@ func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1a klog.Warningf("Preserve annotation %q=%q on node %q backing machine %q is invalid", machineutils.PreserveMachineAnnotationKey, preserveInfo.nodeValue, nodeName, machine.Name) return } - preservationBound := c.isMachinePreservationBound(preserveInfo) + preservationBound := isMachinePreservationBound(preserveInfo) if !preservationBound { // We clear the error here to prevent preservation logic from interfering with non-preservation-bound machines. err = nil @@ -901,7 +898,7 @@ func getEffectivePreservationAnnotations(info *preserveStateInfo, getPreserveSta return info.nodeValue } -func (c *controller) isMachinePreservationBound(info *preserveStateInfo) bool { +func isMachinePreservationBound(info *preserveStateInfo) bool { // if machine has no preservation state, the machine is not preservation-bound if !info.preserveExpiryTimeSet && !info.nodeAnnotated && !info.machineAnnotated && info.lastAppliedNodeValue == "" { return false diff --git a/pkg/util/provider/machinecontroller/machine_test.go b/pkg/util/provider/machinecontroller/machine_test.go index 29678ed945..8e68a7e40d 100644 --- a/pkg/util/provider/machinecontroller/machine_test.go +++ b/pkg/util/provider/machinecontroller/machine_test.go @@ -4637,7 +4637,7 @@ var _ = Describe("machine", func() { } else { Expect(err).ToNot(HaveOccurred()) } - bound := c.isMachinePreservationBound(preserveInfo) + bound := isMachinePreservationBound(preserveInfo) Expect(bound).To(Equal(tc.expect.bound)) }, Entry("machine has no annotations, no preserveExpiryTime, and node has no preservation annotation", testCase{ From 5ab7c920f1bdbf6c6ae38fa3a383466bb6f14b83 Mon Sep 17 00:00:00 2001 From: thiyyakat Date: Fri, 26 Jun 2026 10:54:44 +0530 Subject: [PATCH 19/19] Return machine object even on error so no accidental nil ptr de-referencing happens in future edits. Add removed test. --- pkg/util/provider/machinecontroller/machine.go | 15 +++++---------- .../provider/machinecontroller/machine_test.go | 14 ++++++++++++++ .../provider/machinecontroller/machine_util.go | 10 +++++----- 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/pkg/util/provider/machinecontroller/machine.go b/pkg/util/provider/machinecontroller/machine.go index 0e81deebfa..f30ea17d32 100644 --- a/pkg/util/provider/machinecontroller/machine.go +++ b/pkg/util/provider/machinecontroller/machine.go @@ -799,24 +799,21 @@ func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1a klog.Warningf("Couldn't find node %q for machine %q", nodeName, machine.Name) err = nil } - // Note: when the backing node cannot be found, we assume the machine's annotation value needs to be enforced to enable + // Note: when the backing node cannot be found, we assume the machine's annotation value needs to be enforced to enable // preservation of the machine object. effectivePreserveValue := getEffectivePreservationAnnotations(preserveInfo, getErr) - var removeAnnotations, preservationStopped bool + var removeAnnotations bool clone := machine.DeepCopy() switch effectivePreserveValue { case "", machineutils.PreserveMachineAnnotationValueFalse: - preservationStopped = true clone, err = c.stopPreservationIfActive(ctx, clone, removeAnnotations) case machineutils.PreserveMachineAnnotationValueWhenFailed: // on timing out, remove preserve annotation to prevent incorrect re-preservation if machineutils.IsMachinePreservationExpired(clone) { removeAnnotations = true - preservationStopped = true clone, err = c.stopPreservationIfActive(ctx, clone, removeAnnotations) } else if !machineutils.IsMachineFailed(clone) { - preservationStopped = true clone, err = c.stopPreservationIfActive(ctx, clone, removeAnnotations) } else { clone, err = c.preserveMachine(ctx, clone, effectivePreserveValue) @@ -825,7 +822,6 @@ func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1a if machineutils.IsMachinePreservationExpired(clone) { // on timing out, remove preserve annotation to prevent incorrect re-preservation removeAnnotations = true - preservationStopped = true clone, err = c.stopPreservationIfActive(ctx, clone, removeAnnotations) } else { clone, err = c.preserveMachine(ctx, clone, effectivePreserveValue) @@ -836,7 +832,6 @@ func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1a // (since the autoPreserveFailedMachineCount maintained by the machineSetController, may have changed), // in addition to stopping preservation, we also remove the preservation annotation on the machine. removeAnnotations = true - preservationStopped = true clone, err = c.stopPreservationIfActive(ctx, clone, removeAnnotations) } else { clone, err = c.preserveMachine(ctx, clone, effectivePreserveValue) @@ -845,9 +840,9 @@ func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1a if err != nil { return } - // For the preserveMachine path (preservation still active), uncordon the node if the machine - // has recovered to Running. The stopPreservationIfActive path handles uncordon internally. - if !preservationStopped && clone.Status.CurrentStatus.PreserveExpiryTime != nil && clone.Status.CurrentStatus.Phase == v1alpha1.MachineRunning && nodeName != "" { + // For the preserve=now path (preservation still active), uncordon the node if the machine + // has recovered to Running. For all other cases, stopPreservationIfActive path handles uncordon internally. + if clone.Status.CurrentStatus.PreserveExpiryTime != nil && clone.Status.CurrentStatus.Phase == v1alpha1.MachineRunning && nodeName != "" { var node *corev1.Node node, err = c.nodeLister.Get(nodeName) if err != nil { diff --git a/pkg/util/provider/machinecontroller/machine_test.go b/pkg/util/provider/machinecontroller/machine_test.go index 8e68a7e40d..fbfab2fe5a 100644 --- a/pkg/util/provider/machinecontroller/machine_test.go +++ b/pkg/util/provider/machinecontroller/machine_test.go @@ -4348,6 +4348,20 @@ var _ = Describe("machine", func() { err: nil, }, }), + Entry("when invalid preserve annotation is added on node of un-preserved machine, should do nothing", testCase{ + setup: setup{ + nodeAnnotationValue: "invalidValue", + nodeAnnotated: true, + nodeName: "node-1", + machinePhase: v1alpha1.MachineRunning, + }, + expect: expect{ + preserveExpiryTimeIsSet: false, + nodeCondition: nil, + retry: machineutils.LongRetry, + err: nil, + }, + }), Entry("when a machine is annotated with preserve=now, but has no backing node, should start preservation", testCase{ setup: setup{ machineAnnotationValue: machineutils.PreserveMachineAnnotationValueNow, diff --git a/pkg/util/provider/machinecontroller/machine_util.go b/pkg/util/provider/machinecontroller/machine_util.go index 595f57df5f..770c9a5620 100644 --- a/pkg/util/provider/machinecontroller/machine_util.go +++ b/pkg/util/provider/machinecontroller/machine_util.go @@ -2341,7 +2341,7 @@ func (c *controller) preserveMachine(ctx context.Context, machine *v1alpha1.Mach // Step 1: Add preserveExpiryTime to machine status machine, err = c.setPreserveExpiryTimeOnMachine(ctx, machine) if err != nil { - return nil, err + return machine, err } } if nodeName == "" { @@ -2353,7 +2353,7 @@ func (c *controller) preserveMachine(ctx context.Context, machine *v1alpha1.Mach node, err := c.nodeLister.Get(nodeName) if err != nil { klog.Errorf("error trying to get node %q of machine %q: %v. Retrying.", nodeName, machine.Name, err) - return nil, err + return machine, err } existingNodePreservedCondition := nodeops.GetCondition(node, v1alpha1.NodePreserved) // checks if preservation is already complete @@ -2363,7 +2363,7 @@ func (c *controller) preserveMachine(ctx context.Context, machine *v1alpha1.Mach // Step 2: Add annotations to prevent scale down of node by CA updatedNode, err := c.addCAScaleDownDisabledAnnotationOnNode(ctx, node) if err != nil { - return nil, err + return machine, err } var drainErr error if shouldPreservedNodeBeDrained(existingNodePreservedCondition, machine.Status.CurrentStatus.Phase) { @@ -2376,11 +2376,11 @@ func (c *controller) preserveMachine(ctx context.Context, machine *v1alpha1.Mach _, err = nodeops.AddOrUpdateConditionsOnNode(ctx, c.targetCoreClient, updatedNode.Name, *newCond) if drainErr != nil { klog.Errorf("error draining preserved node %q for machine %q : %v", nodeName, machine.Name, drainErr) - return nil, drainErr + return machine, drainErr } if err != nil { klog.Errorf("error trying to update node preserved condition for node %q of machine %q : %v", nodeName, machine.Name, err) - return nil, err + return machine, err } } klog.V(2).Infof("Machine %q and backing node %q preserved successfully till %v.", machine.Name, nodeName, machine.Status.CurrentStatus.PreserveExpiryTime)