diff --git a/pkg/controller/controller_utils.go b/pkg/controller/controller_utils.go index 4bfda891f9..ba4fe92953 100644 --- a/pkg/controller/controller_utils.go +++ b/pkg/controller/controller_utils.go @@ -747,7 +747,7 @@ func (s ActiveMachines) Less(i, j int) bool { } if isPreservedI { // both machines are preserved, in which case deprioritize explicitly preserved machines over auto-preserved isAutoPreserved := func(m *v1alpha1.Machine) bool { - return m.Annotations[machineutils.PreserveMachineAnnotationKey] == machineutils.PreserveMachineAnnotationValuePreservedByMCM + return m.Annotations[machineutils.PreserveMachineAnnotationKey] == machineutils.PreserveMachineAnnotationValueAutoPreserved } isAutoPreservedI := isAutoPreserved(s[i]) isAutoPreservedJ := isAutoPreserved(s[j]) diff --git a/pkg/controller/deployment_machineset_util.go b/pkg/controller/deployment_machineset_util.go index ec80dceb80..0b2294391d 100644 --- a/pkg/controller/deployment_machineset_util.go +++ b/pkg/controller/deployment_machineset_util.go @@ -136,9 +136,9 @@ func calculateMachineSetStatus(is *v1alpha1.MachineSet, filteredMachines []*v1al } failedMachines = append(failedMachines, machineSummary) } - // Count number of failed machines annotated with PreserveMachineAnnotationValuePreservedByMCM + // Count number of failed machines annotated with PreserveAnnotationValueAutoPreserved // Cannot combine with above if block in case auto-preservation is not complete yet - if machine.Annotations[machineutils.PreserveMachineAnnotationKey] == machineutils.PreserveMachineAnnotationValuePreservedByMCM { + if machine.Annotations[machineutils.PreserveMachineAnnotationKey] == machineutils.PreserveMachineAnnotationValueAutoPreserved { autoPreserveFailedMachineCount++ } } diff --git a/pkg/controller/machineset.go b/pkg/controller/machineset.go index ebdff675ec..0a5a82ef71 100644 --- a/pkg/controller/machineset.go +++ b/pkg/controller/machineset.go @@ -920,7 +920,7 @@ func (c *controller) shouldFailedMachineBeTerminated(machine *v1alpha1.Machine) return true } switch preserveValue { - case machineutils.PreserveMachineAnnotationValueWhenFailed, machineutils.PreserveMachineAnnotationValueNow, machineutils.PreserveMachineAnnotationValuePreservedByMCM: // this is in case preservation process is not complete yet + case machineutils.PreserveMachineAnnotationValueWhenFailed, machineutils.PreserveMachineAnnotationValueNow, machineutils.PreserveMachineAnnotationValueAutoPreserved: // this is in case preservation process is not complete yet return false case machineutils.PreserveMachineAnnotationValueFalse: return true @@ -949,7 +949,7 @@ func (c *controller) manageAutoPreservationOfFailedMachines(ctx context.Context, var others []*v1alpha1.Machine for _, m := range machines { // check if machine is already annotated for preservation, if yes, skip. Machine controller will take care of the rest. - if machineutils.IsMachineFailed(m) && !machineutils.PreventAutoPreserveAnnotationValues.Has(m.Annotations[machineutils.PreserveMachineAnnotationKey]) { + if machineutils.IsMachineFailed(m) && !machineutils.AllowedPreserveAnnotationValues.Has(m.Annotations[machineutils.PreserveMachineAnnotationKey]) { autoPreservationCandidates = append(autoPreservationCandidates, m) } else { others = append(others, m) @@ -978,7 +978,7 @@ func (c *controller) manageAutoPreservationOfFailedMachines(ctx context.Context, func (c *controller) stopAutoPreservationForMachines(ctx context.Context, machines []*v1alpha1.Machine, numToStop int) int { var autoPreservedMachines []*v1alpha1.Machine for _, m := range machines { - if m.Annotations[machineutils.PreserveMachineAnnotationKey] == machineutils.PreserveMachineAnnotationValuePreservedByMCM { + if m.Annotations[machineutils.PreserveMachineAnnotationKey] == machineutils.PreserveMachineAnnotationValueAutoPreserved { autoPreservedMachines = append(autoPreservedMachines, m) } } @@ -998,7 +998,7 @@ func (c *controller) stopAutoPreservationForMachines(ctx context.Context, machin klog.V(2).Infof("Removing auto-preservation annotation from machine %q as AutoPreserveFailedMachineMax is breached", m.Name) updatedMachine, err := machineutils.UpdateMachineWithRetries(ctx, c.controlMachineClient.Machines(m.Namespace), c.machineLister, m.Namespace, m.Name, removeAutoPreserveAnnotationFromMachine) if err != nil { - klog.Warningf("Error removing %q=%q annotation from machine %q: %v.", machineutils.PreserveMachineAnnotationKey, machineutils.PreserveMachineAnnotationValuePreservedByMCM, m.Name, err) + klog.Warningf("Error removing %q=%q annotation from machine %q: %v.", machineutils.PreserveMachineAnnotationKey, machineutils.PreserveMachineAnnotationValueAutoPreserved, m.Name, err) continue } autoPreservedMachines[index] = updatedMachine @@ -1011,7 +1011,7 @@ func addAutoPreserveAnnotationOnMachine(machineToUpdate *v1alpha1.Machine) error if machineToUpdate.Annotations == nil { machineToUpdate.Annotations = make(map[string]string) } - machineToUpdate.Annotations[machineutils.PreserveMachineAnnotationKey] = machineutils.PreserveMachineAnnotationValuePreservedByMCM + machineToUpdate.Annotations[machineutils.PreserveMachineAnnotationKey] = machineutils.PreserveMachineAnnotationValueAutoPreserved return nil } diff --git a/pkg/controller/machineset_test.go b/pkg/controller/machineset_test.go index fe4b89a371..176cd6ca6f 100644 --- a/pkg/controller/machineset_test.go +++ b/pkg/controller/machineset_test.go @@ -2163,10 +2163,10 @@ var _ = Describe("machineset", func() { updatedMachine3, _ := c.controlMachineClient.Machines(testNamespace).Get(context.TODO(), testMachine3.Name, metav1.GetOptions{}) updatedMachine4, _ := c.controlMachineClient.Machines(testNamespace).Get(context.TODO(), testMachine4.Name, metav1.GetOptions{}) preservedCount := 0 - if updatedMachine1.Annotations != nil && updatedMachine1.Annotations[machineutils.PreserveMachineAnnotationKey] == machineutils.PreserveMachineAnnotationValuePreservedByMCM { + if updatedMachine1.Annotations != nil && updatedMachine1.Annotations[machineutils.PreserveMachineAnnotationKey] == machineutils.PreserveMachineAnnotationValueAutoPreserved { preservedCount++ } - if updatedMachine2.Annotations != nil && updatedMachine2.Annotations[machineutils.PreserveMachineAnnotationKey] == machineutils.PreserveMachineAnnotationValuePreservedByMCM { + if updatedMachine2.Annotations != nil && updatedMachine2.Annotations[machineutils.PreserveMachineAnnotationKey] == machineutils.PreserveMachineAnnotationValueAutoPreserved { preservedCount++ } // Running machine should not be auto-preserved in any of the cases @@ -2176,7 +2176,7 @@ var _ = Describe("machineset", func() { for _, m := range tc.setup.additionalMachines { updatedMachine, _ := c.controlMachineClient.Machines(testNamespace).Get(context.TODO(), m.Name, metav1.GetOptions{}) - if updatedMachine.Annotations[machineutils.PreserveMachineAnnotationKey] == machineutils.PreserveMachineAnnotationValuePreservedByMCM { + if updatedMachine.Annotations[machineutils.PreserveMachineAnnotationKey] == machineutils.PreserveMachineAnnotationValueAutoPreserved { preservedCount++ } } @@ -2237,7 +2237,7 @@ var _ = Describe("machineset", func() { Name: "machine-5", Namespace: testNamespace, Annotations: map[string]string{ - machineutils.PreserveMachineAnnotationKey: machineutils.PreserveMachineAnnotationValuePreservedByMCM, + machineutils.PreserveMachineAnnotationKey: machineutils.PreserveMachineAnnotationValueAutoPreserved, }, }, Status: machinev1.MachineStatus{ diff --git a/pkg/util/provider/machinecontroller/machine.go b/pkg/util/provider/machinecontroller/machine.go index fc05c38c76..f30ea17d32 100644 --- a/pkg/util/provider/machinecontroller/machine.go +++ b/pkg/util/provider/machinecontroller/machine.go @@ -744,22 +744,27 @@ func (c *controller) isCreationProcessing(machine *v1alpha1.Machine) bool { Machine Preservation operations */ +// preserveStateInfo encapsulates the preservation annotation values found +// on the machine and node objects, along with the effective preservation value for the machine +// and the last applied node preserve value by MCM. +type preserveStateInfo struct { + nodeAnnotated bool + machineAnnotated bool + nodeValue string + machineValue string + lastAppliedNodeValue string + preserveExpiryTimeSet bool +} + // manageMachinePreservation manages machine preservation based on the preserve annotation values on the node and machine objects. func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1alpha1.Machine) (retry machineutils.RetryPeriod, err error) { - machineAnnotationsSynced := false - clone := machine.DeepCopy() defer func() { - // This needs to be done for cases when machine is neither preserved nor un-preserved (e.g. when preserve changes from now to when-failed on a Failed machine), - // but the LastAppliedNodePreserveValueAnnotation needs to be synced to the machine object. - // We compare annotation value in the clone with the original machine object to see if an update is required - if err == nil && !machineAnnotationsSynced && clone.Annotations[machineutils.LastAppliedNodePreserveValueAnnotationKey] != machine.Annotations[machineutils.LastAppliedNodePreserveValueAnnotationKey] { - _, err = c.controlMachineClient.Machines(clone.Namespace).Update(ctx, clone, metav1.UpdateOptions{}) - if err != nil { - klog.Errorf("error updating LastAppliedNodePreserveValueAnnotation value on machine %q: %v", machine.Name, err) - } - } if err != nil { - if apierrors.IsConflict(err) { + if apierrors.IsNotFound(err) { + klog.Warningf("Error during preservation flow: %v", err.Error()) + retry = machineutils.LongRetry + err = nil + } else if apierrors.IsConflict(err) { retry = machineutils.ConflictRetry } else { retry = machineutils.ShortRetry @@ -768,76 +773,182 @@ func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1a retry = machineutils.LongRetry } }() - - nodeName := clone.Labels[v1alpha1.NodeLabelKey] - nodeAnnotationValue := "" - if nodeName != "" { - nodeAnnotationValue, err = c.getNodePreserveAnnotationValue(nodeName) - if err != nil { - return - } + nodeName := machine.Labels[v1alpha1.NodeLabelKey] + // We buffer the error returned here until we can tell if the machine is preservation-bound. + preserveInfo, getErr := c.getPreserveStateInfo(machine) + if preserveInfo.machineAnnotated && !machineutils.AllowedPreserveAnnotationValues.Has(preserveInfo.machineValue) { + // If machine is annotated incorrectly, log and proceed as though machine is not annotated. + klog.Warningf("Preserve annotation %q=%q on machine %q is invalid", machineutils.PreserveMachineAnnotationKey, preserveInfo.machineValue, machine.Name) + preserveInfo.machineAnnotated = false + preserveInfo.machineValue = "" + } + if preserveInfo.nodeAnnotated && !machineutils.AllowedPreserveAnnotationValues.Has(preserveInfo.nodeValue) { + klog.Warningf("Preserve annotation %q=%q on node %q backing machine %q is invalid", machineutils.PreserveMachineAnnotationKey, preserveInfo.nodeValue, nodeName, machine.Name) + return } - var effectivePreserveValue string - effectivePreserveValue, clone.Annotations = getEffectivePreservationAnnotations(nodeAnnotationValue, clone.Annotations) - if !machineutils.AllowedPreserveAnnotationValues.Has(effectivePreserveValue) { - if effectivePreserveValue == nodeAnnotationValue { - klog.Warningf("Preserve annotation value %q on node %q is invalid", effectivePreserveValue, nodeName) - } else { - klog.Warningf("Preserve annotation value %q on machine %q is invalid", effectivePreserveValue, clone.Name) - } + preservationBound := isMachinePreservationBound(preserveInfo) + if !preservationBound { + // We clear the error here to prevent preservation logic from interfering with non-preservation-bound machines. + err = nil return + } else if getErr != nil { + if !apierrors.IsNotFound(getErr) { + err = getErr + return + } + klog.Warningf("Couldn't find node %q for machine %q", nodeName, machine.Name) + err = nil } + // Note: when the backing node cannot be found, we assume the machine's annotation value needs to be enforced to enable + // preservation of the machine object. + effectivePreserveValue := getEffectivePreservationAnnotations(preserveInfo, getErr) + + var removeAnnotations bool + clone := machine.DeepCopy() switch effectivePreserveValue { case "", machineutils.PreserveMachineAnnotationValueFalse: - machineAnnotationsSynced, err = c.stopPreservationIfActive(ctx, clone, false) + clone, err = c.stopPreservationIfActive(ctx, clone, removeAnnotations) case machineutils.PreserveMachineAnnotationValueWhenFailed: // on timing out, remove preserve annotation to prevent incorrect re-preservation if machineutils.IsMachinePreservationExpired(clone) { - machineAnnotationsSynced, err = c.stopPreservationIfActive(ctx, clone, true) + removeAnnotations = true + clone, err = c.stopPreservationIfActive(ctx, clone, removeAnnotations) } else if !machineutils.IsMachineFailed(clone) { - machineAnnotationsSynced, err = c.stopPreservationIfActive(ctx, clone, false) + clone, err = c.stopPreservationIfActive(ctx, clone, removeAnnotations) } else { - err = c.preserveMachine(ctx, clone, effectivePreserveValue) + clone, err = c.preserveMachine(ctx, clone, effectivePreserveValue) } case machineutils.PreserveMachineAnnotationValueNow: if machineutils.IsMachinePreservationExpired(clone) { // on timing out, remove preserve annotation to prevent incorrect re-preservation - machineAnnotationsSynced, err = c.stopPreservationIfActive(ctx, clone, true) + removeAnnotations = true + clone, err = c.stopPreservationIfActive(ctx, clone, removeAnnotations) } else { - err = c.preserveMachine(ctx, clone, effectivePreserveValue) + clone, err = c.preserveMachine(ctx, clone, effectivePreserveValue) } - case machineutils.PreserveMachineAnnotationValuePreservedByMCM: + case machineutils.PreserveMachineAnnotationValueAutoPreserved: if !machineutils.IsMachineFailed(clone) || machineutils.IsMachinePreservationExpired(clone) { // To prevent incorrect re-preservation of a recovered, previously auto-preserved machine on future failures // (since the autoPreserveFailedMachineCount maintained by the machineSetController, may have changed), // in addition to stopping preservation, we also remove the preservation annotation on the machine. - machineAnnotationsSynced, err = c.stopPreservationIfActive(ctx, clone, true) + removeAnnotations = true + clone, err = c.stopPreservationIfActive(ctx, clone, removeAnnotations) } else { - err = c.preserveMachine(ctx, clone, effectivePreserveValue) + clone, err = c.preserveMachine(ctx, clone, effectivePreserveValue) } } if err != nil { return } - // This is to handle the case where a preserved machine recovers from Failed to Running - // in which case, pods should be allowed to be scheduled onto the node - if machineutils.IsMachineActive(clone) && nodeName != "" { - err = c.uncordonNodeIfCordoned(ctx, nodeName) + // For the preserve=now path (preservation still active), uncordon the node if the machine + // has recovered to Running. For all other cases, stopPreservationIfActive path handles uncordon internally. + if clone.Status.CurrentStatus.PreserveExpiryTime != nil && clone.Status.CurrentStatus.Phase == v1alpha1.MachineRunning && nodeName != "" { + var node *corev1.Node + node, err = c.nodeLister.Get(nodeName) + if err != nil { + if apierrors.IsNotFound(err) { + err = nil + return + } + return + } + err = c.uncordonNodeIfCordoned(ctx, node) + if err != nil { + return + } + } + + if shouldAnnotationsBeUpdatedOnMachine(removeAnnotations, preserveInfo) { + err = c.updatePreserveAnnotationOnMachine(ctx, preserveInfo.nodeValue, clone) } return } -// getEffectivePreservationAnnotations returns the effective preservation value, and updates the machine Annotations related to preservation -func getEffectivePreservationAnnotations(nodeAnnotationValue string, machineAnnotations map[string]string) (string, map[string]string) { +// getEffectivePreservationAnnotations returns the effective preservation value. +// +// If there is no active node annotation AND no previously-applied node annotation, +// enforce machine's preserve annotation. +// Otherwise, the node annotation takes precedence (even if now empty/removed). +// +// lastAppliedNodeValue is required to handle the following scenario: +// +// T1: Node and Machine both have the same annotation with the same value. (MCM is up and running). +// T2 (T2 > T1): MCM went down. +// T3 (T3 > T2): Node annotation was removed. +// T4 (T4 > T3): MCM came back up. +// At T4 it sees a Node with no preserve annotation but a Machine with a preserve annotation. +// It continues to preserve the machine. +func getEffectivePreservationAnnotations(info *preserveStateInfo, getPreserveStateErr error) string { + // If the node cannot be found, nodeValue is "". + // In this case, we want the machine's annotation value to be enforced. + if apierrors.IsNotFound(getPreserveStateErr) { + return info.machineValue + } // If there is no active node annotation AND no previously-applied node annotation, // enforce machine's preserve annotation. // Otherwise, the node annotation takes precedence (even if now empty/removed). - if nodeAnnotationValue == "" && machineAnnotations[machineutils.LastAppliedNodePreserveValueAnnotationKey] == "" { - return machineAnnotations[machineutils.PreserveMachineAnnotationKey], machineAnnotations - } - clonedMachineAnnotations := make(map[string]string) - maps.Copy(clonedMachineAnnotations, machineAnnotations) - delete(clonedMachineAnnotations, machineutils.PreserveMachineAnnotationKey) - clonedMachineAnnotations[machineutils.LastAppliedNodePreserveValueAnnotationKey] = nodeAnnotationValue - return nodeAnnotationValue, clonedMachineAnnotations + if info.nodeValue == "" && info.lastAppliedNodeValue == "" { + return info.machineValue + } + return info.nodeValue +} + +func isMachinePreservationBound(info *preserveStateInfo) bool { + // if machine has no preservation state, the machine is not preservation-bound + if !info.preserveExpiryTimeSet && !info.nodeAnnotated && !info.machineAnnotated && info.lastAppliedNodeValue == "" { + return false + } + return true +} + +func (c *controller) getPreserveStateInfo(machine *v1alpha1.Machine) (*preserveStateInfo, error) { + var info preserveStateInfo + if machine.Annotations != nil { + info.machineValue, info.machineAnnotated = machine.Annotations[machineutils.PreserveMachineAnnotationKey] + info.lastAppliedNodeValue = machine.Annotations[machineutils.LastAppliedNodePreserveValueAnnotationKey] + } + if machine.Status.CurrentStatus.PreserveExpiryTime != nil { + info.preserveExpiryTimeSet = true + } + nodeName := machine.Labels[v1alpha1.NodeLabelKey] + if nodeName != "" { + node, err := c.nodeLister.Get(nodeName) + if err != nil { + return &info, err + } + info.nodeValue, info.nodeAnnotated = node.Annotations[machineutils.PreserveMachineAnnotationKey] + } + return &info, nil +} + +// shouldAnnotationsBeUpdatedOnMachine returns true when the machine's annotation tracking needs +// to be synced after a preservation action. +func shouldAnnotationsBeUpdatedOnMachine(removeAnnotations bool, preserveInfo *preserveStateInfo) bool { + // annotations were already removed by stopPreservationIfActive — nothing left to sync + if removeAnnotations { + return false + } + // node annotation is not in control — machine annotation prevails, no sync needed + if !preserveInfo.nodeAnnotated && preserveInfo.lastAppliedNodeValue == "" { + return false + } + // node value is unchanged and machine has no annotation to clear — nothing has changed + if preserveInfo.nodeValue == preserveInfo.lastAppliedNodeValue && !preserveInfo.machineAnnotated { + return false + } + return true +} + +// updatePreserveAnnotationOnMachine clears the machine's PreserveMachineAnnotationKey and sets +// [machineutils.LastAppliedNodePreserveValueAnnotationKey] to nodeValue. +func (c *controller) updatePreserveAnnotationOnMachine(ctx context.Context, nodeValue string, machine *v1alpha1.Machine) error { + clone := machine.DeepCopy() + if clone.Annotations == nil { + clone.Annotations = make(map[string]string) + } else { + delete(clone.Annotations, machineutils.PreserveMachineAnnotationKey) + } + clone.Annotations[machineutils.LastAppliedNodePreserveValueAnnotationKey] = nodeValue + _, err := c.controlMachineClient.Machines(clone.Namespace).Update(ctx, clone, metav1.UpdateOptions{}) + return err } diff --git a/pkg/util/provider/machinecontroller/machine_test.go b/pkg/util/provider/machinecontroller/machine_test.go index 15f98f5098..fbfab2fe5a 100644 --- a/pkg/util/provider/machinecontroller/machine_test.go +++ b/pkg/util/provider/machinecontroller/machine_test.go @@ -3972,11 +3972,13 @@ var _ = Describe("machine", func() { DescribeTable("getEffectivePreservationAnnotations scenarios", func(tc testCase) { - - preserveValue, updatedMachineAnnotations := getEffectivePreservationAnnotations(tc.setup.nodeAnnotationValue, tc.setup.machineAnnotations) + info := &preserveStateInfo{ + nodeValue: tc.setup.nodeAnnotationValue, + machineValue: tc.setup.machineAnnotations[machineutils.PreserveMachineAnnotationKey], + lastAppliedNodeValue: tc.setup.machineAnnotations[machineutils.LastAppliedNodePreserveValueAnnotationKey], + } + preserveValue := getEffectivePreservationAnnotations(info, nil) Expect(preserveValue).To(Equal(tc.expect.effectivePreserveValue)) - Expect(updatedMachineAnnotations[machineutils.PreserveMachineAnnotationKey]).To(Equal(tc.expect.machineAnnotations[machineutils.PreserveMachineAnnotationKey])) - Expect(updatedMachineAnnotations[machineutils.LastAppliedNodePreserveValueAnnotationKey]).To(Equal(tc.expect.machineAnnotations[machineutils.LastAppliedNodePreserveValueAnnotationKey])) }, Entry("when node is not annotated and laNodeAnnotationValue is empty, should return machine's annotation value and empty string", testCase{ setup: setup{ @@ -4096,11 +4098,14 @@ var _ = Describe("machine", func() { Describe("#manageMachinePreservation", func() { type setup struct { machineAnnotationValue string + machineAnnotated bool nodeAnnotationValue string + nodeAnnotated bool laNodePreserveValue string nodeName string machinePhase v1alpha1.MachinePhase preserveExpiryTime *metav1.Time + nodeUnschedulable bool } type expect struct { retry machineutils.RetryPeriod @@ -4109,6 +4114,7 @@ var _ = Describe("machine", func() { machineAnnotationValue string laNodePreserveValue string err error + nodeUnschedulable *bool } type testCase struct { setup setup @@ -4125,17 +4131,19 @@ var _ = Describe("machine", func() { var targetCoreObjects []runtime.Object // Build machine + machineAnnotations := map[string]string{} + if tc.setup.machineAnnotated { + machineAnnotations[machineutils.PreserveMachineAnnotationKey] = tc.setup.machineAnnotationValue + } + if tc.setup.laNodePreserveValue != "" { + machineAnnotations[machineutils.LastAppliedNodePreserveValueAnnotationKey] = tc.setup.laNodePreserveValue + } machine := &v1alpha1.Machine{ ObjectMeta: metav1.ObjectMeta{ - Namespace: testNamespace, - Name: "m1", - Labels: map[string]string{ - v1alpha1.NodeLabelKey: tc.setup.nodeName, - }, - Annotations: map[string]string{ - machineutils.PreserveMachineAnnotationKey: tc.setup.machineAnnotationValue, - machineutils.LastAppliedNodePreserveValueAnnotationKey: tc.setup.laNodePreserveValue, - }, + Namespace: testNamespace, + Name: "m1", + Labels: map[string]string{v1alpha1.NodeLabelKey: tc.setup.nodeName}, + Annotations: machineAnnotations, }, Status: v1alpha1.MachineStatus{ CurrentStatus: v1alpha1.CurrentStatus{ Phase: tc.setup.machinePhase, @@ -4147,12 +4155,17 @@ var _ = Describe("machine", func() { controlMachineObjects = append(controlMachineObjects, machine) if tc.setup.nodeName != "" && tc.setup.nodeName != "invalid" { + nodeAnnotations := map[string]string{} + if tc.setup.nodeAnnotated { + nodeAnnotations[machineutils.PreserveMachineAnnotationKey] = tc.setup.nodeAnnotationValue + } node := &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ - Name: tc.setup.nodeName, - Annotations: map[string]string{ - machineutils.PreserveMachineAnnotationKey: tc.setup.nodeAnnotationValue, - }, + Name: tc.setup.nodeName, + Annotations: nodeAnnotations, + }, + Spec: corev1.NodeSpec{ + Unschedulable: tc.setup.nodeUnschedulable, }, Status: corev1.NodeStatus{ Conditions: []corev1.NodeCondition{}, @@ -4180,7 +4193,7 @@ var _ = Describe("machine", func() { } else { Expect(updatedMachine.Status.CurrentStatus.PreserveExpiryTime.IsZero()).To(BeTrue()) } - if tc.setup.nodeName != "" { + if tc.setup.nodeName != "" && tc.setup.nodeName != "invalid" { updatedNode, err := c.targetCoreClient.CoreV1().Nodes().Get(context.TODO(), tc.setup.nodeName, metav1.GetOptions{}) Expect(err).ToNot(HaveOccurred()) found := false @@ -4198,6 +4211,9 @@ var _ = Describe("machine", func() { } else { Expect(found).To(BeFalse()) } + if tc.expect.nodeUnschedulable != nil { + Expect(updatedNode.Spec.Unschedulable).To(Equal(*tc.expect.nodeUnschedulable)) + } } Expect(updatedMachine.Annotations[machineutils.PreserveMachineAnnotationKey]).To(Equal(tc.expect.machineAnnotationValue)) Expect(updatedMachine.Annotations[machineutils.LastAppliedNodePreserveValueAnnotationKey]).To(Equal(tc.expect.laNodePreserveValue)) @@ -4215,6 +4231,7 @@ var _ = Describe("machine", func() { Entry("when preserve annotation 'now' is added to node of Running machine, should successfully start preservation", testCase{ setup: setup{ nodeAnnotationValue: machineutils.PreserveMachineAnnotationValueNow, + nodeAnnotated: true, nodeName: "node-1", machinePhase: v1alpha1.MachineRunning, }, @@ -4230,6 +4247,7 @@ var _ = Describe("machine", func() { Entry("when preserve annotation 'when-failed' added to node of Running machine, should not start preservation", testCase{ setup: setup{ nodeAnnotationValue: machineutils.PreserveMachineAnnotationValueWhenFailed, + nodeAnnotated: true, nodeName: "node-1", machinePhase: v1alpha1.MachineRunning, }, @@ -4243,6 +4261,7 @@ var _ = Describe("machine", func() { Entry("when node of Failed machine has annotation `when-failed`, should start preservation", testCase{ setup: setup{ nodeAnnotationValue: machineutils.PreserveMachineAnnotationValueWhenFailed, + nodeAnnotated: true, nodeName: "node-1", machinePhase: v1alpha1.MachineFailed, }, @@ -4258,6 +4277,7 @@ var _ = Describe("machine", func() { Entry("when node of preserved machine is annotated with preserve value 'false', should stop preservation", testCase{ setup: setup{ nodeAnnotationValue: machineutils.PreserveMachineAnnotationValueFalse, + nodeAnnotated: true, nodeName: "node-1", machinePhase: v1alpha1.MachineRunning, preserveExpiryTime: &metav1.Time{Time: metav1.Now().Add(1 * time.Hour)}, @@ -4271,7 +4291,8 @@ var _ = Describe("machine", func() { }), Entry("when machine is annotated for auto-preservation by MCM, should start preservation", testCase{ setup: setup{ - machineAnnotationValue: machineutils.PreserveMachineAnnotationValuePreservedByMCM, + machineAnnotationValue: machineutils.PreserveMachineAnnotationValueAutoPreserved, + machineAnnotated: true, nodeName: "node-1", machinePhase: v1alpha1.MachineFailed, }, @@ -4281,25 +4302,13 @@ var _ = Describe("machine", func() { Type: v1alpha1.NodePreserved, Status: corev1.ConditionTrue}, retry: machineutils.LongRetry, - machineAnnotationValue: machineutils.PreserveMachineAnnotationValuePreservedByMCM, - }, - }), - Entry("when node is annotated and preservation times out, should stop preservation", testCase{ - setup: setup{ - nodeAnnotationValue: machineutils.PreserveMachineAnnotationValueNow, - nodeName: "node-1", - machinePhase: v1alpha1.MachineRunning, - preserveExpiryTime: &metav1.Time{Time: metav1.Now().Add(-1 * time.Minute)}, - }, - expect: expect{ - preserveExpiryTimeIsSet: false, - nodeCondition: &corev1.NodeCondition{Type: v1alpha1.NodePreserved, Status: corev1.ConditionFalse}, - retry: machineutils.LongRetry, + machineAnnotationValue: machineutils.PreserveMachineAnnotationValueAutoPreserved, }, }), Entry("when machine is annotated with preserve=now and preservation times out, should stop preservation, and remove annotation", testCase{ setup: setup{ machineAnnotationValue: machineutils.PreserveMachineAnnotationValueNow, + machineAnnotated: true, nodeName: "node-1", machinePhase: v1alpha1.MachineRunning, preserveExpiryTime: &metav1.Time{Time: metav1.Now().Add(-1 * time.Minute)}, @@ -4313,6 +4322,7 @@ var _ = Describe("machine", func() { Entry("when machine is annotated with preserve=when-failed and preservation times out, should stop preservation, and remove annotation", testCase{ setup: setup{ machineAnnotationValue: machineutils.PreserveMachineAnnotationValueWhenFailed, + machineAnnotated: true, nodeName: "node-1", machinePhase: v1alpha1.MachineFailed, preserveExpiryTime: &metav1.Time{Time: metav1.Now().Add(-1 * time.Minute)}, @@ -4323,28 +4333,29 @@ var _ = Describe("machine", func() { retry: machineutils.LongRetry, }, }), - Entry("when invalid preserve annotation is added on node of un-preserved machine, should do nothing ", testCase{ + Entry("when invalid preserve annotation is added on machine of un-preserved machine, and node is not annotated should do nothing ", testCase{ setup: setup{ - nodeAnnotationValue: "invalidValue", - nodeName: "node-1", - machinePhase: v1alpha1.MachineRunning, + machineAnnotationValue: "invalidValue", + machineAnnotated: true, + nodeName: "node-1", + machinePhase: v1alpha1.MachineRunning, }, expect: expect{ - laNodePreserveValue: "invalidValue", + machineAnnotationValue: "invalidValue", preserveExpiryTimeIsSet: false, nodeCondition: nil, retry: machineutils.LongRetry, err: nil, }, }), - Entry("when invalid preserve annotation is added on machine of un-preserved machine, and node is not annotated should do nothing ", testCase{ + Entry("when invalid preserve annotation is added on node of un-preserved machine, should do nothing", testCase{ setup: setup{ - machineAnnotationValue: "invalidValue", - nodeName: "node-1", - machinePhase: v1alpha1.MachineRunning, + nodeAnnotationValue: "invalidValue", + nodeAnnotated: true, + nodeName: "node-1", + machinePhase: v1alpha1.MachineRunning, }, expect: expect{ - machineAnnotationValue: "invalidValue", preserveExpiryTimeIsSet: false, nodeCondition: nil, retry: machineutils.LongRetry, @@ -4354,7 +4365,7 @@ var _ = Describe("machine", func() { Entry("when a machine is annotated with preserve=now, but has no backing node, should start preservation", testCase{ setup: setup{ machineAnnotationValue: machineutils.PreserveMachineAnnotationValueNow, - nodeAnnotationValue: "", + machineAnnotated: true, nodeName: "", machinePhase: v1alpha1.MachineUnknown, }, @@ -4369,6 +4380,7 @@ var _ = Describe("machine", func() { Entry("when preservation times out for a machine annotated with preserve=now, but has no backing node, should stop preservation", testCase{ setup: setup{ machineAnnotationValue: machineutils.PreserveMachineAnnotationValueNow, + machineAnnotated: true, nodeName: "", machinePhase: v1alpha1.MachineUnknown, preserveExpiryTime: &metav1.Time{Time: metav1.Now().Add(-1 * time.Minute)}, @@ -4383,21 +4395,22 @@ var _ = Describe("machine", func() { Entry("when a machine has a backing node, but node retrieval fails", testCase{ setup: setup{ machineAnnotationValue: machineutils.PreserveMachineAnnotationValueNow, - nodeAnnotationValue: "", + machineAnnotated: true, nodeName: "invalid", machinePhase: v1alpha1.MachineUnknown, }, expect: expect{ - preserveExpiryTimeIsSet: false, + preserveExpiryTimeIsSet: true, nodeCondition: nil, machineAnnotationValue: machineutils.PreserveMachineAnnotationValueNow, - retry: machineutils.ShortRetry, - err: fmt.Errorf("node %q not found", "invalid"), + retry: machineutils.LongRetry, + err: nil, }, }), Entry("when auto-preserved machine moves to Running, should stop preservation and remove auto-preserve annotation", testCase{ setup: setup{ - machineAnnotationValue: machineutils.PreserveMachineAnnotationValuePreservedByMCM, + machineAnnotationValue: machineutils.PreserveMachineAnnotationValueAutoPreserved, + machineAnnotated: true, nodeName: "node-1", machinePhase: v1alpha1.MachineRunning, preserveExpiryTime: &metav1.Time{Time: metav1.Now().Add(1 * time.Hour)}, @@ -4413,8 +4426,9 @@ var _ = Describe("machine", func() { Entry("when node is annotated with 'now' and machine is annotated with 'when-failed', should start preservation and remove preserve annotation from machine", testCase{ setup: setup{ nodeAnnotationValue: machineutils.PreserveMachineAnnotationValueNow, + nodeAnnotated: true, machineAnnotationValue: machineutils.PreserveMachineAnnotationValueWhenFailed, - laNodePreserveValue: "", + machineAnnotated: true, nodeName: "node-1", machinePhase: v1alpha1.MachineRunning, }, @@ -4432,9 +4446,9 @@ var _ = Describe("machine", func() { // case possible when MCM goes down and node annotation value is cleared and machine is annotated Entry("when node and machine are found to be annotated with \"\", and 'now', respectively and last applied node preserve value is 'now', should stop preservation", testCase{ setup: setup{ - nodeAnnotationValue: "", laNodePreserveValue: "now", machineAnnotationValue: "now", + machineAnnotated: true, nodeName: "node-1", machinePhase: v1alpha1.MachineRunning, preserveExpiryTime: &metav1.Time{Time: metav1.Now().Add(1 * time.Hour)}, @@ -4448,6 +4462,583 @@ var _ = Describe("machine", func() { err: nil, }, }), + Entry("when preserve=when-failed machine with cordoned node recovers to Running, should stop preservation and uncordon node", func() testCase { + uncordoned := false + return testCase{ + setup: setup{ + machineAnnotationValue: machineutils.PreserveMachineAnnotationValueWhenFailed, + machineAnnotated: true, + nodeName: "node-1", + machinePhase: v1alpha1.MachineRunning, + preserveExpiryTime: &metav1.Time{Time: metav1.Now().Add(1 * time.Hour)}, + nodeUnschedulable: true, + }, + expect: expect{ + preserveExpiryTimeIsSet: false, + nodeCondition: &corev1.NodeCondition{Type: v1alpha1.NodePreserved, Status: corev1.ConditionFalse}, + machineAnnotationValue: machineutils.PreserveMachineAnnotationValueWhenFailed, + retry: machineutils.LongRetry, + nodeUnschedulable: &uncordoned, + }, + } + }()), + Entry("when preserve=when-failed machine with cordoned node has expired preservation while Running, should stop preservation and uncordon node", func() testCase { + uncordoned := false + return testCase{ + setup: setup{ + machineAnnotationValue: machineutils.PreserveMachineAnnotationValueWhenFailed, + machineAnnotated: true, + nodeName: "node-1", + machinePhase: v1alpha1.MachineRunning, + preserveExpiryTime: &metav1.Time{Time: metav1.Now().Add(-1 * time.Minute)}, + nodeUnschedulable: true, + }, + expect: expect{ + preserveExpiryTimeIsSet: false, + nodeCondition: &corev1.NodeCondition{Type: v1alpha1.NodePreserved, Status: corev1.ConditionFalse}, + machineAnnotationValue: "", + retry: machineutils.LongRetry, + nodeUnschedulable: &uncordoned, + }, + } + }()), + Entry("when preserve=when-failed machine is Failed with active preservation and cordoned node, node should stay cordoned", func() testCase { + cordoned := true + return testCase{ + setup: setup{ + nodeAnnotationValue: machineutils.PreserveMachineAnnotationValueWhenFailed, + nodeAnnotated: true, + nodeName: "node-1", + machinePhase: v1alpha1.MachineFailed, + nodeUnschedulable: true, + }, + expect: expect{ + laNodePreserveValue: machineutils.PreserveMachineAnnotationValueWhenFailed, + preserveExpiryTimeIsSet: true, + nodeCondition: &corev1.NodeCondition{Type: v1alpha1.NodePreserved, Status: corev1.ConditionTrue}, + retry: machineutils.LongRetry, + nodeUnschedulable: &cordoned, + }, + } + }()), + Entry("when preserve=now machine is Failed with active preservation and cordoned node, node should stay cordoned", func() testCase { + cordoned := true + return testCase{ + setup: setup{ + machineAnnotationValue: machineutils.PreserveMachineAnnotationValueNow, + machineAnnotated: true, + nodeName: "node-1", + machinePhase: v1alpha1.MachineFailed, + nodeUnschedulable: true, + }, + expect: expect{ + preserveExpiryTimeIsSet: true, + nodeCondition: &corev1.NodeCondition{Type: v1alpha1.NodePreserved, Status: corev1.ConditionTrue}, + machineAnnotationValue: machineutils.PreserveMachineAnnotationValueNow, + retry: machineutils.LongRetry, + nodeUnschedulable: &cordoned, + }, + } + }()), + ) + + DescribeTable("when machine is not preservation-bound, should return early without modifying the machine", + func(nodeUnschedulable bool, includeNode bool) { + stop := make(chan struct{}) + defer close(stop) + + machine := &v1alpha1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: testNamespace, + Name: "m1", + Labels: map[string]string{v1alpha1.NodeLabelKey: "node-1"}, + }, + Status: v1alpha1.MachineStatus{ + CurrentStatus: v1alpha1.CurrentStatus{Phase: v1alpha1.MachineRunning}, + }, + } + + var targetCoreObjects []runtime.Object + if includeNode { + targetCoreObjects = append(targetCoreObjects, &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "node-1"}, + Spec: corev1.NodeSpec{Unschedulable: nodeUnschedulable}, + }) + } + + c, trackers := createController(stop, testNamespace, []runtime.Object{machine}, nil, targetCoreObjects, nil, false) + defer trackers.Stop() + waitForCacheSync(stop, c) + + retry, err := c.manageMachinePreservation(context.TODO(), machine) + + Expect(err).ToNot(HaveOccurred()) + Expect(retry).To(Equal(machineutils.LongRetry)) + + updatedMachine, err := c.controlMachineClient.Machines(testNamespace).Get(context.TODO(), machine.Name, metav1.GetOptions{}) + Expect(err).ToNot(HaveOccurred()) + Expect(updatedMachine.Annotations).To(Equal(machine.Annotations)) + Expect(updatedMachine.Labels).To(Equal(machine.Labels)) + Expect(updatedMachine.Spec).To(Equal(machine.Spec)) + Expect(updatedMachine.Status).To(Equal(machine.Status)) + + if includeNode { + updatedNode, err := c.targetCoreClient.CoreV1().Nodes().Get(context.TODO(), "node-1", metav1.GetOptions{}) + Expect(err).ToNot(HaveOccurred()) + Expect(updatedNode.Spec.Unschedulable).To(Equal(nodeUnschedulable)) + } + }, + Entry("node is cordoned", true, true), + Entry("node is uncordoned", false, true), + Entry("node retrieval fails (node not found)", false, false), + ) + }) + + Describe("#isMachinePreservationBound", func() { + type setup struct { + machineAnnotations map[string]string + preserveExpiryTime *metav1.Time + nodeName string + nodeAnnotations map[string]string + includeNode bool + } + type expect struct { + bound bool + err error + } + type testCase struct { + setup setup + expect expect + } + + DescribeTable("isMachinePreservationBound scenarios", + func(tc testCase) { + stop := make(chan struct{}) + defer close(stop) + + machine := &v1alpha1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: testNamespace, + Name: "m1", + Labels: map[string]string{v1alpha1.NodeLabelKey: tc.setup.nodeName}, + Annotations: tc.setup.machineAnnotations, + }, + Status: v1alpha1.MachineStatus{ + CurrentStatus: v1alpha1.CurrentStatus{ + PreserveExpiryTime: tc.setup.preserveExpiryTime, + }, + }, + } + + var targetCoreObjects []runtime.Object + if tc.setup.includeNode { + targetCoreObjects = append(targetCoreObjects, &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: tc.setup.nodeName, + Annotations: tc.setup.nodeAnnotations, + }, + }) + } + + c, trackers := createController(stop, testNamespace, []runtime.Object{machine}, nil, targetCoreObjects, nil, false) + defer trackers.Stop() + waitForCacheSync(stop, c) + + preserveInfo, err := c.getPreserveStateInfo(machine) + if tc.expect.err != nil { + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(Equal(tc.expect.err.Error())) + } else { + Expect(err).ToNot(HaveOccurred()) + } + bound := isMachinePreservationBound(preserveInfo) + Expect(bound).To(Equal(tc.expect.bound)) + }, + Entry("machine has no annotations, no preserveExpiryTime, and node has no preservation annotation", testCase{ + setup: setup{ + nodeName: "node-1", + includeNode: true, + }, + expect: expect{bound: false}, + }), + Entry("machine has no node label and no other preservation markers", testCase{ + setup: setup{ + nodeName: "", + }, + expect: expect{bound: false}, + }), + Entry("machine has preserve annotation set", testCase{ + setup: setup{ + nodeName: "node-1", + machineAnnotations: map[string]string{ + machineutils.PreserveMachineAnnotationKey: machineutils.PreserveMachineAnnotationValueNow, + }, + includeNode: true, + }, + expect: expect{bound: true}, + }), + Entry("machine has a non-nil preserveExpiryTime", testCase{ + setup: setup{ + nodeName: "node-1", + preserveExpiryTime: &metav1.Time{Time: metav1.Now().Add(1 * time.Hour)}, + includeNode: true, + }, + expect: expect{bound: true}, + }), + Entry("machine's node has a preservation annotation", testCase{ + setup: setup{ + nodeName: "node-1", + nodeAnnotations: map[string]string{ + machineutils.PreserveMachineAnnotationKey: machineutils.PreserveMachineAnnotationValueNow, + }, + includeNode: true, + }, + expect: expect{bound: true}, + }), + Entry("machine has last-applied node preserve value annotation", testCase{ + setup: setup{ + nodeName: "node-1", + machineAnnotations: map[string]string{ + machineutils.LastAppliedNodePreserveValueAnnotationKey: machineutils.PreserveMachineAnnotationValueNow, + }, + includeNode: true, + }, + expect: expect{bound: true}, + }), + Entry("node is not found and machine has no preservation markers", testCase{ + setup: setup{ + nodeName: "node-1", + includeNode: false, + }, + expect: expect{ + bound: false, + err: fmt.Errorf("node %q not found", "node-1"), + }, + }), + ) + }) + + Describe("#getPreserveStateInfo", func() { + type setup struct { + machineAnnotations map[string]string + preserveExpiryTime *metav1.Time + nodeName string + nodeAnnotations map[string]string + includeNode bool + } + type expect struct { + machineAnnotated bool + machineValue string + nodeAnnotated bool + nodeValue string + lastAppliedNodeValue string + preserveExpiryTimeSet bool + err error + } + type testCase struct { + setup setup + expect expect + } + + DescribeTable("getPreserveStateInfo scenarios", + func(tc testCase) { + stop := make(chan struct{}) + defer close(stop) + + machine := &v1alpha1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: testNamespace, + Name: "m1", + Labels: map[string]string{v1alpha1.NodeLabelKey: tc.setup.nodeName}, + Annotations: tc.setup.machineAnnotations, + }, + Status: v1alpha1.MachineStatus{ + CurrentStatus: v1alpha1.CurrentStatus{ + PreserveExpiryTime: tc.setup.preserveExpiryTime, + }, + }, + } + + var targetCoreObjects []runtime.Object + if tc.setup.includeNode { + targetCoreObjects = append(targetCoreObjects, &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: tc.setup.nodeName, + Annotations: tc.setup.nodeAnnotations, + }, + }) + } + + c, trackers := createController(stop, testNamespace, []runtime.Object{machine}, nil, targetCoreObjects, nil, false) + defer trackers.Stop() + waitForCacheSync(stop, c) + + info, err := c.getPreserveStateInfo(machine) + if tc.expect.err != nil { + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring(tc.expect.err.Error())) + } else { + Expect(err).ToNot(HaveOccurred()) + } + Expect(info.machineAnnotated).To(Equal(tc.expect.machineAnnotated)) + Expect(info.machineValue).To(Equal(tc.expect.machineValue)) + Expect(info.nodeAnnotated).To(Equal(tc.expect.nodeAnnotated)) + Expect(info.nodeValue).To(Equal(tc.expect.nodeValue)) + Expect(info.lastAppliedNodeValue).To(Equal(tc.expect.lastAppliedNodeValue)) + Expect(info.preserveExpiryTimeSet).To(Equal(tc.expect.preserveExpiryTimeSet)) + }, + Entry("machine has no annotations and no node label", testCase{ + setup: setup{}, + expect: expect{}, + }), + Entry("machine has preserve annotation and node is not annotated", testCase{ + setup: setup{ + nodeName: "node-1", + includeNode: true, + machineAnnotations: map[string]string{ + machineutils.PreserveMachineAnnotationKey: machineutils.PreserveMachineAnnotationValueNow, + }, + }, + expect: expect{ + machineAnnotated: true, + machineValue: machineutils.PreserveMachineAnnotationValueNow, + }, + }), + Entry("machine has last-applied node preserve value annotation but no preserve annotations on node or machine", testCase{ + setup: setup{ + nodeName: "node-1", + includeNode: true, + machineAnnotations: map[string]string{ + machineutils.LastAppliedNodePreserveValueAnnotationKey: machineutils.PreserveMachineAnnotationValueNow, + }, + }, + expect: expect{ + lastAppliedNodeValue: machineutils.PreserveMachineAnnotationValueNow, + }, + }), + Entry("machine has a non-nil preserveExpiryTime", testCase{ + setup: setup{ + nodeName: "node-1", + includeNode: true, + preserveExpiryTime: &metav1.Time{Time: metav1.Now().Add(1 * time.Hour)}, + }, + expect: expect{ + preserveExpiryTimeSet: true, + }, + }), + Entry("node has preserve annotation", testCase{ + setup: setup{ + nodeName: "node-1", + includeNode: true, + nodeAnnotations: map[string]string{ + machineutils.PreserveMachineAnnotationKey: machineutils.PreserveMachineAnnotationValueWhenFailed, + }, + }, + expect: expect{ + nodeAnnotated: true, + nodeValue: machineutils.PreserveMachineAnnotationValueWhenFailed, + }, + }), + Entry("node not found returns error and partial info", testCase{ + setup: setup{ + nodeName: "node-1", + includeNode: false, + machineAnnotations: map[string]string{ + machineutils.PreserveMachineAnnotationKey: machineutils.PreserveMachineAnnotationValueNow, + }, + }, + expect: expect{ + machineAnnotated: true, + machineValue: machineutils.PreserveMachineAnnotationValueNow, + err: fmt.Errorf("node-1"), + }, + }), + Entry("both machine and node have preserve annotations", testCase{ + setup: setup{ + nodeName: "node-1", + includeNode: true, + machineAnnotations: map[string]string{ + machineutils.PreserveMachineAnnotationKey: machineutils.PreserveMachineAnnotationValueNow, + machineutils.LastAppliedNodePreserveValueAnnotationKey: machineutils.PreserveMachineAnnotationValueWhenFailed, + }, + nodeAnnotations: map[string]string{ + machineutils.PreserveMachineAnnotationKey: machineutils.PreserveMachineAnnotationValueAutoPreserved, + }, + }, + expect: expect{ + machineAnnotated: true, + machineValue: machineutils.PreserveMachineAnnotationValueNow, + lastAppliedNodeValue: machineutils.PreserveMachineAnnotationValueWhenFailed, + nodeAnnotated: true, + nodeValue: machineutils.PreserveMachineAnnotationValueAutoPreserved, + }, + }), + ) + }) + + Describe("#shouldAnnotationsBeUpdatedOnMachine", func() { + type testCase struct { + removeAnnotations bool + preserveInfo *preserveStateInfo + expect bool + } + + DescribeTable("shouldAnnotationsBeUpdatedOnMachine scenarios", + func(tc testCase) { + Expect(shouldAnnotationsBeUpdatedOnMachine(tc.removeAnnotations, tc.preserveInfo)).To(Equal(tc.expect)) + }, + Entry("removeAnnotations=true: always returns false", testCase{ + removeAnnotations: true, + preserveInfo: &preserveStateInfo{nodeAnnotated: true, nodeValue: machineutils.PreserveMachineAnnotationValueNow}, + expect: false, + }), + Entry("node not annotated and no lastAppliedNodeValue: returns false", testCase{ + removeAnnotations: false, + preserveInfo: &preserveStateInfo{nodeAnnotated: false, lastAppliedNodeValue: ""}, + expect: false, + }), + Entry("node value unchanged and machine not annotated: returns false", testCase{ + removeAnnotations: false, + preserveInfo: &preserveStateInfo{ + nodeAnnotated: true, + nodeValue: machineutils.PreserveMachineAnnotationValueNow, + lastAppliedNodeValue: machineutils.PreserveMachineAnnotationValueNow, + machineAnnotated: false, + }, + expect: false, + }), + Entry("node value changed: returns true", testCase{ + removeAnnotations: false, + preserveInfo: &preserveStateInfo{ + nodeAnnotated: true, + nodeValue: machineutils.PreserveMachineAnnotationValueNow, + lastAppliedNodeValue: machineutils.PreserveMachineAnnotationValueWhenFailed, + machineAnnotated: false, + }, + expect: true, + }), + Entry("node value unchanged but machine is annotated: returns true", testCase{ + removeAnnotations: false, + preserveInfo: &preserveStateInfo{ + nodeAnnotated: true, + nodeValue: machineutils.PreserveMachineAnnotationValueNow, + lastAppliedNodeValue: machineutils.PreserveMachineAnnotationValueNow, + machineAnnotated: true, + }, + expect: true, + }), + Entry("lastAppliedNodeValue set without node annotation: returns true (machine annotation present)", testCase{ + removeAnnotations: false, + preserveInfo: &preserveStateInfo{ + nodeAnnotated: false, + nodeValue: "", + lastAppliedNodeValue: machineutils.PreserveMachineAnnotationValueNow, + machineAnnotated: true, + }, + expect: true, + }), + Entry("lastAppliedNodeValue set without node annotation and machine not annotated: returns true (node value drifted from last-applied)", testCase{ + removeAnnotations: false, + preserveInfo: &preserveStateInfo{ + nodeAnnotated: false, + nodeValue: "", + lastAppliedNodeValue: machineutils.PreserveMachineAnnotationValueNow, + machineAnnotated: false, + }, + expect: true, + }), + ) + }) + + Describe("#updatePreserveAnnotationOnMachine", func() { + type setup struct { + machineAnnotations map[string]string + } + type expect struct { + lastAppliedNodeValue string + preserveAnnotationGone bool + err error + } + type testCase struct { + setup setup + nodeValue string + expect expect + } + + DescribeTable("updatePreserveAnnotationOnMachine scenarios", + func(tc testCase) { + stop := make(chan struct{}) + defer close(stop) + + machine := &v1alpha1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: testNamespace, + Name: "m1", + Annotations: tc.setup.machineAnnotations, + }, + } + + c, trackers := createController(stop, testNamespace, []runtime.Object{machine}, nil, nil, nil, false) + defer trackers.Stop() + waitForCacheSync(stop, c) + + err := c.updatePreserveAnnotationOnMachine(context.TODO(), tc.nodeValue, machine) + if tc.expect.err != nil { + Expect(err).To(HaveOccurred()) + } else { + Expect(err).ToNot(HaveOccurred()) + updated, getErr := c.controlMachineClient.Machines(testNamespace).Get(context.TODO(), machine.Name, metav1.GetOptions{}) + Expect(getErr).ToNot(HaveOccurred()) + Expect(updated.Annotations[machineutils.LastAppliedNodePreserveValueAnnotationKey]).To(Equal(tc.expect.lastAppliedNodeValue)) + _, hasPreserve := updated.Annotations[machineutils.PreserveMachineAnnotationKey] + Expect(hasPreserve).To(Equal(!tc.expect.preserveAnnotationGone)) + } + }, + Entry("machine has no annotations and nodeValue='now': should set last-applied='now' and leave no preserve annotation", testCase{ + setup: setup{machineAnnotations: nil}, + nodeValue: machineutils.PreserveMachineAnnotationValueNow, + expect: expect{ + lastAppliedNodeValue: machineutils.PreserveMachineAnnotationValueNow, + preserveAnnotationGone: true, + }, + }), + Entry("machine has preserve='now' and nodeValue='now': should clear preserve annotation and set last-applied='now'", testCase{ + setup: setup{ + machineAnnotations: map[string]string{ + machineutils.PreserveMachineAnnotationKey: machineutils.PreserveMachineAnnotationValueNow, + }, + }, + nodeValue: machineutils.PreserveMachineAnnotationValueNow, + expect: expect{ + lastAppliedNodeValue: machineutils.PreserveMachineAnnotationValueNow, + preserveAnnotationGone: true, + }, + }), + Entry("machine has preserve='now' and last-applied='now' and nodeValue='': should clear preserve annotation and set last-applied=''", testCase{ + setup: setup{ + machineAnnotations: map[string]string{ + machineutils.PreserveMachineAnnotationKey: machineutils.PreserveMachineAnnotationValueNow, + machineutils.LastAppliedNodePreserveValueAnnotationKey: machineutils.PreserveMachineAnnotationValueNow, + }, + }, + nodeValue: "", + expect: expect{ + lastAppliedNodeValue: "", + preserveAnnotationGone: true, + }, + }), + Entry("machine has last-applied='now' and nodeValue='when-failed': should clear preserve annotation and overwrite last-applied='when-failed'", testCase{ + setup: setup{ + machineAnnotations: map[string]string{ + machineutils.LastAppliedNodePreserveValueAnnotationKey: machineutils.PreserveMachineAnnotationValueNow, + }, + }, + nodeValue: machineutils.PreserveMachineAnnotationValueWhenFailed, + expect: expect{ + lastAppliedNodeValue: machineutils.PreserveMachineAnnotationValueWhenFailed, + preserveAnnotationGone: true, + }, + }), ) }) }) diff --git a/pkg/util/provider/machinecontroller/machine_util.go b/pkg/util/provider/machinecontroller/machine_util.go index 7c08640b5c..770c9a5620 100644 --- a/pkg/util/provider/machinecontroller/machine_util.go +++ b/pkg/util/provider/machinecontroller/machine_util.go @@ -2333,7 +2333,7 @@ Utility Functions for Machine Preservation */ // preserveMachine contains logic to start the preservation of a machine and node. -func (c *controller) preserveMachine(ctx context.Context, machine *v1alpha1.Machine, preserveValue string) error { +func (c *controller) preserveMachine(ctx context.Context, machine *v1alpha1.Machine, preserveValue string) (*v1alpha1.Machine, error) { var err error nodeName := machine.Labels[v1alpha1.NodeLabelKey] if machine.Status.CurrentStatus.PreserveExpiryTime == nil { @@ -2341,29 +2341,29 @@ func (c *controller) preserveMachine(ctx context.Context, machine *v1alpha1.Mach // Step 1: Add preserveExpiryTime to machine status machine, err = c.setPreserveExpiryTimeOnMachine(ctx, machine) if err != nil { - return err + return machine, err } } if nodeName == "" { // Machine has no backing node, preservation is complete klog.V(2).Infof("Machine %q without backing node is preserved successfully till %v.", machine.Name, machine.Status.CurrentStatus.PreserveExpiryTime) - return nil + return machine, nil } // Machine has a backing node node, err := c.nodeLister.Get(nodeName) if err != nil { klog.Errorf("error trying to get node %q of machine %q: %v. Retrying.", nodeName, machine.Name, err) - return err + return machine, err } existingNodePreservedCondition := nodeops.GetCondition(node, v1alpha1.NodePreserved) // checks if preservation is already complete if existingNodePreservedCondition != nil && existingNodePreservedCondition.Status == v1.ConditionTrue { - return nil + return machine, nil } // Step 2: Add annotations to prevent scale down of node by CA updatedNode, err := c.addCAScaleDownDisabledAnnotationOnNode(ctx, node) if err != nil { - return err + return machine, err } var drainErr error if shouldPreservedNodeBeDrained(existingNodePreservedCondition, machine.Status.CurrentStatus.Phase) { @@ -2376,41 +2376,41 @@ func (c *controller) preserveMachine(ctx context.Context, machine *v1alpha1.Mach _, err = nodeops.AddOrUpdateConditionsOnNode(ctx, c.targetCoreClient, updatedNode.Name, *newCond) if drainErr != nil { klog.Errorf("error draining preserved node %q for machine %q : %v", nodeName, machine.Name, drainErr) - return drainErr + return machine, drainErr } if err != nil { klog.Errorf("error trying to update node preserved condition for node %q of machine %q : %v", nodeName, machine.Name, err) - return err + return machine, err } } - klog.V(2).Infof("Machine %q and backing node preserved successfully till %v.", machine.Name, machine.Status.CurrentStatus.PreserveExpiryTime) - return nil + klog.V(2).Infof("Machine %q and backing node %q preserved successfully till %v.", machine.Name, nodeName, machine.Status.CurrentStatus.PreserveExpiryTime) + return machine, nil } // stopPreservationIfActive stops the preservation of the machine and node, if preserved, and returns true if machine object has been updated -func (c *controller) stopPreservationIfActive(ctx context.Context, machine *v1alpha1.Machine, removePreservationAnnotations bool) (bool, error) { +func (c *controller) stopPreservationIfActive(ctx context.Context, machine *v1alpha1.Machine, removePreservationAnnotations bool) (*v1alpha1.Machine, error) { + var err error // removal of preserveExpiryTime is the last step of stopping preservation // therefore, if preserveExpiryTime is not set, machine is not preserved nodeName := machine.Labels[v1alpha1.NodeLabelKey] if machine.Status.CurrentStatus.PreserveExpiryTime == nil { - return false, nil + return machine, nil } // if there is no backing node if nodeName == "" { // remove annotation from machine if needed if removePreservationAnnotations { - var err error machine, err = c.removePreserveAnnotationOnMachine(ctx, machine) if err != nil { - return false, err + return nil, err } } - machine, err := c.clearMachinePreserveExpiryTime(ctx, machine) + machine, err = c.clearMachinePreserveExpiryTime(ctx, machine) if err != nil { - return false, err + return nil, err } klog.V(2).Infof("Preservation of machine %q with no backing node has stopped.", machine.Name) - return true, nil + return machine, nil } // Machine has a backing node @@ -2419,17 +2419,22 @@ func (c *controller) stopPreservationIfActive(ctx context.Context, machine *v1al // if node is not found and error is simply returned, then preservation will never be stopped on machine // therefore, this error is handled specifically if apierrors.IsNotFound(err) { - // Node not found, proceed to clear preserveExpiryTime on machine - klog.Warningf("Node %q of machine %q not found. Proceeding to clear PreserveExpiryTime on machine.", nodeName, machine.Name) - _, err := c.clearMachinePreserveExpiryTime(ctx, machine) + klog.Warningf("Node %q of machine %q not found. Proceeding to stop preservation on machine.", nodeName, machine.Name) + if removePreservationAnnotations { + machine, err = c.removePreserveAnnotationOnMachine(ctx, machine) + if err != nil { + return nil, err + } + } + machine, err = c.clearMachinePreserveExpiryTime(ctx, machine) if err != nil { - return false, err + return nil, err } klog.V(2).Infof("Preservation of machine %q has stopped.", machine.Name) - return true, nil + return machine, nil } klog.Errorf("error trying to get node %q of machine %q: %v. Retrying.", nodeName, machine.Name, err) - return false, err + return nil, err } // prepare NodeCondition to set preservation as stopped preservedConditionFalse := v1.NodeCondition{ @@ -2441,27 +2446,33 @@ func (c *controller) stopPreservationIfActive(ctx context.Context, machine *v1al // Step 1: change node condition to reflect that preservation has stopped updatedNode, err := nodeops.AddOrUpdateConditionsOnNode(ctx, c.targetCoreClient, node.Name, preservedConditionFalse) if err != nil { - return false, err + return nil, err } // Step 2: remove annotations from node - err = c.removePreservationRelatedAnnotationsOnNode(ctx, updatedNode, removePreservationAnnotations) + updatedNode, err = c.removePreservationRelatedAnnotationsOnNode(ctx, updatedNode, removePreservationAnnotations) if err != nil { - return false, err + return nil, err } // Step 3: remove annotation from machine if needed if removePreservationAnnotations { machine, err = c.removePreserveAnnotationOnMachine(ctx, machine) if err != nil { - return false, err + return nil, err } } - // Step 4: update machine status to set preserve expiry time to nil + // Step 4: uncordon the node if the machine has recovered to Running. + if machine.Status.CurrentStatus.Phase == v1alpha1.MachineRunning { + if err = c.uncordonNodeIfCordoned(ctx, updatedNode); err != nil { + return nil, err + } + } + // Step 5: update machine status to set preserve expiry time to nil machine, err = c.clearMachinePreserveExpiryTime(ctx, machine) if err != nil { - return false, err + return nil, err } klog.V(2).Infof("Preservation of machine %q with backing node %q has stopped.", machine.Name, nodeName) - return true, nil + return machine, nil } // setPreserveExpiryTimeOnMachine sets the PreserveExpiryTime on the machine object's Status.CurrentStatus to now + preserve timeout @@ -2515,7 +2526,7 @@ func computeNewNodePreservedCondition(currentStatus v1alpha1.CurrentStatus, pres newNodePreservedCondition.Message = fmt.Sprintf("%s %v.", preserveExpiryMessageSuffix, currentStatus.PreserveExpiryTime) needsUpdate = true } - if preserveValue == machineutils.PreserveMachineAnnotationValuePreservedByMCM { + if preserveValue == machineutils.PreserveMachineAnnotationValueAutoPreserved { newNodePreservedCondition.Reason = v1alpha1.PreservedByMCM } else { newNodePreservedCondition.Reason = v1alpha1.PreservedByUser diff --git a/pkg/util/provider/machinecontroller/machine_util_test.go b/pkg/util/provider/machinecontroller/machine_util_test.go index c180ddc44d..7e58a56b1a 100644 --- a/pkg/util/provider/machinecontroller/machine_util_test.go +++ b/pkg/util/provider/machinecontroller/machine_util_test.go @@ -4024,7 +4024,7 @@ var _ = Describe("machine_util", func() { c, trackers := createController(stop, testNamespace, controlMachineObjects, nil, targetCoreObjects, nil, false) defer trackers.Stop() waitForCacheSync(stop, c) - err := c.preserveMachine(context.TODO(), machine, tc.setup.preserveValue) + _, err := c.preserveMachine(context.TODO(), machine, tc.setup.preserveValue) if tc.expect.err == nil { Expect(err).To(BeNil()) } else { @@ -4097,24 +4097,6 @@ var _ = Describe("machine_util", func() { }, }, }), - Entry("when preserve=now, the machine has Failed, and the preservation is incomplete after step 1 - adding preserveExpiryTime", &testCase{ - setup: setup{ - machinePhase: machinev1.MachineFailed, - nodeName: "node-1", - preserveValue: machineutils.PreserveMachineAnnotationValueNow, - }, - expect: expect{ - err: nil, - isPreserveExpiryTimeSet: true, - isCAAnnotationPresent: true, - preserveNodeCondition: corev1.NodeCondition{ - Type: machinev1.NodePreserved, - Status: corev1.ConditionTrue, - Reason: machinev1.PreservedByUser, - Message: machinev1.PreservedNodeDrainSuccessful, - }, - }, - }), Entry("when preserve=now, the machine has Failed, and the preservation is incomplete at step 2 - adding CA annotations", &testCase{ setup: setup{ machinePhase: machinev1.MachineFailed, @@ -4181,7 +4163,7 @@ var _ = Describe("machine_util", func() { setup: setup{ machinePhase: machinev1.MachineFailed, nodeName: "node-1", - preserveValue: machineutils.PreserveMachineAnnotationValuePreservedByMCM, + preserveValue: machineutils.PreserveMachineAnnotationValueAutoPreserved, }, expect: expect{ err: nil, @@ -4442,7 +4424,7 @@ var _ = Describe("machine_util", func() { LastUpdateTime: metav1.Now(), PreserveExpiryTime: preserveExpiryTime, }, - preserveValue: machineutils.PreserveMachineAnnotationValuePreservedByMCM, + preserveValue: machineutils.PreserveMachineAnnotationValueAutoPreserved, drainErr: nil, existingNodeCondition: nil, }, @@ -4624,7 +4606,7 @@ var _ = Describe("machine_util", func() { c, trackers := createController(stop, testNamespace, nil, nil, targetCoreObjects, nil, false) defer trackers.Stop() waitForCacheSync(stop, c) - err := c.removePreservationRelatedAnnotationsOnNode(context.TODO(), node, tc.setup.removePreserveAnnotation) + _, err := c.removePreservationRelatedAnnotationsOnNode(context.TODO(), node, tc.setup.removePreserveAnnotation) waitForCacheSync(stop, c) updatedNode, getErr := c.targetCoreClient.CoreV1().Nodes().Get(context.TODO(), node.Name, metav1.GetOptions{}) Expect(getErr).To(BeNil()) diff --git a/pkg/util/provider/machinecontroller/node.go b/pkg/util/provider/machinecontroller/node.go index 5a2f77ff94..174805c7c1 100644 --- a/pkg/util/provider/machinecontroller/node.go +++ b/pkg/util/provider/machinecontroller/node.go @@ -340,40 +340,26 @@ func addedOrRemovedEssentialTaints(oldNode, node *corev1.Node, taintKeys []strin return false } -func (c *controller) getNodePreserveAnnotationValue(nodeName string) (string, error) { - node, err := c.nodeLister.Get(nodeName) - if err != nil { - if apierrors.IsNotFound(err) { - return "", nil - } - klog.Errorf("error fetching node %q: %v", nodeName, err) - return "", err - } - return node.Annotations[machineutils.PreserveMachineAnnotationKey], nil -} - -func (c *controller) uncordonNodeIfCordoned(ctx context.Context, nodeName string) error { - node, err := c.nodeLister.Get(nodeName) - if err != nil { - return err - } +func (c *controller) uncordonNodeIfCordoned(ctx context.Context, node *corev1.Node) error { if !node.Spec.Unschedulable { return nil } nodeClone := node.DeepCopy() nodeClone.Spec.Unschedulable = false - _, err = c.targetCoreClient.CoreV1().Nodes().Update(ctx, nodeClone, metav1.UpdateOptions{}) + _, err := c.targetCoreClient.CoreV1().Nodes().Update(ctx, nodeClone, metav1.UpdateOptions{}) if err != nil { - klog.Errorf("error uncordoning node %q: %v", nodeName, err) + klog.Errorf("error uncordoning node %q: %v", node.Name, err) + return err } - return err + klog.Infof("Successfully uncordoned node %q", node.Name) + return nil } // removePreservationRelatedAnnotationsOnNode removes the cluster-autoscaler annotation that disables scale down of preserved node -func (c *controller) removePreservationRelatedAnnotationsOnNode(ctx context.Context, node *corev1.Node, removePreserveAnnotation bool) error { +func (c *controller) removePreservationRelatedAnnotationsOnNode(ctx context.Context, node *corev1.Node, removePreserveAnnotation bool) (*corev1.Node, error) { // Check if annotation already absent if node.Annotations == nil { - return nil + return node, nil } updateRequired := false nodeCopy := node.DeepCopy() @@ -389,14 +375,14 @@ func (c *controller) removePreservationRelatedAnnotationsOnNode(ctx context.Cont updateRequired = true } if !updateRequired { - return nil + return node, nil } - _, err := c.targetCoreClient.CoreV1().Nodes().Update(ctx, nodeCopy, metav1.UpdateOptions{}) + updatedNode, err := c.targetCoreClient.CoreV1().Nodes().Update(ctx, nodeCopy, metav1.UpdateOptions{}) if err != nil { klog.Errorf("node UPDATE failed for node %q. Retrying, error: %s", node.Name, err) - return err + return nil, err } - return nil + return updatedNode, nil } // addCAScaleDownDisabledAnnotationOnNode adds the cluster-autoscaler annotation to disable scale down of preserved node diff --git a/pkg/util/provider/machineutils/utils.go b/pkg/util/provider/machineutils/utils.go index b56da7cc23..1c0f99af3d 100644 --- a/pkg/util/provider/machineutils/utils.go +++ b/pkg/util/provider/machineutils/utils.go @@ -7,6 +7,8 @@ package machineutils import ( "context" + "time" + "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1" v1alpha1client "github.com/gardener/machine-controller-manager/pkg/client/clientset/versioned/typed/machine/v1alpha1" v1alpha1listers "github.com/gardener/machine-controller-manager/pkg/client/listers/machine/v1alpha1" @@ -16,7 +18,6 @@ import ( "k8s.io/apimachinery/pkg/util/sets" "k8s.io/client-go/util/retry" "k8s.io/klog/v2" - "time" ) const ( @@ -108,23 +109,19 @@ const ( // a Machine be preserved if and when it enters Failed phase PreserveMachineAnnotationValueWhenFailed = "when-failed" - // PreserveMachineAnnotationValuePreservedByMCM is the annotation value used by the machineset controller to + // PreserveMachineAnnotationValueAutoPreserved is the annotation value used by the machineset controller to // indicate to the machine controller that the machine must be auto-preserved. // The AutoPreserveFailedMachineMax, set on the MCD, is enforced based on the number of machines annotated with this value. - PreserveMachineAnnotationValuePreservedByMCM = "auto-preserved" + PreserveMachineAnnotationValueAutoPreserved = "auto-preserved" // PreserveMachineAnnotationValueFalse is the annotation value used to // 1) indicate to MCM that a machine must not be auto-preserved on failure - // and, 2) to stop auto-preservation of a machine that is already auto-preserved by MCM. + // and, 2) to stop preservation of a machine that is already preserved by MCM. PreserveMachineAnnotationValueFalse = "false" ) // AllowedPreserveAnnotationValues contains the allowed values for the preserve annotation -var AllowedPreserveAnnotationValues = sets.New(PreserveMachineAnnotationValueNow, PreserveMachineAnnotationValueWhenFailed, PreserveMachineAnnotationValuePreservedByMCM, PreserveMachineAnnotationValueFalse, "") - -// PreventAutoPreserveAnnotationValues contains the values to check if a machine is already annotated for preservation, -// in which case it should not be auto-preserved. -var PreventAutoPreserveAnnotationValues = sets.New(PreserveMachineAnnotationValueNow, PreserveMachineAnnotationValueWhenFailed, PreserveMachineAnnotationValuePreservedByMCM, PreserveMachineAnnotationValueFalse) +var AllowedPreserveAnnotationValues = sets.New(PreserveMachineAnnotationValueNow, PreserveMachineAnnotationValueWhenFailed, PreserveMachineAnnotationValueAutoPreserved, PreserveMachineAnnotationValueFalse) // RetryPeriod is an alias for specifying the retry period type RetryPeriod time.Duration