-
Notifications
You must be signed in to change notification settings - Fork 137
Fix preservation reconciliation to prevent deviation from regular flow for non-preservation-bound machines #1111
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: rel-v0.62
Are you sure you want to change the base?
Changes from all commits
4db7dfc
60af619
cea40c2
de02bfd
a3b7133
daaa433
5f143c5
4188e83
5a57dd4
0d7c29b
5e258e0
0c147c9
e027119
200ce00
08b6cd3
cf6d15f
cb105e7
91b9476
5ab7c92
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -744,22 +744,27 @@ func (c *controller) isCreationProcessing(machine *v1alpha1.Machine) bool { | |
| Machine Preservation operations | ||
| */ | ||
|
|
||
| // preserveStateInfo encapsulates the preservation annotation values found | ||
| // on the machine and node objects, along with the effective preservation value for the machine | ||
| // and the last applied node preserve value by MCM. | ||
| type preserveStateInfo struct { | ||
| nodeAnnotated bool | ||
| machineAnnotated bool | ||
| nodeValue string | ||
| machineValue string | ||
| lastAppliedNodeValue string | ||
| preserveExpiryTimeSet bool | ||
| } | ||
|
|
||
| // manageMachinePreservation manages machine preservation based on the preserve annotation values on the node and machine objects. | ||
| func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1alpha1.Machine) (retry machineutils.RetryPeriod, err error) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can split this function such that one handles the case where
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will raise a separate PR for refactoring. Since the purpose of this is to play it safe and quickly deliver a fix without introducing any new regressions, I would like to leave this as-is for now. |
||
| machineAnnotationsSynced := false | ||
| clone := machine.DeepCopy() | ||
| defer func() { | ||
| // This needs to be done for cases when machine is neither preserved nor un-preserved (e.g. when preserve changes from now to when-failed on a Failed machine), | ||
| // but the LastAppliedNodePreserveValueAnnotation needs to be synced to the machine object. | ||
| // We compare annotation value in the clone with the original machine object to see if an update is required | ||
| if err == nil && !machineAnnotationsSynced && clone.Annotations[machineutils.LastAppliedNodePreserveValueAnnotationKey] != machine.Annotations[machineutils.LastAppliedNodePreserveValueAnnotationKey] { | ||
| _, err = c.controlMachineClient.Machines(clone.Namespace).Update(ctx, clone, metav1.UpdateOptions{}) | ||
| if err != nil { | ||
| klog.Errorf("error updating LastAppliedNodePreserveValueAnnotation value on machine %q: %v", machine.Name, err) | ||
| } | ||
| } | ||
| if err != nil { | ||
| if apierrors.IsConflict(err) { | ||
| if apierrors.IsNotFound(err) { | ||
| klog.Warningf("Error during preservation flow: %v", err.Error()) | ||
| retry = machineutils.LongRetry | ||
|
gagan16k marked this conversation as resolved.
|
||
| err = nil | ||
| } else if apierrors.IsConflict(err) { | ||
| retry = machineutils.ConflictRetry | ||
| } else { | ||
| retry = machineutils.ShortRetry | ||
|
|
@@ -768,76 +773,182 @@ func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1a | |
| retry = machineutils.LongRetry | ||
| } | ||
| }() | ||
|
|
||
| nodeName := clone.Labels[v1alpha1.NodeLabelKey] | ||
| nodeAnnotationValue := "" | ||
| if nodeName != "" { | ||
| nodeAnnotationValue, err = c.getNodePreserveAnnotationValue(nodeName) | ||
| if err != nil { | ||
| return | ||
| } | ||
| nodeName := machine.Labels[v1alpha1.NodeLabelKey] | ||
| // We buffer the error returned here until we can tell if the machine is preservation-bound. | ||
| preserveInfo, getErr := c.getPreserveStateInfo(machine) | ||
| if preserveInfo.machineAnnotated && !machineutils.AllowedPreserveAnnotationValues.Has(preserveInfo.machineValue) { | ||
| // If machine is annotated incorrectly, log and proceed as though machine is not annotated. | ||
| klog.Warningf("Preserve annotation %q=%q on machine %q is invalid", machineutils.PreserveMachineAnnotationKey, preserveInfo.machineValue, machine.Name) | ||
| preserveInfo.machineAnnotated = false | ||
| preserveInfo.machineValue = "" | ||
| } | ||
| if preserveInfo.nodeAnnotated && !machineutils.AllowedPreserveAnnotationValues.Has(preserveInfo.nodeValue) { | ||
| klog.Warningf("Preserve annotation %q=%q on node %q backing machine %q is invalid", machineutils.PreserveMachineAnnotationKey, preserveInfo.nodeValue, nodeName, machine.Name) | ||
| return | ||
| } | ||
| var effectivePreserveValue string | ||
| effectivePreserveValue, clone.Annotations = getEffectivePreservationAnnotations(nodeAnnotationValue, clone.Annotations) | ||
| if !machineutils.AllowedPreserveAnnotationValues.Has(effectivePreserveValue) { | ||
| if effectivePreserveValue == nodeAnnotationValue { | ||
| klog.Warningf("Preserve annotation value %q on node %q is invalid", effectivePreserveValue, nodeName) | ||
| } else { | ||
| klog.Warningf("Preserve annotation value %q on machine %q is invalid", effectivePreserveValue, clone.Name) | ||
| } | ||
| preservationBound := isMachinePreservationBound(preserveInfo) | ||
| if !preservationBound { | ||
| // We clear the error here to prevent preservation logic from interfering with non-preservation-bound machines. | ||
| err = nil | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do you need to set
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You're right. The earlier return defeats the purpose of this. I will probably need to buffer the error until we can determine whether or not the machine is preservation-bound. Thanks
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Addressed in cb105e7. The error from |
||
| return | ||
| } else if getErr != nil { | ||
| if !apierrors.IsNotFound(getErr) { | ||
| err = getErr | ||
| return | ||
| } | ||
| klog.Warningf("Couldn't find node %q for machine %q", nodeName, machine.Name) | ||
| err = nil | ||
| } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can move this check right after
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We want to check if a machine is preservation-bound at all before we return an error. This is being done so that we don't alter the flow for machines that are not preservation-bound. This is the current issue that the PR is trying to solve. |
||
| // Note: when the backing node cannot be found, we assume the machine's annotation value needs to be enforced to enable | ||
| // preservation of the machine object. | ||
| effectivePreserveValue := getEffectivePreservationAnnotations(preserveInfo, getErr) | ||
|
|
||
| var removeAnnotations bool | ||
| clone := machine.DeepCopy() | ||
|
gagan16k marked this conversation as resolved.
|
||
| switch effectivePreserveValue { | ||
| case "", machineutils.PreserveMachineAnnotationValueFalse: | ||
| machineAnnotationsSynced, err = c.stopPreservationIfActive(ctx, clone, false) | ||
| clone, err = c.stopPreservationIfActive(ctx, clone, removeAnnotations) | ||
| case machineutils.PreserveMachineAnnotationValueWhenFailed: | ||
| // on timing out, remove preserve annotation to prevent incorrect re-preservation | ||
| if machineutils.IsMachinePreservationExpired(clone) { | ||
| machineAnnotationsSynced, err = c.stopPreservationIfActive(ctx, clone, true) | ||
| removeAnnotations = true | ||
| clone, err = c.stopPreservationIfActive(ctx, clone, removeAnnotations) | ||
| } else if !machineutils.IsMachineFailed(clone) { | ||
| machineAnnotationsSynced, err = c.stopPreservationIfActive(ctx, clone, false) | ||
| clone, err = c.stopPreservationIfActive(ctx, clone, removeAnnotations) | ||
| } else { | ||
| err = c.preserveMachine(ctx, clone, effectivePreserveValue) | ||
| clone, err = c.preserveMachine(ctx, clone, effectivePreserveValue) | ||
| } | ||
| case machineutils.PreserveMachineAnnotationValueNow: | ||
| if machineutils.IsMachinePreservationExpired(clone) { | ||
| // on timing out, remove preserve annotation to prevent incorrect re-preservation | ||
| machineAnnotationsSynced, err = c.stopPreservationIfActive(ctx, clone, true) | ||
| removeAnnotations = true | ||
| clone, err = c.stopPreservationIfActive(ctx, clone, removeAnnotations) | ||
| } else { | ||
| err = c.preserveMachine(ctx, clone, effectivePreserveValue) | ||
| clone, err = c.preserveMachine(ctx, clone, effectivePreserveValue) | ||
| } | ||
| case machineutils.PreserveMachineAnnotationValuePreservedByMCM: | ||
| case machineutils.PreserveMachineAnnotationValueAutoPreserved: | ||
| if !machineutils.IsMachineFailed(clone) || machineutils.IsMachinePreservationExpired(clone) { | ||
| // To prevent incorrect re-preservation of a recovered, previously auto-preserved machine on future failures | ||
| // (since the autoPreserveFailedMachineCount maintained by the machineSetController, may have changed), | ||
| // in addition to stopping preservation, we also remove the preservation annotation on the machine. | ||
| machineAnnotationsSynced, err = c.stopPreservationIfActive(ctx, clone, true) | ||
| removeAnnotations = true | ||
| clone, err = c.stopPreservationIfActive(ctx, clone, removeAnnotations) | ||
| } else { | ||
| err = c.preserveMachine(ctx, clone, effectivePreserveValue) | ||
| clone, err = c.preserveMachine(ctx, clone, effectivePreserveValue) | ||
| } | ||
| } | ||
| if err != nil { | ||
| return | ||
| } | ||
| // This is to handle the case where a preserved machine recovers from Failed to Running | ||
| // in which case, pods should be allowed to be scheduled onto the node | ||
| if machineutils.IsMachineActive(clone) && nodeName != "" { | ||
| err = c.uncordonNodeIfCordoned(ctx, nodeName) | ||
| // For the preserve=now path (preservation still active), uncordon the node if the machine | ||
| // has recovered to Running. For all other cases, stopPreservationIfActive path handles uncordon internally. | ||
| if clone.Status.CurrentStatus.PreserveExpiryTime != nil && clone.Status.CurrentStatus.Phase == v1alpha1.MachineRunning && nodeName != "" { | ||
| var node *corev1.Node | ||
| node, err = c.nodeLister.Get(nodeName) | ||
| if err != nil { | ||
| if apierrors.IsNotFound(err) { | ||
| err = nil | ||
| return | ||
| } | ||
| return | ||
| } | ||
| err = c.uncordonNodeIfCordoned(ctx, node) | ||
| if err != nil { | ||
| return | ||
| } | ||
| } | ||
|
|
||
| if shouldAnnotationsBeUpdatedOnMachine(removeAnnotations, preserveInfo) { | ||
| err = c.updatePreserveAnnotationOnMachine(ctx, preserveInfo.nodeValue, clone) | ||
| } | ||
| return | ||
| } | ||
|
|
||
| // getEffectivePreservationAnnotations returns the effective preservation value, and updates the machine Annotations related to preservation | ||
| func getEffectivePreservationAnnotations(nodeAnnotationValue string, machineAnnotations map[string]string) (string, map[string]string) { | ||
| // getEffectivePreservationAnnotations returns the effective preservation value. | ||
| // | ||
| // If there is no active node annotation AND no previously-applied node annotation, | ||
| // enforce machine's preserve annotation. | ||
| // Otherwise, the node annotation takes precedence (even if now empty/removed). | ||
| // | ||
| // lastAppliedNodeValue is required to handle the following scenario: | ||
| // | ||
| // T1: Node and Machine both have the same annotation with the same value. (MCM is up and running). | ||
| // T2 (T2 > T1): MCM went down. | ||
| // T3 (T3 > T2): Node annotation was removed. | ||
| // T4 (T4 > T3): MCM came back up. | ||
| // At T4 it sees a Node with no preserve annotation but a Machine with a preserve annotation. | ||
| // It continues to preserve the machine. | ||
| func getEffectivePreservationAnnotations(info *preserveStateInfo, getPreserveStateErr error) string { | ||
| // If the node cannot be found, nodeValue is "". | ||
| // In this case, we want the machine's annotation value to be enforced. | ||
| if apierrors.IsNotFound(getPreserveStateErr) { | ||
| return info.machineValue | ||
| } | ||
| // If there is no active node annotation AND no previously-applied node annotation, | ||
| // enforce machine's preserve annotation. | ||
| // Otherwise, the node annotation takes precedence (even if now empty/removed). | ||
| if nodeAnnotationValue == "" && machineAnnotations[machineutils.LastAppliedNodePreserveValueAnnotationKey] == "" { | ||
| return machineAnnotations[machineutils.PreserveMachineAnnotationKey], machineAnnotations | ||
| } | ||
| clonedMachineAnnotations := make(map[string]string) | ||
| maps.Copy(clonedMachineAnnotations, machineAnnotations) | ||
| delete(clonedMachineAnnotations, machineutils.PreserveMachineAnnotationKey) | ||
| clonedMachineAnnotations[machineutils.LastAppliedNodePreserveValueAnnotationKey] = nodeAnnotationValue | ||
| return nodeAnnotationValue, clonedMachineAnnotations | ||
| if info.nodeValue == "" && info.lastAppliedNodeValue == "" { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we have to also check for
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please check the comment above (sorry, it's long). It gives an example scenario of why lastAppliedNodeValue is needed. If MCM goes down, we won't be able to tell if node was never annotated or if node annotation was deleted. |
||
| return info.machineValue | ||
| } | ||
| return info.nodeValue | ||
| } | ||
|
|
||
| func isMachinePreservationBound(info *preserveStateInfo) bool { | ||
| // if machine has no preservation state, the machine is not preservation-bound | ||
| if !info.preserveExpiryTimeSet && !info.nodeAnnotated && !info.machineAnnotated && info.lastAppliedNodeValue == "" { | ||
|
r4mek marked this conversation as resolved.
|
||
| return false | ||
| } | ||
| return true | ||
| } | ||
|
|
||
| func (c *controller) getPreserveStateInfo(machine *v1alpha1.Machine) (*preserveStateInfo, error) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please consider writing some unit tests for the new helper methods that you introduced
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Addressed in cb105e7 |
||
| var info preserveStateInfo | ||
| if machine.Annotations != nil { | ||
| info.machineValue, info.machineAnnotated = machine.Annotations[machineutils.PreserveMachineAnnotationKey] | ||
| info.lastAppliedNodeValue = machine.Annotations[machineutils.LastAppliedNodePreserveValueAnnotationKey] | ||
| } | ||
| if machine.Status.CurrentStatus.PreserveExpiryTime != nil { | ||
| info.preserveExpiryTimeSet = true | ||
| } | ||
| nodeName := machine.Labels[v1alpha1.NodeLabelKey] | ||
| if nodeName != "" { | ||
| node, err := c.nodeLister.Get(nodeName) | ||
| if err != nil { | ||
| return &info, err | ||
| } | ||
| info.nodeValue, info.nodeAnnotated = node.Annotations[machineutils.PreserveMachineAnnotationKey] | ||
|
gagan16k marked this conversation as resolved.
|
||
| } | ||
| return &info, nil | ||
| } | ||
|
|
||
| // shouldAnnotationsBeUpdatedOnMachine returns true when the machine's annotation tracking needs | ||
| // to be synced after a preservation action. | ||
| func shouldAnnotationsBeUpdatedOnMachine(removeAnnotations bool, preserveInfo *preserveStateInfo) bool { | ||
| // annotations were already removed by stopPreservationIfActive — nothing left to sync | ||
| if removeAnnotations { | ||
| return false | ||
| } | ||
| // node annotation is not in control — machine annotation prevails, no sync needed | ||
| if !preserveInfo.nodeAnnotated && preserveInfo.lastAppliedNodeValue == "" { | ||
| return false | ||
| } | ||
| // node value is unchanged and machine has no annotation to clear — nothing has changed | ||
| if preserveInfo.nodeValue == preserveInfo.lastAppliedNodeValue && !preserveInfo.machineAnnotated { | ||
| return false | ||
| } | ||
| return true | ||
| } | ||
|
|
||
| // updatePreserveAnnotationOnMachine clears the machine's PreserveMachineAnnotationKey and sets | ||
| // [machineutils.LastAppliedNodePreserveValueAnnotationKey] to nodeValue. | ||
| func (c *controller) updatePreserveAnnotationOnMachine(ctx context.Context, nodeValue string, machine *v1alpha1.Machine) error { | ||
| clone := machine.DeepCopy() | ||
| if clone.Annotations == nil { | ||
| clone.Annotations = make(map[string]string) | ||
| } else { | ||
| delete(clone.Annotations, machineutils.PreserveMachineAnnotationKey) | ||
| } | ||
| clone.Annotations[machineutils.LastAppliedNodePreserveValueAnnotationKey] = nodeValue | ||
| _, err := c.controlMachineClient.Machines(clone.Namespace).Update(ctx, clone, metav1.UpdateOptions{}) | ||
| return err | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since
""annotation value is not valid anymore, we can removenodeAnnotatedandmachineAnnotatedbooleans. Annotation value is""implies that annotation is not set.Also, can we rename
nodeValue,machineValuetonodeAnnotationValue,machineAnnotationValueresp.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The only way to distinguish between node/machine being annotated invalidly with "", or whether the annotation is unset(valid- means deletion of annotation), is by checking nodeAnnotated/machineAnnotated.