Skip to content

Commit a88faa8

Browse files
Add in-place updates support for machine controller
Signed-off-by: Alexandr Demicev <[email protected]>
1 parent 82f5743 commit a88faa8

File tree

6 files changed

+297
-7
lines changed

6 files changed

+297
-7
lines changed

api/core/v1beta2/common_types.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,11 @@ const (
181181
// This annotation can be used to inform MachinePool status during in-progress scaling scenarios.
182182
ReplicasManagedByAnnotation = "cluster.x-k8s.io/replicas-managed-by"
183183

184+
// InPlaceUpdateInProgressAnnotation is set on Machine, InfraMachine, and BootstrapConfig when an in-place update is in progress.
185+
// The Machine controller waits for all three objects to have this annotation before starting the update.
186+
// The Machine controller removes this annotation when the update is complete.
187+
InPlaceUpdateInProgressAnnotation = "cluster.x-k8s.io/in-place-update-in-progress"
188+
184189
// AutoscalerMinSizeAnnotation defines the minimum node group size.
185190
// The annotation is used by autoscaler.
186191
// The annotation is copied from kubernetes/autoscaler.

api/core/v1beta2/machine_types.go

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,31 @@ const (
153153
MachineNotUpToDateReason = "NotUpToDate"
154154
)
155155

156+
// Machine's Updating condition and corresponding reasons.
157+
// Note: the Updating condition is set by the Machine controller during in-place updates.
158+
const (
159+
// MachineUpdatingCondition is true while an in-place update is in progress on the Machine.
160+
// The condition is owned by the Machine controller and is used to track the progress of in-place updates.
161+
// This condition is considered when computing the UpToDate condition.
162+
MachineUpdatingCondition = "Updating"
163+
164+
// MachineNotUpdatingReason surfaces when the Machine is not performing an in-place update.
165+
MachineNotUpdatingReason = "NotUpdating"
166+
167+
// MachineWaitingForInPlaceUpdateAnnotationsReason surfaces when the Machine is waiting for
168+
// the InfraMachine and the BootstrapConfig to be annotated for in-place update.
169+
MachineWaitingForInPlaceUpdateAnnotationsReason = "WaitingForInPlaceUpdateAnnotations"
170+
171+
// MachineWaitingForUpdateMachineHookReason surfaces when the Machine is waiting for the UpdateMachine hook to complete.
172+
MachineWaitingForUpdateMachineHookReason = "WaitingForUpdateMachineHook"
173+
174+
// MachineUpdateFailedReason surfaces when the in-place update has failed.
175+
MachineUpdateFailedReason = "UpdateFailed"
176+
177+
// MachineUpdatingInternalErrorReason surfaces unexpected failures during in-place update.
// It reuses the value of the shared InternalErrorReason constant.
178+
MachineUpdatingInternalErrorReason = InternalErrorReason
179+
)
180+
156181
// Machine's BootstrapConfigReady condition and corresponding reasons.
157182
// Note: when possible, BootstrapConfigReady condition will use reasons surfaced from the underlying bootstrap config object.
158183
const (

controllers/alias.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,10 @@ func (r *ClusterReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manag
7272

7373
// MachineReconciler reconciles a Machine object.
7474
type MachineReconciler struct {
75-
Client client.Client
76-
APIReader client.Reader
77-
ClusterCache clustercache.ClusterCache
75+
Client client.Client
76+
APIReader client.Reader
77+
ClusterCache clustercache.ClusterCache
78+
RuntimeClient runtimeclient.Client
7879

7980
// WatchFilterValue is the label value used to filter events prior to reconciliation.
8081
WatchFilterValue string
@@ -90,6 +91,7 @@ func (r *MachineReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manag
9091
Client: r.Client,
9192
APIReader: r.APIReader,
9293
ClusterCache: r.ClusterCache,
94+
RuntimeClient: r.RuntimeClient,
9395
WatchFilterValue: r.WatchFilterValue,
9496
RemoteConditionsGracePeriod: r.RemoteConditionsGracePeriod,
9597
AdditionalSyncMachineLabels: r.AdditionalSyncMachineLabels,

internal/controllers/machine/machine_controller.go

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ import (
5252
"sigs.k8s.io/cluster-api/controllers/clustercache"
5353
"sigs.k8s.io/cluster-api/controllers/external"
5454
"sigs.k8s.io/cluster-api/controllers/noderefutil"
55+
runtimeclient "sigs.k8s.io/cluster-api/exp/runtime/client"
5556
"sigs.k8s.io/cluster-api/feature"
5657
"sigs.k8s.io/cluster-api/internal/contract"
5758
"sigs.k8s.io/cluster-api/internal/controllers/machine/drain"
@@ -93,9 +94,10 @@ var (
9394

9495
// Reconciler reconciles a Machine object.
9596
type Reconciler struct {
96-
Client client.Client
97-
APIReader client.Reader
98-
ClusterCache clustercache.ClusterCache
97+
Client client.Client
98+
APIReader client.Reader
99+
ClusterCache clustercache.ClusterCache
100+
RuntimeClient runtimeclient.Client
99101

100102
// WatchFilterValue is the label value used to filter events prior to reconciliation.
101103
WatchFilterValue string
@@ -129,6 +131,9 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt
129131
// to have some buffer.
130132
return errors.New("Client, APIReader and ClusterCache must not be nil and RemoteConditionsGracePeriod must not be < 2m")
131133
}
134+
if feature.Gates.Enabled(feature.InPlaceUpdates) && r.RuntimeClient == nil {
135+
return errors.New("RuntimeClient must not be nil when InPlaceUpdates feature gate is enabled")
136+
}
132137

133138
r.predicateLog = ptr.To(ctrl.LoggerFrom(ctx).WithValues("controller", "machine"))
134139
clusterToMachines, err := util.ClusterToTypedObjectsMapper(mgr.GetClient(), &clusterv1.MachineList{}, mgr.GetScheme())
@@ -282,7 +287,12 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Re
282287
}
283288

284289
// Handle normal reconciliation loop.
285-
return doReconcile(ctx, alwaysReconcile, s)
290+
reconcileNormal := append(
291+
alwaysReconcile,
292+
r.reconcileInPlaceUpdate,
293+
)
294+
295+
return doReconcile(ctx, reconcileNormal, s)
286296
}
287297

288298
func patchMachine(ctx context.Context, patchHelper *patch.Helper, machine *clusterv1.Machine, options ...patch.Option) error {
@@ -326,6 +336,7 @@ func patchMachine(ctx context.Context, patchHelper *patch.Helper, machine *clust
326336
clusterv1.MachineNodeReadyCondition,
327337
clusterv1.MachineNodeHealthyCondition,
328338
clusterv1.MachineDeletingCondition,
339+
clusterv1.MachineUpdatingCondition,
329340
}},
330341
)
331342

@@ -397,6 +408,12 @@ type scope struct {
397408

398409
// deletingMessage is the message that should be used when setting the Deleting condition.
399410
deletingMessage string
411+
412+
// updatingReason is the reason that should be used when setting the Updating condition.
413+
updatingReason string
414+
415+
// updatingMessage is the message that should be used when setting the Updating condition.
416+
updatingMessage string
400417
}
401418

402419
func (r *Reconciler) reconcileMachineOwnerAndLabels(_ context.Context, s *scope) (ctrl.Result, error) {
Lines changed: 240 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,240 @@
1+
/*
2+
Copyright 2025 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package machine
18+
19+
import (
20+
"context"
21+
"fmt"
22+
"time"
23+
24+
"github.com/pkg/errors"
25+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
26+
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
27+
"k8s.io/apimachinery/pkg/runtime"
28+
"k8s.io/klog/v2"
29+
ctrl "sigs.k8s.io/controller-runtime"
30+
"sigs.k8s.io/controller-runtime/pkg/client"
31+
"sigs.k8s.io/controller-runtime/pkg/client/apiutil"
32+
33+
clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2"
34+
runtimehooksv1 "sigs.k8s.io/cluster-api/api/runtime/hooks/v1alpha1"
35+
"sigs.k8s.io/cluster-api/feature"
36+
"sigs.k8s.io/cluster-api/internal/hooks"
37+
)
38+
39+
// reconcileInPlaceUpdate handles the in-place update workflow for a Machine.
40+
func (r *Reconciler) reconcileInPlaceUpdate(ctx context.Context, s *scope) (ctrl.Result, error) {
41+
if !feature.Gates.Enabled(feature.InPlaceUpdates) {
42+
return ctrl.Result{}, nil
43+
}
44+
45+
log := ctrl.LoggerFrom(ctx)
46+
47+
machineAnnotations := s.machine.GetAnnotations()
48+
_, inPlaceUpdateInProgress := machineAnnotations[clusterv1.InPlaceUpdateInProgressAnnotation]
49+
hasUpdateMachinePending := hooks.IsPending(runtimehooksv1.UpdateMachine, s.machine)
50+
51+
if !inPlaceUpdateInProgress {
52+
// Clean up any orphaned pending hooks and annotations before exiting.
53+
// This can happen if the in-place update annotation was removed from Machine
54+
// but the UpdateMachine hook is still pending or annotations are still on InfraMachine/BootstrapConfig.
55+
if hasUpdateMachinePending {
56+
log.Info("In-place update annotation removed but UpdateMachine hook still pending, cleaning up orphaned hook and annotations")
57+
if err := r.completeInPlaceUpdate(ctx, s); err != nil {
58+
return ctrl.Result{}, errors.Wrap(err, "failed to clean up orphaned UpdateMachine hook and annotations")
59+
}
60+
}
61+
62+
return ctrl.Result{}, nil
63+
}
64+
65+
// If hook is not pending, we're waiting for the owner controller to mark it as pending.
66+
if !hasUpdateMachinePending {
67+
log.Info("In-place update annotations are set, waiting for UpdateMachine hook to be marked as pending")
68+
return ctrl.Result{}, nil
69+
}
70+
71+
if s.infraMachine == nil {
72+
s.updatingReason = clusterv1.MachineUpdateFailedReason
73+
s.updatingMessage = "In-place update not possible: InfraMachine not found"
74+
return ctrl.Result{}, errors.New("in-place update failed: InfraMachine not found")
75+
}
76+
77+
infraReady := r.isInfraMachineReadyForUpdate(s)
78+
bootstrapReady := r.isBootstrapConfigReadyForUpdate(s)
79+
80+
if !infraReady || !bootstrapReady {
81+
log.Info("Waiting for InfraMachine and BootstrapConfig to be marked for in-place update")
82+
return ctrl.Result{}, nil
83+
}
84+
85+
if hasUpdateMachinePending {
86+
log.Info("UpdateMachine hook is pending, calling runtime hook")
87+
result, err := r.callUpdateMachineHook(ctx, s)
88+
if err != nil {
89+
s.updatingReason = clusterv1.MachineUpdateFailedReason
90+
s.updatingMessage = fmt.Sprintf("UpdateMachine hook failed: %v", err)
91+
return ctrl.Result{}, err
92+
}
93+
94+
if result.RequeueAfter > 0 {
95+
s.updatingReason = clusterv1.MachineWaitingForUpdateMachineHookReason
96+
s.updatingMessage = "UpdateMachine hook in progress"
97+
return result, nil
98+
}
99+
100+
log.Info("In-place update completed successfully")
101+
if err := r.completeInPlaceUpdate(ctx, s); err != nil {
102+
return ctrl.Result{}, errors.Wrap(err, "failed to complete in-place update")
103+
}
104+
105+
return ctrl.Result{}, nil
106+
}
107+
108+
return ctrl.Result{}, nil
109+
}
110+
111+
// isInfraMachineReadyForUpdate checks if the InfraMachine has the in-place update annotation.
112+
func (r *Reconciler) isInfraMachineReadyForUpdate(s *scope) bool {
113+
_, hasAnnotation := s.infraMachine.GetAnnotations()[clusterv1.InPlaceUpdateInProgressAnnotation]
114+
return hasAnnotation
115+
}
116+
117+
// isBootstrapConfigReadyForUpdate checks if the BootstrapConfig has the in-place update annotation.
118+
func (r *Reconciler) isBootstrapConfigReadyForUpdate(s *scope) bool {
119+
if s.bootstrapConfig == nil {
120+
return true
121+
}
122+
_, hasAnnotation := s.bootstrapConfig.GetAnnotations()[clusterv1.InPlaceUpdateInProgressAnnotation]
123+
return hasAnnotation
124+
}
125+
126+
// callUpdateMachineHook calls the UpdateMachine runtime hook for the machine.
127+
func (r *Reconciler) callUpdateMachineHook(ctx context.Context, s *scope) (ctrl.Result, error) {
128+
log := ctrl.LoggerFrom(ctx)
129+
130+
request := &runtimehooksv1.UpdateMachineRequest{
131+
Desired: runtimehooksv1.UpdateMachineRequestObjects{
132+
Machine: *cleanupMachine(s.machine),
133+
InfrastructureMachine: runtime.RawExtension{Object: cleanupUnstructured(s.infraMachine)},
134+
},
135+
}
136+
137+
if s.bootstrapConfig != nil {
138+
request.Desired.BootstrapConfig = runtime.RawExtension{Object: cleanupUnstructured(s.bootstrapConfig)}
139+
}
140+
141+
response := &runtimehooksv1.UpdateMachineResponse{}
142+
143+
if err := r.RuntimeClient.CallAllExtensions(ctx, runtimehooksv1.UpdateMachine, s.machine, request, response); err != nil {
144+
return ctrl.Result{}, errors.Wrap(err, "failed to call UpdateMachine hook")
145+
}
146+
147+
if response.GetRetryAfterSeconds() != 0 {
148+
log.Info(fmt.Sprintf("UpdateMachine hook requested retry after %d seconds", response.GetRetryAfterSeconds()))
149+
return ctrl.Result{RequeueAfter: time.Duration(response.GetRetryAfterSeconds()) * time.Second}, nil
150+
}
151+
152+
log.Info("UpdateMachine hook completed successfully")
153+
return ctrl.Result{}, nil
154+
}
155+
156+
// completeInPlaceUpdate removes in-place update annotations from InfraMachine, BootstrapConfig, Machine,
157+
// and then marks the UpdateMachine hook as done (removes it from pending-hooks annotation).
158+
func (r *Reconciler) completeInPlaceUpdate(ctx context.Context, s *scope) error {
159+
log := ctrl.LoggerFrom(ctx)
160+
161+
if s.infraMachine == nil {
162+
return errors.New("InfraMachine must exist to complete in-place update")
163+
}
164+
165+
if err := r.removeInPlaceUpdateAnnotation(ctx, s.infraMachine); err != nil {
166+
return errors.Wrap(err, "failed to remove in-place update annotation from InfraMachine")
167+
}
168+
169+
if s.bootstrapConfig != nil {
170+
if err := r.removeInPlaceUpdateAnnotation(ctx, s.bootstrapConfig); err != nil {
171+
return errors.Wrap(err, "failed to remove in-place update annotation from BootstrapConfig")
172+
}
173+
}
174+
175+
if err := r.removeInPlaceUpdateAnnotation(ctx, s.machine); err != nil {
176+
return errors.Wrap(err, "failed to remove in-place update annotation from Machine")
177+
}
178+
179+
if err := hooks.MarkAsDone(ctx, r.Client, s.machine, runtimehooksv1.UpdateMachine); err != nil {
180+
return errors.Wrap(err, "failed to mark UpdateMachine hook as done")
181+
}
182+
183+
log.Info("Removed in-place update annotations and marked hook as done")
184+
return nil
185+
}
186+
187+
// removeInPlaceUpdateAnnotation removes the in-place update annotation from an object and patches it immediately.
188+
func (r *Reconciler) removeInPlaceUpdateAnnotation(ctx context.Context, obj client.Object) error {
189+
annotations := obj.GetAnnotations()
190+
if _, exists := annotations[clusterv1.InPlaceUpdateInProgressAnnotation]; !exists {
191+
return nil
192+
}
193+
194+
gvk, err := apiutil.GVKForObject(obj, r.Client.Scheme())
195+
if err != nil {
196+
return errors.Wrapf(err, "failed to remove %s annotation from object %s", clusterv1.InPlaceUpdateInProgressAnnotation, klog.KObj(obj))
197+
}
198+
199+
orig := obj.DeepCopyObject().(client.Object)
200+
delete(annotations, clusterv1.InPlaceUpdateInProgressAnnotation)
201+
obj.SetAnnotations(annotations)
202+
203+
if err := r.Client.Patch(ctx, obj, client.MergeFrom(orig)); err != nil {
204+
return errors.Wrapf(err, "failed to remove %s annotation from %s %s", clusterv1.InPlaceUpdateInProgressAnnotation, gvk.Kind, klog.KObj(obj))
205+
}
206+
207+
return nil
208+
}
209+
210+
func cleanupMachine(machine *clusterv1.Machine) *clusterv1.Machine {
211+
return &clusterv1.Machine{
212+
// Set GVK because object is later marshalled with json.Marshal when the hook request is sent.
213+
TypeMeta: metav1.TypeMeta{
214+
APIVersion: clusterv1.GroupVersion.String(),
215+
Kind: "Machine",
216+
},
217+
ObjectMeta: metav1.ObjectMeta{
218+
Name: machine.Name,
219+
Namespace: machine.Namespace,
220+
Labels: machine.Labels,
221+
Annotations: machine.Annotations,
222+
},
223+
Spec: *machine.Spec.DeepCopy(),
224+
}
225+
}
226+
227+
func cleanupUnstructured(u *unstructured.Unstructured) *unstructured.Unstructured {
228+
cleanedUpU := &unstructured.Unstructured{
229+
Object: map[string]interface{}{
230+
"apiVersion": u.GetAPIVersion(),
231+
"kind": u.GetKind(),
232+
"spec": u.Object["spec"],
233+
},
234+
}
235+
cleanedUpU.SetName(u.GetName())
236+
cleanedUpU.SetNamespace(u.GetNamespace())
237+
cleanedUpU.SetLabels(u.GetLabels())
238+
cleanedUpU.SetAnnotations(u.GetAnnotations())
239+
return cleanedUpU
240+
}

main.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -672,6 +672,7 @@ func setupReconcilers(ctx context.Context, mgr ctrl.Manager, watchNamespaces map
672672
Client: mgr.GetClient(),
673673
APIReader: mgr.GetAPIReader(),
674674
ClusterCache: clusterCache,
675+
RuntimeClient: runtimeClient,
675676
WatchFilterValue: watchFilterValue,
676677
RemoteConditionsGracePeriod: remoteConditionsGracePeriod,
677678
AdditionalSyncMachineLabels: additionalSyncMachineLabelRegexes,

0 commit comments

Comments
 (0)