From 77e077f315e1b6f56ac9b6d4f9c3aa88a57c05b6 Mon Sep 17 00:00:00 2001 From: ontave Date: Wed, 6 May 2026 20:54:01 +0200 Subject: [PATCH] feat: machineconfig-restore CRD + reconciler + etcd/mcb schedule reconcilers (session/25) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds TalosMachineConfigRestore CRD and reconciler mirroring the backup pattern: gates on RunnerConfig/capability, submits Conductor Job, polls OperationResult. Adds TalosMachineConfigBackupSchedule and TalosEtcdBackupSchedule CRDs with interval-based schedule reconcilers that create backup CRs on Go duration intervals using RequeueAfter. Registers all three reconcilers in main. platform-schema.md §11. --- api/v1alpha1/etcdbackupschedule_types.go | 99 +++++ .../machineconfigbackupschedule_types.go | 99 +++++ api/v1alpha1/machineconfigrestore_types.go | 134 +++++++ api/v1alpha1/zz_generated.deepcopy.go | 339 ++++++++++++++++++ cmd/platform/main.go | 26 ++ .../etcdbackupschedule_reconciler.go | 143 ++++++++ .../machineconfigbackupschedule_reconciler.go | 136 +++++++ .../machineconfigrestore_reconciler.go | 276 ++++++++++++++ 8 files changed, 1252 insertions(+) create mode 100644 api/v1alpha1/etcdbackupschedule_types.go create mode 100644 api/v1alpha1/machineconfigbackupschedule_types.go create mode 100644 api/v1alpha1/machineconfigrestore_types.go create mode 100644 internal/controller/etcdbackupschedule_reconciler.go create mode 100644 internal/controller/machineconfigbackupschedule_reconciler.go create mode 100644 internal/controller/machineconfigrestore_reconciler.go diff --git a/api/v1alpha1/etcdbackupschedule_types.go b/api/v1alpha1/etcdbackupschedule_types.go new file mode 100644 index 0000000..a3ebdb0 --- /dev/null +++ b/api/v1alpha1/etcdbackupschedule_types.go @@ -0,0 +1,99 @@ +package v1alpha1 + +import ( + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// Condition type and reason constants for 
TalosEtcdBackupSchedule. +const ( + // ConditionTypeEtcdBackupScheduleActive indicates the schedule is active. + ConditionTypeEtcdBackupScheduleActive = "Active" + + // ReasonEtcdBackupScheduleNextRunPending is set while waiting for the next run. + ReasonEtcdBackupScheduleNextRunPending = "NextRunPending" + + // ReasonEtcdBackupScheduleRunning is set while an EtcdMaintenance CR is being created. + ReasonEtcdBackupScheduleRunning = "Running" + + // ReasonEtcdBackupScheduleParseError is set when the schedule duration cannot be parsed. + ReasonEtcdBackupScheduleParseError = "ParseError" +) + +// TalosEtcdBackupScheduleSpec defines the desired state of TalosEtcdBackupSchedule. +type TalosEtcdBackupScheduleSpec struct { + // ClusterRef references the TalosCluster to back up on schedule. + ClusterRef LocalObjectRef `json:"clusterRef"` + + // Schedule is the backup interval as a Go duration string (e.g., "24h", "6h"). + // The reconciler creates a new EtcdMaintenance CR with operation=backup each time + // the interval elapses. + Schedule string `json:"schedule"` + + // S3Destination is the S3 location to write etcd snapshots to. + S3Destination S3Ref `json:"s3Destination"` + + // EtcdBackupS3SecretRef references a Secret containing S3 backup credentials. + // Falls back to seam-etcd-backup-config in seam-system when absent. + // platform-schema.md §10. + // +optional + EtcdBackupS3SecretRef *corev1.SecretReference `json:"etcdBackupS3SecretRef,omitempty"` +} + +// TalosEtcdBackupScheduleStatus defines the observed state of TalosEtcdBackupSchedule. +type TalosEtcdBackupScheduleStatus struct { + // ObservedGeneration is the generation of the spec last reconciled. + // +optional + ObservedGeneration int64 `json:"observedGeneration,omitempty"` + + // NextRunAt is the time the next EtcdMaintenance CR will be created. + // +optional + NextRunAt *metav1.Time `json:"nextRunAt,omitempty"` + + // LastRunAt is the time the most recent EtcdMaintenance CR was created. 
+ // +optional + LastRunAt *metav1.Time `json:"lastRunAt,omitempty"` + + // LastBackupName is the name of the most recently created EtcdMaintenance CR. + // +optional + LastBackupName string `json:"lastBackupName,omitempty"` + + // Conditions is the list of status conditions. + // +optional + // +listType=map + // +listMapKey=type + Conditions []metav1.Condition `json:"conditions,omitempty"` +} + +// TalosEtcdBackupSchedule creates EtcdMaintenance CRs with operation=backup on a +// repeating interval. The schedule field accepts Go duration strings (e.g. "24h"). +// platform-schema.md §10. +// +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status +// +kubebuilder:resource:scope=Namespaced,shortName=etcdbs +// +kubebuilder:printcolumn:name="Cluster",type=string,JSONPath=".spec.clusterRef.name" +// +kubebuilder:printcolumn:name="Schedule",type=string,JSONPath=".spec.schedule" +// +kubebuilder:printcolumn:name="NextRun",type=date,JSONPath=".status.nextRunAt" +// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=".metadata.creationTimestamp" +type TalosEtcdBackupSchedule struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec TalosEtcdBackupScheduleSpec `json:"spec,omitempty"` + Status TalosEtcdBackupScheduleStatus `json:"status,omitempty"` +} + +// TalosEtcdBackupScheduleList is the list type. 
+// +// +kubebuilder:object:root=true +type TalosEtcdBackupScheduleList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + + Items []TalosEtcdBackupSchedule `json:"items"` +} + +func init() { + SchemeBuilder.Register(&TalosEtcdBackupSchedule{}, &TalosEtcdBackupScheduleList{}) +} diff --git a/api/v1alpha1/machineconfigbackupschedule_types.go b/api/v1alpha1/machineconfigbackupschedule_types.go new file mode 100644 index 0000000..5d09e23 --- /dev/null +++ b/api/v1alpha1/machineconfigbackupschedule_types.go @@ -0,0 +1,99 @@ +package v1alpha1 + +import ( + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// Condition type and reason constants for TalosMachineConfigBackupSchedule. +const ( + // ConditionTypeMCBScheduleActive indicates the schedule is active and will create backups. + ConditionTypeMCBScheduleActive = "Active" + + // ReasonMCBScheduleNextRunPending is set while waiting for the next scheduled run. + ReasonMCBScheduleNextRunPending = "NextRunPending" + + // ReasonMCBScheduleRunning is set while a backup CR is being created. + ReasonMCBScheduleRunning = "Running" + + // ReasonMCBScheduleParseError is set when the schedule duration cannot be parsed. + ReasonMCBScheduleParseError = "ParseError" +) + +// TalosMachineConfigBackupScheduleSpec defines the desired state of TalosMachineConfigBackupSchedule. +type TalosMachineConfigBackupScheduleSpec struct { + // ClusterRef references the TalosCluster to back up on schedule. + ClusterRef LocalObjectRef `json:"clusterRef"` + + // Schedule is the backup interval as a Go duration string (e.g., "24h", "6h", "1h"). + // The reconciler creates a new TalosMachineConfigBackup CR each time the interval elapses. + Schedule string `json:"schedule"` + + // S3Destination is the S3 location to write node machine configs to. + // The bucket is required. 
+ S3Destination S3Ref `json:"s3Destination"` + + // S3BackupSecretRef references a Secret containing S3 backup credentials. + // Falls back to seam-etcd-backup-config in seam-system when absent. + // platform-schema.md §10. + // +optional + S3BackupSecretRef *corev1.SecretReference `json:"s3BackupSecretRef,omitempty"` +} + +// TalosMachineConfigBackupScheduleStatus defines the observed state of TalosMachineConfigBackupSchedule. +type TalosMachineConfigBackupScheduleStatus struct { + // ObservedGeneration is the generation of the spec last reconciled. + // +optional + ObservedGeneration int64 `json:"observedGeneration,omitempty"` + + // NextRunAt is the time the next backup CR will be created. + // +optional + NextRunAt *metav1.Time `json:"nextRunAt,omitempty"` + + // LastRunAt is the time the most recent backup CR was created. + // +optional + LastRunAt *metav1.Time `json:"lastRunAt,omitempty"` + + // LastBackupName is the name of the most recently created TalosMachineConfigBackup CR. + // +optional + LastBackupName string `json:"lastBackupName,omitempty"` + + // Conditions is the list of status conditions. + // +optional + // +listType=map + // +listMapKey=type + Conditions []metav1.Condition `json:"conditions,omitempty"` +} + +// TalosMachineConfigBackupSchedule creates TalosMachineConfigBackup CRs on a repeating +// interval. The schedule field accepts Go duration strings (e.g. "24h"). +// platform-schema.md §11. 
+// +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status +// +kubebuilder:resource:scope=Namespaced,shortName=mcbs +// +kubebuilder:printcolumn:name="Cluster",type=string,JSONPath=".spec.clusterRef.name" +// +kubebuilder:printcolumn:name="Schedule",type=string,JSONPath=".spec.schedule" +// +kubebuilder:printcolumn:name="NextRun",type=date,JSONPath=".status.nextRunAt" +// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=".metadata.creationTimestamp" +type TalosMachineConfigBackupSchedule struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec TalosMachineConfigBackupScheduleSpec `json:"spec,omitempty"` + Status TalosMachineConfigBackupScheduleStatus `json:"status,omitempty"` +} + +// TalosMachineConfigBackupScheduleList is the list type. +// +// +kubebuilder:object:root=true +type TalosMachineConfigBackupScheduleList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + + Items []TalosMachineConfigBackupSchedule `json:"items"` +} + +func init() { + SchemeBuilder.Register(&TalosMachineConfigBackupSchedule{}, &TalosMachineConfigBackupScheduleList{}) +} diff --git a/api/v1alpha1/machineconfigrestore_types.go b/api/v1alpha1/machineconfigrestore_types.go new file mode 100644 index 0000000..7586045 --- /dev/null +++ b/api/v1alpha1/machineconfigrestore_types.go @@ -0,0 +1,134 @@ +package v1alpha1 + +import ( + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/ontai-dev/seam-core/pkg/lineage" +) + +// Condition type and reason constants for TalosMachineConfigRestore. +const ( + // ConditionTypeMachineConfigRestoreReady indicates the restore completed successfully. + ConditionTypeMachineConfigRestoreReady = "Ready" + + // ConditionTypeMachineConfigRestoreRunning indicates the Conductor Job is running. 
+ ConditionTypeMachineConfigRestoreRunning = "Running" + + // ConditionTypeMachineConfigRestoreDegraded indicates the restore failed. + ConditionTypeMachineConfigRestoreDegraded = "Degraded" + + // ReasonMachineConfigRestoreJobSubmitted is set when the Conductor executor Job is submitted. + ReasonMachineConfigRestoreJobSubmitted = "JobSubmitted" + + // ReasonMachineConfigRestoreJobComplete is set when the Job completed successfully. + ReasonMachineConfigRestoreJobComplete = "JobComplete" + + // ReasonMachineConfigRestoreJobFailed is set when the Job failed. INV-018 applies. + ReasonMachineConfigRestoreJobFailed = "JobFailed" + + // ReasonMachineConfigRestoreS3Absent indicates no S3 source is configured. + ReasonMachineConfigRestoreS3Absent = "S3SourceAbsent" + + // ConditionTypeMachineConfigRestoreS3Absent is the condition type for absent S3 config. + ConditionTypeMachineConfigRestoreS3Absent = "S3SourceAbsent" +) + +// TalosMachineConfigRestoreSpec defines the desired state of TalosMachineConfigRestore. +type TalosMachineConfigRestoreSpec struct { + // ClusterRef references the TalosCluster whose nodes will have their machine + // config restored. + ClusterRef LocalObjectRef `json:"clusterRef"` + + // BackupTimestamp identifies which backup to restore from. Must match the + // timestamp component of the S3 path written by a prior machineconfig-backup + // operation: {cluster}/machineconfigs/{backupTimestamp}/{hostname}.yaml. + // Format: 20060102T150405Z (UTC). + BackupTimestamp string `json:"backupTimestamp"` + + // TargetNodes is the optional list of node hostnames to restore. When empty + // all nodes in the cluster are restored. When set only the listed hostnames + // are restored. + // +optional + TargetNodes []string `json:"targetNodes,omitempty"` + + // S3SourceBucket is the S3 bucket containing the backup objects. Must match + // the bucket used during the original machineconfig-backup operation. 
+ S3SourceBucket string `json:"s3SourceBucket"` + + // S3BackupSecretRef references a Secret containing S3 credentials. + // Falls back to seam-etcd-backup-config in seam-system when absent. + // platform-schema.md §10. + // +optional + S3BackupSecretRef *corev1.SecretReference `json:"s3BackupSecretRef,omitempty"` + + // Lineage is the sealed causal chain record for this root declaration. + // +optional + Lineage *lineage.SealedCausalChain `json:"lineage,omitempty"` +} + +// TalosMachineConfigRestoreStatus defines the observed state of TalosMachineConfigRestore. +type TalosMachineConfigRestoreStatus struct { + // ObservedGeneration is the generation of the spec last reconciled. + // +optional + ObservedGeneration int64 `json:"observedGeneration,omitempty"` + + // Phase is the current phase of the restore operation. + // One of: Pending, Running, Succeeded, Failed, PartiallyFailed. + // +optional + Phase string `json:"phase,omitempty"` + + // JobName is the name of the most recently submitted Conductor executor Job. + // +optional + JobName string `json:"jobName,omitempty"` + + // OperationResult is the message from the Conductor OperationResult ConfigMap. + // +optional + OperationResult string `json:"operationResult,omitempty"` + + // RestoredNodes is the list of node hostnames successfully restored. + // +optional + RestoredNodes []string `json:"restoredNodes,omitempty"` + + // Conditions is the list of status conditions for this TalosMachineConfigRestore. + // Condition types: Ready, Running, Degraded, S3SourceAbsent, LineageSynced. + // +optional + // +listType=map + // +listMapKey=type + Conditions []metav1.Condition `json:"conditions,omitempty"` +} + +// TalosMachineConfigRestore triggers a machine config restore for target nodes of a +// cluster. The Conductor executor downloads each node's config from S3 at +// {cluster}/machineconfigs/{backupTimestamp}/{hostname}.yaml and applies it via +// ApplyConfiguration. 
Named Conductor capability: machineconfig-restore. +// platform-schema.md §11. +// +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status +// +kubebuilder:resource:scope=Namespaced,shortName=mcr +// +kubebuilder:printcolumn:name="Cluster",type=string,JSONPath=".spec.clusterRef.name" +// +kubebuilder:printcolumn:name="Timestamp",type=string,JSONPath=".spec.backupTimestamp" +// +kubebuilder:printcolumn:name="Phase",type=string,JSONPath=".status.phase" +// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=".metadata.creationTimestamp" +type TalosMachineConfigRestore struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec TalosMachineConfigRestoreSpec `json:"spec,omitempty"` + Status TalosMachineConfigRestoreStatus `json:"status,omitempty"` +} + +// TalosMachineConfigRestoreList is the list type for TalosMachineConfigRestore. +// +// +kubebuilder:object:root=true +type TalosMachineConfigRestoreList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + + Items []TalosMachineConfigRestore `json:"items"` +} + +func init() { + SchemeBuilder.Register(&TalosMachineConfigRestore{}, &TalosMachineConfigRestoreList{}) +} diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 0f2b41c..68010a1 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -1151,3 +1151,342 @@ func (in *TalosMachineConfigBackupStatus) DeepCopy() *TalosMachineConfigBackupSt in.DeepCopyInto(out) return out } + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *TalosMachineConfigRestore) DeepCopyInto(out *TalosMachineConfigRestore) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TalosMachineConfigRestore. +func (in *TalosMachineConfigRestore) DeepCopy() *TalosMachineConfigRestore { + if in == nil { + return nil + } + out := new(TalosMachineConfigRestore) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *TalosMachineConfigRestore) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TalosMachineConfigRestoreSpec) DeepCopyInto(out *TalosMachineConfigRestoreSpec) { + *out = *in + out.ClusterRef = in.ClusterRef + if in.TargetNodes != nil { + in, out := &in.TargetNodes, &out.TargetNodes + *out = make([]string, len(*in)) + copy(*out, *in) + } + if in.S3BackupSecretRef != nil { + in, out := &in.S3BackupSecretRef, &out.S3BackupSecretRef + *out = new(corev1.SecretReference) + **out = **in + } + if in.Lineage != nil { + in, out := &in.Lineage, &out.Lineage + *out = new(lineage.SealedCausalChain) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TalosMachineConfigRestoreSpec. +func (in *TalosMachineConfigRestoreSpec) DeepCopy() *TalosMachineConfigRestoreSpec { + if in == nil { + return nil + } + out := new(TalosMachineConfigRestoreSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *TalosMachineConfigRestoreStatus) DeepCopyInto(out *TalosMachineConfigRestoreStatus) { + *out = *in + if in.RestoredNodes != nil { + in, out := &in.RestoredNodes, &out.RestoredNodes + *out = make([]string, len(*in)) + copy(*out, *in) + } + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]v1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TalosMachineConfigRestoreStatus. +func (in *TalosMachineConfigRestoreStatus) DeepCopy() *TalosMachineConfigRestoreStatus { + if in == nil { + return nil + } + out := new(TalosMachineConfigRestoreStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TalosMachineConfigRestoreList) DeepCopyInto(out *TalosMachineConfigRestoreList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]TalosMachineConfigRestore, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TalosMachineConfigRestoreList. +func (in *TalosMachineConfigRestoreList) DeepCopy() *TalosMachineConfigRestoreList { + if in == nil { + return nil + } + out := new(TalosMachineConfigRestoreList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *TalosMachineConfigRestoreList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *TalosMachineConfigBackupSchedule) DeepCopyInto(out *TalosMachineConfigBackupSchedule) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TalosMachineConfigBackupSchedule. +func (in *TalosMachineConfigBackupSchedule) DeepCopy() *TalosMachineConfigBackupSchedule { + if in == nil { + return nil + } + out := new(TalosMachineConfigBackupSchedule) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *TalosMachineConfigBackupSchedule) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TalosMachineConfigBackupScheduleSpec) DeepCopyInto(out *TalosMachineConfigBackupScheduleSpec) { + *out = *in + out.ClusterRef = in.ClusterRef + out.S3Destination = in.S3Destination + if in.S3BackupSecretRef != nil { + in, out := &in.S3BackupSecretRef, &out.S3BackupSecretRef + *out = new(corev1.SecretReference) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TalosMachineConfigBackupScheduleSpec. +func (in *TalosMachineConfigBackupScheduleSpec) DeepCopy() *TalosMachineConfigBackupScheduleSpec { + if in == nil { + return nil + } + out := new(TalosMachineConfigBackupScheduleSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *TalosMachineConfigBackupScheduleStatus) DeepCopyInto(out *TalosMachineConfigBackupScheduleStatus) { + *out = *in + if in.NextRunAt != nil { + in, out := &in.NextRunAt, &out.NextRunAt + *out = (*in).DeepCopy() + } + if in.LastRunAt != nil { + in, out := &in.LastRunAt, &out.LastRunAt + *out = (*in).DeepCopy() + } + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]v1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TalosMachineConfigBackupScheduleStatus. +func (in *TalosMachineConfigBackupScheduleStatus) DeepCopy() *TalosMachineConfigBackupScheduleStatus { + if in == nil { + return nil + } + out := new(TalosMachineConfigBackupScheduleStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TalosMachineConfigBackupScheduleList) DeepCopyInto(out *TalosMachineConfigBackupScheduleList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]TalosMachineConfigBackupSchedule, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TalosMachineConfigBackupScheduleList. +func (in *TalosMachineConfigBackupScheduleList) DeepCopy() *TalosMachineConfigBackupScheduleList { + if in == nil { + return nil + } + out := new(TalosMachineConfigBackupScheduleList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. 
+func (in *TalosMachineConfigBackupScheduleList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TalosEtcdBackupSchedule) DeepCopyInto(out *TalosEtcdBackupSchedule) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TalosEtcdBackupSchedule. +func (in *TalosEtcdBackupSchedule) DeepCopy() *TalosEtcdBackupSchedule { + if in == nil { + return nil + } + out := new(TalosEtcdBackupSchedule) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *TalosEtcdBackupSchedule) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TalosEtcdBackupScheduleSpec) DeepCopyInto(out *TalosEtcdBackupScheduleSpec) { + *out = *in + out.ClusterRef = in.ClusterRef + out.S3Destination = in.S3Destination + if in.EtcdBackupS3SecretRef != nil { + in, out := &in.EtcdBackupS3SecretRef, &out.EtcdBackupS3SecretRef + *out = new(corev1.SecretReference) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TalosEtcdBackupScheduleSpec. +func (in *TalosEtcdBackupScheduleSpec) DeepCopy() *TalosEtcdBackupScheduleSpec { + if in == nil { + return nil + } + out := new(TalosEtcdBackupScheduleSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *TalosEtcdBackupScheduleStatus) DeepCopyInto(out *TalosEtcdBackupScheduleStatus) { + *out = *in + if in.NextRunAt != nil { + in, out := &in.NextRunAt, &out.NextRunAt + *out = (*in).DeepCopy() + } + if in.LastRunAt != nil { + in, out := &in.LastRunAt, &out.LastRunAt + *out = (*in).DeepCopy() + } + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]v1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TalosEtcdBackupScheduleStatus. +func (in *TalosEtcdBackupScheduleStatus) DeepCopy() *TalosEtcdBackupScheduleStatus { + if in == nil { + return nil + } + out := new(TalosEtcdBackupScheduleStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TalosEtcdBackupScheduleList) DeepCopyInto(out *TalosEtcdBackupScheduleList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]TalosEtcdBackupSchedule, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TalosEtcdBackupScheduleList. +func (in *TalosEtcdBackupScheduleList) DeepCopy() *TalosEtcdBackupScheduleList { + if in == nil { + return nil + } + out := new(TalosEtcdBackupScheduleList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. 
+func (in *TalosEtcdBackupScheduleList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} diff --git a/cmd/platform/main.go b/cmd/platform/main.go index 86a4291..fbcac81 100644 --- a/cmd/platform/main.go +++ b/cmd/platform/main.go @@ -220,6 +220,32 @@ func main() { os.Exit(1) } + if err := (&controller.MachineConfigRestoreReconciler{ + Client: mgr.GetClient(), + APIReader: mgr.GetAPIReader(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorder("machineconfigrestore-controller"), + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "MachineConfigRestore") + os.Exit(1) + } + + if err := (&controller.MachineConfigBackupScheduleReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "MachineConfigBackupSchedule") + os.Exit(1) + } + + if err := (&controller.EtcdBackupScheduleReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "EtcdBackupSchedule") + os.Exit(1) + } + if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { setupLog.Error(err, "unable to set up health check") os.Exit(1) diff --git a/internal/controller/etcdbackupschedule_reconciler.go b/internal/controller/etcdbackupschedule_reconciler.go new file mode 100644 index 0000000..c1d2617 --- /dev/null +++ b/internal/controller/etcdbackupschedule_reconciler.go @@ -0,0 +1,143 @@ +package controller + +// EtcdBackupScheduleReconciler reconciles TalosEtcdBackupSchedule CRs. +// +// Creates an EtcdMaintenance CR with operation=backup each time the interval +// specified in spec.schedule elapses. The schedule field is a Go duration string. +// Returns RequeueAfter = remaining time until the next run. +// platform-schema.md §10. 
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+
+	platformv1alpha1 "github.com/ontai-dev/platform/api/v1alpha1"
+)
+
+// EtcdBackupScheduleReconciler reconciles TalosEtcdBackupSchedule objects.
+type EtcdBackupScheduleReconciler struct {
+	Client client.Client
+	Scheme *runtime.Scheme
+}
+
+// +kubebuilder:rbac:groups=platform.ontai.dev,resources=talosetcdbackupschedules,verbs=get;list;watch;create;update;patch;delete
+// +kubebuilder:rbac:groups=platform.ontai.dev,resources=talosetcdbackupschedules/status,verbs=get;update;patch
+// +kubebuilder:rbac:groups=platform.ontai.dev,resources=etcdmaintenances,verbs=get;list;watch;create;update;patch;delete
+
+// Reconcile creates an EtcdMaintenance backup CR once spec.schedule has elapsed
+// since status.lastRunAt (immediately when no run is recorded), then requeues for
+// the next run. An AlreadyExists error on create is treated as success.
+func (r *EtcdBackupScheduleReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+	logger := log.FromContext(ctx)
+
+	sched := &platformv1alpha1.TalosEtcdBackupSchedule{}
+	if err := r.Client.Get(ctx, req.NamespacedName, sched); err != nil {
+		if apierrors.IsNotFound(err) {
+			return ctrl.Result{}, nil
+		}
+		return ctrl.Result{}, fmt.Errorf("get TalosEtcdBackupSchedule %s: %w", req.NamespacedName, err)
+	}
+
+	patchBase := client.MergeFrom(sched.DeepCopy())
+	defer func() {
+		if err := r.Client.Status().Patch(ctx, sched, patchBase); err != nil && !apierrors.IsNotFound(err) {
+			logger.Error(err, "failed to patch TalosEtcdBackupSchedule status", "name", sched.Name, "namespace", sched.Namespace)
+		}
+	}()
+
+	sched.Status.ObservedGeneration = sched.Generation
+
+	interval, err := time.ParseDuration(sched.Spec.Schedule)
+	if err != nil || interval <= 0 {
+		// A zero or negative duration parses without error; do not print "<nil>" for it.
+		msg := fmt.Sprintf("spec.schedule %q must be a positive Go duration", sched.Spec.Schedule)
+		if err != nil {
+			msg = fmt.Sprintf("spec.schedule %q is not a valid Go duration: %v", sched.Spec.Schedule, err)
+		}
+		platformv1alpha1.SetCondition(
+			&sched.Status.Conditions,
+			platformv1alpha1.ConditionTypeEtcdBackupScheduleActive,
+			metav1.ConditionFalse,
+			platformv1alpha1.ReasonEtcdBackupScheduleParseError,
+			msg,
+			sched.Generation,
+		)
+		return ctrl.Result{}, nil
+	}
+
+	now := time.Now().UTC()
+
+	// Not due yet. Reaching the deadline exactly counts as due, so RequeueAfter is
+	// always positive here; a zero value would mean "do not requeue" and stall.
+	if last := sched.Status.LastRunAt; last != nil && now.Before(last.Time.Add(interval)) {
+		next := last.Time.Add(interval)
+		nextMeta := metav1.NewTime(next) // refreshed: goes stale when spec.schedule is edited
+		sched.Status.NextRunAt = &nextMeta
+		platformv1alpha1.SetCondition(
+			&sched.Status.Conditions,
+			platformv1alpha1.ConditionTypeEtcdBackupScheduleActive,
+			metav1.ConditionTrue,
+			platformv1alpha1.ReasonEtcdBackupScheduleNextRunPending,
+			fmt.Sprintf("Next etcd backup at %s.", next.Format(time.RFC3339)),
+			sched.Generation,
+		)
+		return ctrl.Result{RequeueAfter: next.Sub(now)}, nil
+	}
+
+	// ts is embedded in the CR name (hence lowercase t/z) and in the S3 object key.
+	ts := now.Format("20060102t150405z")
+	emName := fmt.Sprintf("%s-etcdsched-%s", sched.Spec.ClusterRef.Name, ts)
+
+	// Bucket comes from the schedule spec; the key is derived from cluster and ts.
+	s3Ref := platformv1alpha1.S3Ref{
+		Bucket: sched.Spec.S3Destination.Bucket,
+		Key:    fmt.Sprintf("%s/etcd/%s/snapshot.db", sched.Spec.ClusterRef.Name, ts),
+	}
+
+	em := &platformv1alpha1.EtcdMaintenance{
+		ObjectMeta: metav1.ObjectMeta{Name: emName, Namespace: sched.Namespace},
+		Spec: platformv1alpha1.EtcdMaintenanceSpec{
+			ClusterRef:            sched.Spec.ClusterRef,
+			Operation:             platformv1alpha1.EtcdMaintenanceOperationBackup,
+			S3Destination:         &s3Ref,
+			EtcdBackupS3SecretRef: sched.Spec.EtcdBackupS3SecretRef,
+		},
+	}
+
+	if createErr := r.Client.Create(ctx, em); createErr == nil {
+		logger.Info("created scheduled EtcdMaintenance backup", "schedule", sched.Name, "etcdmaintenance", emName)
+	} else if !apierrors.IsAlreadyExists(createErr) {
+		return ctrl.Result{}, fmt.Errorf("EtcdBackupScheduleReconciler: create EtcdMaintenance CR: %w", createErr)
+	}
+
+	lastRun, nextRun := metav1.NewTime(now), metav1.NewTime(now.Add(interval))
+	sched.Status.LastRunAt = &lastRun
+	sched.Status.NextRunAt = &nextRun
+	sched.Status.LastBackupName = emName
+
+	platformv1alpha1.SetCondition(
+		&sched.Status.Conditions,
+		platformv1alpha1.ConditionTypeEtcdBackupScheduleActive,
+		metav1.ConditionTrue,
+		platformv1alpha1.ReasonEtcdBackupScheduleNextRunPending,
+		fmt.Sprintf("EtcdMaintenance %s created. Next backup at %s.", emName, nextRun.Format(time.RFC3339)),
+		sched.Generation,
+	)
+
+	return ctrl.Result{RequeueAfter: interval}, nil
+}
+
+// SetupWithManager registers EtcdBackupScheduleReconciler with the manager.
+func (r *EtcdBackupScheduleReconciler) SetupWithManager(mgr ctrl.Manager) error {
+	return ctrl.NewControllerManagedBy(mgr).
+		For(&platformv1alpha1.TalosEtcdBackupSchedule{}).
+		Complete(r)
+}
diff --git a/internal/controller/machineconfigbackupschedule_reconciler.go b/internal/controller/machineconfigbackupschedule_reconciler.go
new file mode 100644
index 0000000..9ebc580
--- /dev/null
+++ b/internal/controller/machineconfigbackupschedule_reconciler.go
@@ -0,0 +1,136 @@
+package controller
+
+// MachineConfigBackupScheduleReconciler reconciles TalosMachineConfigBackupSchedule CRs.
+//
+// Creates a TalosMachineConfigBackup CR each time the interval specified in
+// spec.schedule elapses. The schedule field is a Go duration string (e.g. "24h").
+// Returns RequeueAfter = remaining time until the next run.
+// platform-schema.md §11.
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+
+	platformv1alpha1 "github.com/ontai-dev/platform/api/v1alpha1"
+)
+
+// MachineConfigBackupScheduleReconciler reconciles TalosMachineConfigBackupSchedule objects.
+type MachineConfigBackupScheduleReconciler struct {
+	// Client reads the schedule, patches its status and creates backup CRs.
+	Client client.Client
+	// Scheme is not referenced by the methods in this file.
+	Scheme *runtime.Scheme
+}
+
+// +kubebuilder:rbac:groups=platform.ontai.dev,resources=talosmachineconfigbackupschedules,verbs=get;list;watch;create;update;patch;delete
+// +kubebuilder:rbac:groups=platform.ontai.dev,resources=talosmachineconfigbackupschedules/status,verbs=get;update;patch
+// +kubebuilder:rbac:groups=platform.ontai.dev,resources=talosmachineconfigbackups,verbs=get;list;watch;create;update;patch;delete
+
+// Reconcile creates a TalosMachineConfigBackup CR each time spec.schedule elapses.
+//
+// spec.schedule is parsed as a Go duration. A value that does not parse, or that
+// is zero or negative, sets Active=False with reason ParseError and returns
+// without requeueing. Otherwise:
+//   - if status.lastRunAt is set and lastRunAt+interval is still in the future,
+//     Active=True/NextRunPending is set and the request is requeued for the
+//     remaining time;
+//   - else a TalosMachineConfigBackup CR named <cluster>-sched-<timestamp> is
+//     created in the schedule's namespace (AlreadyExists is tolerated),
+//     LastRunAt, NextRunAt and LastBackupName are recorded, and the request is
+//     requeued after one full interval.
+//
+// A missing schedule object is not an error. Status mutations are written by a
+// deferred status patch on every return path after the initial Get.
+func (r *MachineConfigBackupScheduleReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+	logger := log.FromContext(ctx)
+
+	sched := &platformv1alpha1.TalosMachineConfigBackupSchedule{}
+	if err := r.Client.Get(ctx, req.NamespacedName, sched); err != nil {
+		if apierrors.IsNotFound(err) {
+			return ctrl.Result{}, nil
+		}
+		return ctrl.Result{}, fmt.Errorf("get TalosMachineConfigBackupSchedule %s: %w", req.NamespacedName, err)
+	}
+
+	// Snapshot the object before mutating status; the deferred patch sends only
+	// the difference. A patch failure is logged, not returned.
+	patchBase := client.MergeFrom(sched.DeepCopy())
+	defer func() {
+		if err := r.Client.Status().Patch(ctx, sched, patchBase); err != nil {
+			if !apierrors.IsNotFound(err) {
+				logger.Error(err, "failed to patch TalosMachineConfigBackupSchedule status",
+					"name", sched.Name, "namespace", sched.Namespace)
+			}
+		}
+	}()
+
+	sched.Status.ObservedGeneration = sched.Generation
+
+	interval, err := time.ParseDuration(sched.Spec.Schedule)
+	if err != nil || interval <= 0 {
+		// ParseDuration accepts zero and negative durations with a nil error, so
+		// name that cause explicitly instead of formatting a nil error.
+		msg := fmt.Sprintf("spec.schedule %q must be a positive Go duration", sched.Spec.Schedule)
+		if err != nil {
+			msg = fmt.Sprintf("spec.schedule %q is not a valid Go duration: %v", sched.Spec.Schedule, err)
+		}
+		platformv1alpha1.SetCondition(
+			&sched.Status.Conditions,
+			platformv1alpha1.ConditionTypeMCBScheduleActive,
+			metav1.ConditionFalse,
+			platformv1alpha1.ReasonMCBScheduleParseError,
+			msg,
+			sched.Generation,
+		)
+		return ctrl.Result{}, nil
+	}
+
+	now := time.Now().UTC()
+
+	// Determine if a run is due.
+	if sched.Status.LastRunAt != nil {
+		next := sched.Status.LastRunAt.Time.Add(interval)
+		// A run is due as soon as now reaches next. Using Before (rather than
+		// requiring now to be strictly after next) guarantees remaining > 0 in
+		// this branch; a zero RequeueAfter would mean "do not requeue".
+		if now.Before(next) {
+			remaining := next.Sub(now)
+			platformv1alpha1.SetCondition(
+				&sched.Status.Conditions,
+				platformv1alpha1.ConditionTypeMCBScheduleActive,
+				metav1.ConditionTrue,
+				platformv1alpha1.ReasonMCBScheduleNextRunPending,
+				fmt.Sprintf("Next backup at %s.", next.Format(time.RFC3339)),
+				sched.Generation,
+			)
+			return ctrl.Result{RequeueAfter: remaining}, nil
+		}
+	}
+
+	// Create a TalosMachineConfigBackup CR.
+	// The timestamp uses literal lowercase "t"/"z" separators in the layout.
+	ts := now.Format("20060102t150405z")
+	backupName := fmt.Sprintf("%s-sched-%s", sched.Spec.ClusterRef.Name, ts)
+	backup := &platformv1alpha1.TalosMachineConfigBackup{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      backupName,
+			Namespace: sched.Namespace,
+		},
+		Spec: platformv1alpha1.TalosMachineConfigBackupSpec{
+			ClusterRef:        sched.Spec.ClusterRef,
+			S3Destination:     sched.Spec.S3Destination,
+			S3BackupSecretRef: sched.Spec.S3BackupSecretRef,
+		},
+	}
+
+	// AlreadyExists is treated as success: the run is still recorded below.
+	if createErr := r.Client.Create(ctx, backup); createErr != nil {
+		if !apierrors.IsAlreadyExists(createErr) {
+			return ctrl.Result{}, fmt.Errorf("MachineConfigBackupScheduleReconciler: create backup CR: %w", createErr)
+		}
+	} else {
+		logger.Info("created scheduled TalosMachineConfigBackup",
+			"schedule", sched.Name, "backup", backupName)
+	}
+
+	nowMeta := metav1.NewTime(now)
+	sched.Status.LastRunAt = &nowMeta
+	nextRun := metav1.NewTime(now.Add(interval))
+	sched.Status.NextRunAt = &nextRun
+	sched.Status.LastBackupName = backupName
+
+	platformv1alpha1.SetCondition(
+		&sched.Status.Conditions,
+		platformv1alpha1.ConditionTypeMCBScheduleActive,
+		metav1.ConditionTrue,
+		platformv1alpha1.ReasonMCBScheduleNextRunPending,
+		fmt.Sprintf("Backup %s created. Next backup at %s.", backupName, nextRun.Format(time.RFC3339)),
+		sched.Generation,
+	)
+
+	return ctrl.Result{RequeueAfter: interval}, nil
+}
+
+// SetupWithManager registers MachineConfigBackupScheduleReconciler with the manager.
+func (r *MachineConfigBackupScheduleReconciler) SetupWithManager(mgr ctrl.Manager) error {
+	return ctrl.NewControllerManagedBy(mgr).
+		For(&platformv1alpha1.TalosMachineConfigBackupSchedule{}).
+		Complete(r)
+}
diff --git a/internal/controller/machineconfigrestore_reconciler.go b/internal/controller/machineconfigrestore_reconciler.go
new file mode 100644
index 0000000..597fad1
--- /dev/null
+++ b/internal/controller/machineconfigrestore_reconciler.go
@@ -0,0 +1,276 @@
+package controller
+
+// MachineConfigRestoreReconciler reconciles TalosMachineConfigRestore CRs.
+//
+// Pattern mirrors MachineConfigBackupReconciler: read cluster RunnerConfig,
+// gate on capability, project S3 credentials, submit a Conductor executor Job,
+// watch OperationResult ConfigMap for completion. conductor-schema.md §5 §17.
+//
+// Named Conductor capability: machineconfig-restore.
+// platform-schema.md §11 TalosMachineConfigRestore.
+//
+// CP-INV-003: RunnerConfig is generated at runtime, never hand-coded.
+// INV-018: gate failures are permanent -- backoffLimit=0, no retries.
+
+import (
+	"context"
+	"fmt"
+
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	clientevents "k8s.io/client-go/tools/events"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+
+	platformv1alpha1 "github.com/ontai-dev/platform/api/v1alpha1"
+)
+
+const capabilityMachineConfigRestore = "machineconfig-restore"
+
+// MachineConfigRestoreReconciler reconciles TalosMachineConfigRestore objects.
+type MachineConfigRestoreReconciler struct {
+	// Client reads and writes the restore CR, Jobs and related objects.
+	Client client.Client
+	// APIReader is not referenced by the methods in this file.
+	APIReader client.Reader
+	// Scheme is used to set the owner reference on the executor Job and is
+	// passed to the S3 env-secret projection helper.
+	Scheme *runtime.Scheme
+	// Recorder emits Kubernetes events about the restore CR.
+	Recorder clientevents.EventRecorder
+}
+
+// +kubebuilder:rbac:groups=platform.ontai.dev,resources=talosmachineconfigrestores,verbs=get;list;watch;create;update;patch;delete
+// +kubebuilder:rbac:groups=platform.ontai.dev,resources=talosmachineconfigrestores/status,verbs=get;update;patch
+// +kubebuilder:rbac:groups=platform.ontai.dev,resources=talosmachineconfigrestores/finalizers,verbs=update
+// +kubebuilder:rbac:groups="",resources=events,verbs=create;patch
+// +kubebuilder:rbac:groups=infrastructure.ontai.dev,resources=infrastructurerunnerconfigs,verbs=get;list;watch
+// +kubebuilder:rbac:groups=infrastructure.ontai.dev,resources=infrastructuretalosclusteroperationresults,verbs=get;list;watch;create;update;patch;delete
+// +kubebuilder:rbac:groups=batch,resources=jobs,verbs=get;list;watch;create;update;patch;delete
+// +kubebuilder:rbac:groups="",resources=configmaps,verbs=get;list;watch
+// +kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=roles;rolebindings,verbs=get;list;watch;create;update;patch
+// +kubebuilder:rbac:groups="",resources=serviceaccounts,verbs=get;list;watch;create;update;patch
+// +kubebuilder:rbac:groups="",resources=secrets,verbs=get;list;watch;create;update;patch
+
+// Reconcile drives a one-shot TalosMachineConfigRestore through its lifecycle.
+//
+// Steps, in order:
+//  1. Return immediately once the Ready condition is True.
+//  2. Fail (Phase "Failed", no requeue) when spec.backupTimestamp or
+//     spec.s3SourceBucket is empty.
+//  3. Wait (Phase "Pending", requeue after capabilityUnavailableRetryInterval)
+//     until the cluster RunnerConfig exists and lists the
+//     machineconfig-restore capability.
+//  4. If no executor Job exists yet: resolve S3 credentials (Phase "Failed" when
+//     none are found), project them into a per-CR env Secret, create the Job
+//     owned by the CR, set Phase "Running" and requeue after
+//     operationalJobPollInterval.
+//  5. Otherwise read the operation record: on failure set Degraded and Phase
+//     "Failed"; while incomplete keep polling; on completion set Ready and
+//     Phase "Succeeded".
+//
+// A missing CR is not an error. Status mutations are written by a deferred
+// status patch on every return path after the initial Get.
+func (r *MachineConfigRestoreReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+	logger := log.FromContext(ctx)
+
+	mcr := &platformv1alpha1.TalosMachineConfigRestore{}
+	if err := r.Client.Get(ctx, req.NamespacedName, mcr); err != nil {
+		if apierrors.IsNotFound(err) {
+			return ctrl.Result{}, nil
+		}
+		return ctrl.Result{}, fmt.Errorf("get TalosMachineConfigRestore %s: %w", req.NamespacedName, err)
+	}
+
+	// Snapshot the object before mutating status; the deferred patch sends only
+	// the difference. A patch failure is logged, not returned.
+	patchBase := client.MergeFrom(mcr.DeepCopy())
+	defer func() {
+		if err := r.Client.Status().Patch(ctx, mcr, patchBase); err != nil {
+			if !apierrors.IsNotFound(err) {
+				logger.Error(err, "failed to patch TalosMachineConfigRestore status",
+					"name", mcr.Name, "namespace", mcr.Namespace)
+			}
+		}
+	}()
+
+	mcr.Status.ObservedGeneration = mcr.Generation
+
+	// Initialize LineageSynced on first observation.
+	if platformv1alpha1.FindCondition(mcr.Status.Conditions, platformv1alpha1.ConditionTypeLineageSynced) == nil {
+		platformv1alpha1.SetCondition(
+			&mcr.Status.Conditions,
+			platformv1alpha1.ConditionTypeLineageSynced,
+			metav1.ConditionFalse,
+			platformv1alpha1.ReasonLineageControllerAbsent,
+			"InfrastructureLineageController is not yet deployed.",
+			mcr.Generation,
+		)
+	}
+
+	// Already complete -- one-shot CR.
+	readyCond := platformv1alpha1.FindCondition(mcr.Status.Conditions, platformv1alpha1.ConditionTypeMachineConfigRestoreReady)
+	if readyCond != nil && readyCond.Status == metav1.ConditionTrue {
+		mcr.Status.Phase = "Succeeded"
+		return ctrl.Result{}, nil
+	}
+
+	// Gate: backupTimestamp must be non-empty.
+	if mcr.Spec.BackupTimestamp == "" {
+		platformv1alpha1.SetCondition(
+			&mcr.Status.Conditions,
+			platformv1alpha1.ConditionTypeMachineConfigRestoreS3Absent,
+			metav1.ConditionTrue,
+			platformv1alpha1.ReasonMachineConfigRestoreS3Absent,
+			"spec.backupTimestamp is required. platform-schema.md §11.",
+			mcr.Generation,
+		)
+		mcr.Status.Phase = "Failed"
+		return ctrl.Result{}, nil
+	}
+
+	// Gate: s3SourceBucket must be non-empty.
+	if mcr.Spec.S3SourceBucket == "" {
+		platformv1alpha1.SetCondition(
+			&mcr.Status.Conditions,
+			platformv1alpha1.ConditionTypeMachineConfigRestoreS3Absent,
+			metav1.ConditionTrue,
+			platformv1alpha1.ReasonMachineConfigRestoreS3Absent,
+			"spec.s3SourceBucket is required. platform-schema.md §11.",
+			mcr.Generation,
+		)
+		mcr.Status.Phase = "Failed"
+		return ctrl.Result{}, nil
+	}
+
+	// Gate: read the cluster RunnerConfig from ont-system and verify capability.
+	clusterRC, err := getClusterRunnerConfig(ctx, r.Client, mcr.Spec.ClusterRef.Name)
+	if err != nil {
+		return ctrl.Result{}, fmt.Errorf("MachineConfigRestoreReconciler: get cluster RunnerConfig: %w", err)
+	}
+	if clusterRC == nil {
+		platformv1alpha1.SetCondition(
+			&mcr.Status.Conditions,
+			platformv1alpha1.ConditionTypeCapabilityUnavailable,
+			metav1.ConditionTrue,
+			platformv1alpha1.ReasonRunnerConfigNotFound,
+			"Cluster RunnerConfig not yet present in ont-system. Waiting for Conductor agent.",
+			mcr.Generation,
+		)
+		mcr.Status.Phase = "Pending"
+		return ctrl.Result{RequeueAfter: capabilityUnavailableRetryInterval}, nil
+	}
+	if !hasCapability(clusterRC, capabilityMachineConfigRestore) {
+		platformv1alpha1.SetCondition(
+			&mcr.Status.Conditions,
+			platformv1alpha1.ConditionTypeCapabilityUnavailable,
+			metav1.ConditionTrue,
+			platformv1alpha1.ReasonCapabilityNotPublished,
+			fmt.Sprintf("Capability %q not yet published by Conductor agent.", capabilityMachineConfigRestore),
+			mcr.Generation,
+		)
+		mcr.Status.Phase = "Pending"
+		return ctrl.Result{RequeueAfter: capabilityUnavailableRetryInterval}, nil
+	}
+	// Capability is available: clear the CapabilityUnavailable condition.
+	platformv1alpha1.SetCondition(
+		&mcr.Status.Conditions,
+		platformv1alpha1.ConditionTypeCapabilityUnavailable,
+		metav1.ConditionFalse,
+		platformv1alpha1.ReasonCapabilityNotPublished,
+		"",
+		mcr.Generation,
+	)
+
+	jobName := operationalJobName(mcr.Name, capabilityMachineConfigRestore)
+
+	existingJob, err := getOperationalJob(ctx, r.Client, mcr.Namespace, jobName)
+	if err != nil {
+		return ctrl.Result{}, fmt.Errorf("MachineConfigRestoreReconciler: check job: %w", err)
+	}
+
+	if existingJob == nil {
+		s3Name, s3NS, found, sErr := resolveS3BackupSecretRef(ctx, r.Client, mcr.Spec.S3BackupSecretRef)
+		if sErr != nil {
+			return ctrl.Result{}, fmt.Errorf("MachineConfigRestoreReconciler: resolve S3 secret: %w", sErr)
+		}
+		if !found {
+			platformv1alpha1.SetCondition(
+				&mcr.Status.Conditions,
+				platformv1alpha1.ConditionTypeMachineConfigRestoreS3Absent,
+				metav1.ConditionTrue,
+				platformv1alpha1.ReasonMachineConfigRestoreS3Absent,
+				"No S3 credentials configured: spec.s3BackupSecretRef is absent and seam-etcd-backup-config Secret not found in seam-system. platform-schema.md §10.",
+				mcr.Generation,
+			)
+			// Eventf takes (regarding, related, eventtype, reason, action, note, args...).
+			// The action argument must be present, otherwise the note format
+			// string is consumed as the action and the arguments shift by one.
+			r.Recorder.Eventf(mcr, nil, "Warning", "S3CredentialsAbsent", "S3CredentialsAbsent",
+				"TalosMachineConfigRestore %s/%s: no S3 credentials configured", mcr.Namespace, mcr.Name)
+			mcr.Status.Phase = "Failed"
+			return ctrl.Result{}, nil
+		}
+
+		s3EnvSecretName := mcr.Name + s3EnvSecretSuffix
+		if err := ensureS3EnvSecretFor(ctx, r.Client, r.Scheme, s3Name, s3NS, mcr, s3EnvSecretName, mcr.Namespace); err != nil {
+			return ctrl.Result{}, fmt.Errorf("MachineConfigRestoreReconciler: project S3 env secret: %w", err)
+		}
+
+		job := jobSpec(jobName, mcr.Namespace, mcr.Spec.ClusterRef.Name, capabilityMachineConfigRestore, clusterRC.Spec.RunnerImage)
+		appendS3EnvFrom(job, s3EnvSecretName)
+		if err := controllerutil.SetControllerReference(mcr, job, r.Scheme); err != nil {
+			return ctrl.Result{}, fmt.Errorf("MachineConfigRestoreReconciler: set owner reference: %w", err)
+		}
+		if err := r.Client.Create(ctx, job); err != nil {
+			return ctrl.Result{}, fmt.Errorf("MachineConfigRestoreReconciler: create job: %w", err)
+		}
+		mcr.Status.JobName = jobName
+		mcr.Status.Phase = "Running"
+		platformv1alpha1.SetCondition(
+			&mcr.Status.Conditions,
+			platformv1alpha1.ConditionTypeMachineConfigRestoreRunning,
+			metav1.ConditionTrue,
+			platformv1alpha1.ReasonMachineConfigRestoreJobSubmitted,
+			fmt.Sprintf("Conductor executor Job %s submitted for %s.", jobName, capabilityMachineConfigRestore),
+			mcr.Generation,
+		)
+		r.Recorder.Eventf(mcr, nil, "Normal", "JobSubmitted", "JobSubmitted",
+			"Submitted Conductor executor Job %s for machineconfig-restore", jobName)
+		logger.Info("submitted Conductor executor Job",
+			"name", mcr.Name, "jobName", jobName, "capability", capabilityMachineConfigRestore)
+		return ctrl.Result{RequeueAfter: operationalJobPollInterval}, nil
+	}
+
+	// A Job already exists: poll its operation record for the outcome.
+	complete, failed, result := readOperationRecord(ctx, r.Client, mcr.Spec.ClusterRef.Name, jobName)
+	if failed {
+		mcr.Status.OperationResult = result
+		mcr.Status.Phase = "Failed"
+		platformv1alpha1.SetCondition(
+			&mcr.Status.Conditions,
+			platformv1alpha1.ConditionTypeMachineConfigRestoreDegraded,
+			metav1.ConditionTrue,
+			platformv1alpha1.ReasonMachineConfigRestoreJobFailed,
+			fmt.Sprintf("Conductor executor Job %s failed: %s", jobName, result),
+			mcr.Generation,
+		)
+		platformv1alpha1.SetCondition(
+			&mcr.Status.Conditions,
+			platformv1alpha1.ConditionTypeMachineConfigRestoreRunning,
+			metav1.ConditionFalse,
+			platformv1alpha1.ReasonMachineConfigRestoreJobFailed,
+			"Job failed.",
+			mcr.Generation,
+		)
+		r.Recorder.Eventf(mcr, nil, "Warning", "JobFailed", "JobFailed",
+			"Conductor executor Job %s failed: %s", jobName, result)
+		return ctrl.Result{}, nil
+	}
+	if !complete {
+		return ctrl.Result{RequeueAfter: operationalJobPollInterval}, nil
+	}
+
+	mcr.Status.OperationResult = result
+	mcr.Status.Phase = "Succeeded"
+	platformv1alpha1.SetCondition(
+		&mcr.Status.Conditions,
+		platformv1alpha1.ConditionTypeMachineConfigRestoreRunning,
+		metav1.ConditionFalse,
+		platformv1alpha1.ReasonMachineConfigRestoreJobComplete,
+		"Job completed.",
+		mcr.Generation,
+	)
+	platformv1alpha1.SetCondition(
+		&mcr.Status.Conditions,
+		platformv1alpha1.ConditionTypeMachineConfigRestoreReady,
+		metav1.ConditionTrue,
+		platformv1alpha1.ReasonMachineConfigRestoreJobComplete,
+		fmt.Sprintf("Conductor executor Job %s completed successfully.", jobName),
+		mcr.Generation,
+	)
+	r.Recorder.Eventf(mcr, nil, "Normal", "JobComplete", "JobComplete",
+		"Conductor executor Job %s completed successfully", jobName)
+	logger.Info("TalosMachineConfigRestore complete",
+		"name", mcr.Name, "capability", capabilityMachineConfigRestore)
+	return ctrl.Result{}, nil
+}
+
+// SetupWithManager registers MachineConfigRestoreReconciler with the manager.
+func (r *MachineConfigRestoreReconciler) SetupWithManager(mgr ctrl.Manager) error {
+	// Build a controller whose primary watched resource is
+	// TalosMachineConfigRestore, then attach this reconciler to it.
+	bldr := ctrl.NewControllerManagedBy(mgr)
+	bldr = bldr.For(&platformv1alpha1.TalosMachineConfigRestore{})
+	return bldr.Complete(r)
+}