diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 8e82031..90d2851 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -15,11 +15,17 @@ jobs: with: path: platform - - name: Checkout seam-core (replace dep) + - name: Checkout seam (replace dep) uses: actions/checkout@v4 with: - repository: ontai-dev/seam-core - path: seam-core + repository: ontai-dev/seam + path: seam + + - name: Checkout seam-sdk (replace dep) + uses: actions/checkout@v4 + with: + repository: ontai-dev/seam-sdk + path: seam-sdk - name: Checkout conductor (replace dep) uses: actions/checkout@v4 diff --git a/CLAUDE.md b/CLAUDE.md index 51e9122..729905d 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -2,28 +2,52 @@ > Read ~/ontai/CLAUDE.md first. The constraints below extend the root constitutional document. ### Schema authority + Primary: docs/platform-schema.md -CRD schema authority: ~/ontai/seam-core/docs/seam-core-schema.md (Decision G: seam-core owns InfrastructureTalosCluster and InfrastructureRunnerConfig type definitions; platform owns reconciliation behavior) -Supporting: ~/ontai/conductor/docs/conductor-schema.md (Conductor capabilities and job protocol for operational Jobs) -Supporting: ~/ontai/guardian/docs/guardian-schema.md (RBACProfile gate and enable phase order) -Supporting: ~/ontai/wrapper/docs/wrapper-schema.md (PackInstance gate for Cilium deployment) + +Supporting (read before any design or implementation work): +- ~/ontai/seam/docs/seam-schema.md -- RunnerConfig and TalosCluster CRD schema (seam is the canonical module; not seam-core) +- ~/ontai/conductor/docs/conductor-schema.md -- Conductor capabilities and Job protocol for all operational Jobs +- ~/ontai/guardian/docs/guardian-schema.md -- RBACProfile gate and enable phase order +- ~/ontai/dispatcher/docs/dispatcher-schema.md -- PackInstalled gate for Cilium deployment (not wrapper) ### Invariants + INV-015 -- Deletion of TalosCluster never triggers physical cluster destruction. 
ClusterReset is the only path to cluster destruction. -CP-INV-001 -- The talos goclient is restricted to SeamInfrastructureClusterReconciler and SeamInfrastructureMachineReconciler only. Every other reconciler in platform has zero talos goclient access. (root INV-013) + +CP-INV-001 -- The talos goclient is restricted to SeamInfrastructureClusterReconciler and SeamInfrastructureMachineReconciler only. Every other reconciler in platform has zero talos goclient access. Any other file importing the talos goclient is an invariant violation. (root INV-013) + CP-INV-002 -- All reconcilers outside the Seam Infrastructure Provider observe cluster state through CAPI Machine status conditions and Kubernetes node labels only. No direct Talos API queries outside the provider. + CP-INV-003 -- RunnerConfig is generated by the operator using the shared runner library for all operational Job CRDs. Never hand-coded. Not generated for CAPI-managed lifecycle operations. + CP-INV-004 -- platform creates tenant namespaces. It is the sole namespace creation authority. No other component creates seam-tenant-{cluster-name} namespaces. + CP-INV-006 -- TalosClusterReset requires ontai.dev/reset-approved=true annotation before any reconciliation proceeds. + CP-INV-007 -- Leader election required. Lease name: platform-leader. Lease namespace: seam-system. + CP-INV-008 -- TalosCluster owns all CAPI objects for target clusters via ownerReference. No CAPI object exists in a tenant namespace without a TalosCluster ownerReference. + CP-INV-009 -- Every TalosConfigTemplate includes cluster.network.cni.name: none and Cilium-required BPF kernel parameters. Omitting them leaves nodes permanently NotReady. -CP-INV-010 -- Kueue is not used for any operation in platform. Operational runner Jobs submit directly. Kueue governs wrapper pack-deploy Jobs exclusively. + +CP-INV-010 -- Kueue is not used for any operation in platform. Operational runner Jobs submit directly. 
Kueue governs dispatcher pack-deploy Jobs exclusively. + CP-INV-011 -- The Seam Infrastructure Provider binary is distroless. Contains talos goclient and kube goclient only. (root INV-022) -CP-INV-012 -- platform is installed after guardian reaches operational state and its RBACProfile reaches provisioned=true. + +CP-INV-012 -- platform installs after guardian reaches operational state and its RBACProfile reaches provisioned=true. + CP-INV-013 -- CiliumPending on TalosCluster is not a degraded state. It is the expected state between CAPI cluster Running and Cilium PackInstance Ready. ### Session protocol additions -Step 4a -- Read platform-design.md in this repository. -Step 4b -- Determine which category the target CRD belongs to before implementing any reconciler: CAPI-managed lifecycle (TalosCluster target path, SeamInfrastructureCluster, SeamInfrastructureMachine -- no RunnerConfig); operational runner Job CRDs (TalosBackup, TalosEtcdMaintenance, TalosPKIRotation, TalosRecovery, TalosHardeningApply, TalosNodePatch, TalosCredentialRotation, TalosClusterReset -- verify capability in conductor-schema.md first). PlatformTenant is dropped: tenant coordination is handled by InfrastructureTalosCluster (mode=import or mode=bootstrap) plus the conductor role=tenant Deployment managed by the compiler enable bundle. -Step 4c -- For any Seam Infrastructure Provider session: confirm talos goclient access is bounded to SeamInfrastructureClusterReconciler and SeamInfrastructureMachineReconciler only. Any other file importing talos goclient is a CP-INV-001 violation. + +Step 4a -- Read platform-design.md in this repository before any implementation session. + +Step 4b -- Determine which category the target CRD belongs to before implementing any reconciler: +- CAPI-managed lifecycle path (TalosCluster target path, SeamInfrastructureCluster, SeamInfrastructureMachine): no RunnerConfig generated. 
These reconcilers must not import the talos goclient (only the Seam Infrastructure Provider reconcilers may). +- Dual-path CRDs (UpgradePolicy, NodeOperation, ClusterMaintenance): check spec.capi.enabled on the owning TalosCluster. CAPI path uses native CAPI machinery. Non-CAPI path submits a Conductor executor Job via RunnerConfig. Verify the named capability in conductor-schema.md before implementing. +- Direct Conductor Job CRDs (EtcdMaintenance, NodeMaintenance, PKIRotation, ClusterReset, TalosMachineConfigBackup, TalosMachineConfigRestore, MaintenanceBundle): always submit a Conductor executor Job regardless of capi.enabled. Verify the named capability in conductor-schema.md before implementing. +- Configuration-only CRDs (HardeningProfile): no Job submission. Validates spec and sets status conditions only. +- Schedule CRDs (TalosEtcdBackupSchedule, TalosMachineConfigBackupSchedule): create child operation CRs on interval. No direct Job submission. + +Step 4c -- For any Seam Infrastructure Provider session: confirm talos goclient access is bounded to SeamInfrastructureClusterReconciler and SeamInfrastructureMachineReconciler only. Run a grep for talos goclient imports across all reconciler files before and after any change. Any other file importing the talos goclient is a CP-INV-001 violation and must be corrected before the session closes. diff --git a/README.md b/README.md index 1f353e3..6a5e06c 100644 --- a/README.md +++ b/README.md @@ -1,139 +1,125 @@ # platform -**Seam Platform operator** -**API Group:** `platform.ontai.dev` (ONT-native), `infrastructure.cluster.x-k8s.io` (CAPI) -**Image:** `registry.ontai.dev/ontai-dev/platform:` +Platform is the CAPI management plane operator and ONT-native Infrastructure Provider for Talos. It owns the complete lifecycle of Talos clusters under Seam governance and all day-2 operational CRDs. 
---- +## API Groups -## What this repository is +### seam.ontai.dev/v1alpha1 -`platform` is the CAPI management plane operator and the ONT-native Infrastructure -Provider for Talos. It owns the complete lifecycle of Talos clusters and all tenant -coordination. +| Kind | Short | Scope | Purpose | +|------|-------|-------|---------| +| TalosCluster | tc | Namespaced | Root CR for every cluster under Seam governance | +| ClusterLog | clog | Namespaced | Accumulated day-2 operation history per cluster per revision | ---- +These types are defined in `api/seam/v1alpha1/`. TalosCluster and ClusterLog live under `seam.ontai.dev`, not `platform.ontai.dev`. + +### platform.ontai.dev/v1alpha1 + +| Kind | Short | Scope | Purpose | +|------|-------|-------|---------| +| EtcdMaintenance | em | Namespaced | Etcd backup, restore, and defrag operations | +| TalosEtcdBackupSchedule | etcdbs | Namespaced | Recurring etcd backup schedule (creates EtcdMaintenance CRs) | +| NodeMaintenance | nm | Namespaced | Node-level patch, hardening-apply, credential-rotate | +| NodeOperation | nop | Namespaced | Node scale-up, decommission, reboot | +| PKIRotation | pkir | Namespaced | Cluster PKI certificate rotation | +| ClusterReset | crst | Namespaced | Destructive factory reset (human gate required) | +| ClusterMaintenance | cmaint | Namespaced | Maintenance window gate with CAPI pause integration | +| UpgradePolicy | upgp | Namespaced | Talos OS, Kubernetes, or combined stack upgrades | +| HardeningProfile | hp | Namespaced | Reusable hardening ruleset (configuration CR, not a Job trigger) | +| MaintenanceBundle | mb | Namespaced | Pre-compiled scheduling artifact from `compiler maintenance` | +| TalosMachineConfigBackup | mcb | Namespaced | Node machine config backup to S3 | +| TalosMachineConfigBackupSchedule | mcbs | Namespaced | Recurring machine config backup schedule | +| TalosMachineConfigRestore | mcr | Namespaced | Node machine config restore from S3 | + +### 
infrastructure.cluster.x-k8s.io (CAPI -- frozen) + +| Kind | Purpose | +|------|---------| +| SeamInfrastructureCluster | Cluster-level CAPI infrastructure reference | +| SeamInfrastructureMachine | Per-node CAPI infrastructure reference | -## CRDs - -### ONT-native (`platform.ontai.dev`) - -| Kind | Role | -|---|---| -| `TalosCluster` | Root declaration for a Talos target cluster (CAPI composition root) | -| `TalosClusterReset` | Affirmative CR for cluster destruction with human approval gate | -| `TalosBackup` | Operational runner Job for etcd snapshot backup | -| `TalosEtcdMaintenance` | Operational runner Job for etcd defragmentation and compaction | -| `TalosPKIRotation` | Operational runner Job for PKI certificate rotation | -| `TalosRecovery` | Operational runner Job for cluster recovery from etcd snapshot | -| `TalosHardeningApply` | Operational runner Job for CIS benchmark hardening | -| `TalosNodePatch` | Operational runner Job for targeted node configuration patch | -| `TalosNodeOperation` | Operational runner Job for node cordon, drain, and reboot sequences | -| `TalosCredentialRotation` | Operational runner Job for credential rotation | -| `ClusterMaintenance` | Operational runner Job for scheduled maintenance windows | -| `UpgradePolicy` | Declared upgrade policy for a cluster or node pool | -| `HardeningProfile` | Declared hardening target profile | -| `MaintenanceBundle` | Aggregate maintenance intent record | - -### CAPI Infrastructure Provider (`infrastructure.cluster.x-k8s.io`) - -| Kind | Role | -|---|---| -| `SeamInfrastructureCluster` | CAPI InfrastructureCluster implementation for Talos | -| `SeamInfrastructureMachine` | CAPI InfrastructureMachine implementation for Talos nodes | +These implement the CAPI InfrastructureCluster and InfrastructureMachine contracts. Schema is frozen and out of scope for platform development. --- ## Architecture -Platform operates in three modes. 
+Platform operates in three modes simultaneously on the management cluster. + +### CAPI target cluster lifecycle + +For `spec.capi.enabled=true` TalosCluster CRs, Platform creates and owns CAPI objects (SeamInfrastructureCluster, cluster.x-k8s.io/Cluster, TalosControlPlane, MachineDeployment, TalosConfigTemplate, SeamInfrastructureMachineTemplate) in the tenant namespace via ownerReference (CP-INV-008). CAPI controllers reconcile those objects to actual cluster state through the Seam Infrastructure Provider. + +The Seam Infrastructure Provider (SeamInfrastructureClusterReconciler and SeamInfrastructureMachineReconciler) is the only part of Platform that uses the talos goclient. It watches SeamInfrastructureMachine objects and delivers CABPT-rendered machineconfigs to pre-provisioned Talos nodes on port 50000. + +Dual-path CRDs (UpgradePolicy, NodeOperation, ClusterMaintenance) delegate to CAPI native machinery on this path. No Conductor Job is submitted for CAPI-managed lifecycle operations. + +### Direct bootstrap management cluster -**CAPI composition (target cluster lifecycle):** -`TalosCluster` is the root object. The platform reconciler creates and owns CAPI -objects (`Cluster`, `TalosControlPlane`, `MachineDeployment`, `SeamInfrastructureCluster`, -`SeamInfrastructureMachine`) as children of `TalosCluster`. The Seam Infrastructure -Provider reconcilers deliver machineconfigs to pre-provisioned nodes on port 50000 -via the talos goclient. +For the management cluster TalosCluster CR (`spec.capi.enabled=false`), CAPI is not used. Management cluster bootstrap is Seam-native: the Compiler generates machineconfigs, Platform submits a bootstrap Conductor Job, and the cluster forms without CAPI intermediation. -**Direct bootstrap Job (management cluster):** -The ONT bootstrap path via conductor Jobs is used for management cluster bootstrap. -CAPI is not involved in management cluster provisioning. 
+All operational CRDs apply to the management cluster via direct Conductor executor Job submission regardless of `capi.enabled`. -**Operational runner Jobs (Talos operational CRDs):** -Seven CRDs (`TalosBackup`, `TalosEtcdMaintenance`, `TalosPKIRotation`, `TalosRecovery`, -`TalosHardeningApply`, `TalosNodePatch`, `TalosCredentialRotation`) submit conductor -executor Jobs directly. Kueue is not used for any platform operation. +### Operational runner Jobs -**Tenant coordination:** -Platform creates `seam-tenant-{cluster-name}` namespaces. It is the sole namespace -creation authority. Tenant coordination CRDs (`UpgradePolicy`, `HardeningProfile`, -`MaintenanceBundle`) are pure record-keeping reconcilers with no runner Jobs. +For operational CRDs (EtcdMaintenance, NodeMaintenance, PKIRotation, ClusterReset, and the non-CAPI paths of UpgradePolicy and NodeOperation), Platform generates a RunnerConfig using the shared runner library and submits a Conductor executor Job directly, without Kueue admission control (CP-INV-010). --- ## Key invariants -- The talos goclient is restricted exclusively to `SeamInfrastructureClusterReconciler` - and `SeamInfrastructureMachineReconciler`. All other reconcilers have zero talos - goclient access. -- `TalosCluster` deletion never triggers cluster destruction. `TalosClusterReset` - is the only destruction path, and requires `ontai.dev/reset-approved=true`. -- Kueue is not used for any operation in platform. -- Platform installs after guardian reaches `provisioned=true` on its `RBACProfile`. +**talos goclient restriction (CP-INV-001):** The talos goclient is restricted to SeamInfrastructureClusterReconciler and SeamInfrastructureMachineReconciler only. Every other reconciler in Platform has zero talos goclient access. 
---- +**TalosCluster deletion never destroys a cluster (INV-015):** Deleting a TalosCluster CR cascades to owned CAPI objects through Kubernetes garbage collection but does not factory reset any node. ClusterReset is the only path to physical cluster destruction. -## Building +**Kueue is not used (CP-INV-010):** Platform does not use Kueue for any operation. Operational runner Jobs submit directly. Kueue governs dispatcher pack-deploy Jobs exclusively. -```sh -go build ./cmd/platform -``` +**RunnerConfig is generated by the operator (CP-INV-003):** RunnerConfig is always generated by Platform using the shared runner library. It is never hand-coded and is not generated for CAPI-managed lifecycle operations. -The binary is built into a distroless container image: +**ClusterReset requires human approval (CP-INV-006):** The `ontai.dev/reset-approved=true` annotation must be present on the ClusterReset CR before any reconciliation proceeds. -```sh -docker build -t registry.ontai.dev/ontai-dev/platform: . -``` +**Tenant namespaces (CP-INV-004):** Platform is the sole authority for creating `seam-tenant-{cluster-name}` namespaces. ---- +**Cilium install order (CP-INV-009, CP-INV-013):** Every TalosConfigTemplate includes `cluster.network.cni.name: none` and Cilium BPF kernel parameters. CiliumPending on TalosCluster is not a degraded state; it is the expected state between CAPI cluster Running and Cilium PackInstance Ready. -## Testing +**Install gate (CP-INV-012):** Platform installs after Guardian reaches operational state and its RBACProfile reaches `provisioned=true`. -```sh -go test ./test/unit/... -``` +**Leader election (CP-INV-007):** Leader election is required. Lease name: `platform-leader`. Lease namespace: `seam-system`. 
--- -## Schema and design reference +## Build and test -- `docs/platform-schema.md` - API contract, field definitions, status conditions -- `platform-design.md` - Implementation architecture and reconciler design +``` +make build +make test +make e2e # requires MGMT_KUBECONFIG +make docker-build IMAGE_REGISTRY=10.20.0.1:5000/ontai-dev +make docker-push IMAGE_REGISTRY=10.20.0.1:5000/ontai-dev +``` + +Operator Deployments and enable bundles always reference `:dev` in lab and development environments (INV-023). --- -## Status +## Schema -Alpha. Deployed and tested on management cluster (ccs-mgmt). -Tenant cluster onboarding is not yet verified end to end. -See [docs/platform-schema.md](./docs/platform-schema.md) -for current capability and known gaps. +Primary schema reference: `docs/platform-schema.md` -CRDs are deployed and reconciling on the live management cluster. -The schema specification is published at: -https://schema.ontai.dev/v1alpha1/ +Supporting references: -## Contributing +- `~/ontai/seam/docs/seam-schema.md` -- RunnerConfig and TalosCluster CRD schema +- `~/ontai/conductor/docs/conductor-schema.md` -- Conductor capabilities and Job protocol +- `~/ontai/guardian/docs/guardian-schema.md` -- RBACProfile gate and enable phase order +- `~/ontai/dispatcher/docs/dispatcher-schema.md` -- PackInstalled gate for Cilium + +--- -Read [CONTRIBUTING.md](./CONTRIBUTING.md) before opening a pull -request. Every new reconciliation behavior requires a written -specification and senior engineer sign-off before any code is -written. +## Issues -File issues at https://github.com/ontai-dev/platform/issues. -For security issues contact security@ontai.dev directly. 
+https://github.com/ontai-dev/platform/issues --- -*platform - Seam Platform Operator* -*Apache License, Version 2.0* +platform - Seam Platform Operator +Apache License, Version 2.0 diff --git a/api/infrastructure/v1alpha1/lineage_conditions.go b/api/infrastructure/v1alpha1/lineage_conditions.go index cf728a6..c2c974d 100644 --- a/api/infrastructure/v1alpha1/lineage_conditions.go +++ b/api/infrastructure/v1alpha1/lineage_conditions.go @@ -6,14 +6,14 @@ package v1alpha1 // // Seam Infrastructure Provider reconcilers reference these via the infrav1alpha1 // package alias; they continue to compile without modification. New code should -// prefer importing github.com/ontai-dev/seam-core/pkg/conditions directly. +// prefer importing github.com/ontai-dev/seam/pkg/conditions directly. -import "github.com/ontai-dev/seam-core/pkg/conditions" +import "github.com/ontai-dev/seam/pkg/conditions" const ( // ConditionTypeLineageSynced is the reserved condition type for lineage // synchronization status on every root declaration CR. - // Canonical source: github.com/ontai-dev/seam-core/pkg/conditions. + // Canonical source: github.com/ontai-dev/seam/pkg/conditions. ConditionTypeLineageSynced = conditions.ConditionTypeLineageSynced // ReasonLineageControllerAbsent is set when the reconciler initialises diff --git a/api/infrastructure/v1alpha1/seaminfrastructurecluster_types.go b/api/infrastructure/v1alpha1/seaminfrastructurecluster_types.go index 5dc58a6..c8bb51f 100644 --- a/api/infrastructure/v1alpha1/seaminfrastructurecluster_types.go +++ b/api/infrastructure/v1alpha1/seaminfrastructurecluster_types.go @@ -3,7 +3,7 @@ package v1alpha1 import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "github.com/ontai-dev/seam-core/pkg/lineage" + "github.com/ontai-dev/seam/pkg/lineage" ) // Condition type constants for SeamInfrastructureCluster. 
diff --git a/api/infrastructure/v1alpha1/seaminfrastructuremachine_types.go b/api/infrastructure/v1alpha1/seaminfrastructuremachine_types.go index 62a90e6..ce52fb3 100644 --- a/api/infrastructure/v1alpha1/seaminfrastructuremachine_types.go +++ b/api/infrastructure/v1alpha1/seaminfrastructuremachine_types.go @@ -3,7 +3,7 @@ package v1alpha1 import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "github.com/ontai-dev/seam-core/pkg/lineage" + "github.com/ontai-dev/seam/pkg/lineage" ) // NodeRole defines the role of a node in a Talos cluster. diff --git a/api/infrastructure/v1alpha1/zz_generated.deepcopy.go b/api/infrastructure/v1alpha1/zz_generated.deepcopy.go index 876f833..f5e0315 100644 --- a/api/infrastructure/v1alpha1/zz_generated.deepcopy.go +++ b/api/infrastructure/v1alpha1/zz_generated.deepcopy.go @@ -5,7 +5,7 @@ package v1alpha1 import ( - "github.com/ontai-dev/seam-core/pkg/lineage" + "github.com/ontai-dev/seam/pkg/lineage" "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" ) diff --git a/api/seam/v1alpha1/clusterlog_types.go b/api/seam/v1alpha1/clusterlog_types.go new file mode 100644 index 0000000..6e45b8d --- /dev/null +++ b/api/seam/v1alpha1/clusterlog_types.go @@ -0,0 +1,144 @@ +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// ResultStatus is the terminal status of a TalosCluster day-2 operation. +// +kubebuilder:validation:Enum=Succeeded;Failed +type ResultStatus string + +const ( + ResultSucceeded ResultStatus = "Succeeded" + ResultFailed ResultStatus = "Failed" +) + +// OperationFailureReason is a structured failure description for +// a day-2 operation that reached a terminal Failed state. +type OperationFailureReason struct { + // Category classifies the failure domain. 
+ // +kubebuilder:validation:Enum=ValidationFailure;CapabilityUnavailable;ExecutionFailure;ExternalDependencyFailure;InvariantViolation + Category string `json:"category"` + + // Reason is a human-readable description of the failure. + Reason string `json:"reason"` +} + +// OperationRecord is a single day-2 operation record within one +// talosVersion revision. Multiple records accumulate in the parent ClusterLog as +// operations are performed against the cluster. +type OperationRecord struct { + // Capability is the conductor capability that produced this record. + Capability string `json:"capability"` + + // JobRef is the Kubernetes Job name that produced this record. + // The platform reconciler uses this to correlate the record with the Job it submitted. + JobRef string `json:"jobRef"` + + // Status is the terminal status of the capability execution. + // +kubebuilder:validation:Enum=Succeeded;Failed + Status ResultStatus `json:"status"` + + // Message provides a human-readable summary of the outcome. + // +optional + Message string `json:"message,omitempty"` + + // StartedAt is the time the capability execution began. + // +optional + StartedAt *metav1.Time `json:"startedAt,omitempty"` + + // CompletedAt is the time the capability execution finished. + // +optional + CompletedAt *metav1.Time `json:"completedAt,omitempty"` + + // FailureReason is populated when Status is Failed. Nil on success. + // +optional + FailureReason *OperationFailureReason `json:"failureReason,omitempty"` +} + +// ClusterLogSpec is the accumulated day-2 operation history for one cluster, +// scoped to the current talosVersion revision. +// +// One CR per cluster. Created by the platform operator when the cluster tenant +// namespace is provisioned. Named by the cluster name. Lives in seam-tenant-{clusterRef}. 
+// +// When the cluster talosVersion is upgraded, the current revision is archived to +// the GraphQuery DB and a new revision begins: Revision increments, TalosVersion +// is updated, and Operations is cleared. +// +// conductor-schema.md §8. +type ClusterLogSpec struct { + // ClusterRef is the name of the TalosCluster this log accumulates. + ClusterRef string `json:"clusterRef"` + + // TalosVersion is the cluster talosVersion for the current active revision. + // Matches TalosCluster.spec.talosVersion at the time this revision began. + TalosVersion string `json:"talosVersion"` + + // Revision is the monotonic revision counter. Starts at 1. Increments on each + // talosVersion upgrade. Each revision holds the operations performed during that + // version epoch. Archived revisions are stored in the GraphQuery DB. + // +kubebuilder:default=1 + Revision int64 `json:"revision"` + + // Operations is the map of day-2 operation records for the current revision, + // keyed by Kubernetes Job name. Map keying enables O(1) lookup by the platform + // reconciler and clean serialization when archiving the revision to the GraphQuery DB. + // +optional + Operations map[string]OperationRecord `json:"operations,omitempty"` + + // OperationCount is the count of records in Operations for the current revision. + // Maintained by the writer alongside Operations so kubectl can display it + // as an integer column. Updated atomically with every Operations write. + // json tag intentionally omits omitempty so the writer always serializes 0. + // +optional + OperationCount int64 `json:"operationCount"` +} + +// ClusterLogStatus is the observed state. +// It currently holds only ObservedGeneration; conditions may be added later. +type ClusterLogStatus struct { + // ObservedGeneration is the last generation observed by any consumer. 
+ // +optional + ObservedGeneration int64 `json:"observedGeneration,omitempty"` +} + +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status +// +kubebuilder:resource:scope=Namespaced,shortName=clog +// +kubebuilder:printcolumn:name="Cluster",type=string,JSONPath=`.spec.clusterRef` +// +kubebuilder:printcolumn:name="TalosVersion",type=string,JSONPath=`.spec.talosVersion` +// +kubebuilder:printcolumn:name="Revision",type=integer,JSONPath=`.spec.revision` +// +kubebuilder:printcolumn:name="Ops",type=integer,JSONPath=`.spec.operationCount` +// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp` + +// ClusterLog accumulates the day-2 operation history for one cluster. One CR per +// cluster, created when the platform operator provisions the cluster tenant namespace. +// Operations are appended by the Conductor execute-mode Job. On talosVersion upgrade, +// the current revision is archived to the GraphQuery DB and a new revision epoch begins. +// +// Named by the cluster name. Lives in seam-tenant-{clusterRef}. +// conductor-schema.md §8. +type ClusterLog struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec ClusterLogSpec `json:"spec,omitempty"` + Status ClusterLogStatus `json:"status,omitempty"` +} + +// +kubebuilder:object:root=true + +// ClusterLogList contains a list of ClusterLog. +type ClusterLogList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []ClusterLog `json:"items"` +} + +func init() { + SchemeBuilder.Register( + &ClusterLog{}, + &ClusterLogList{}, + ) +} diff --git a/api/seam/v1alpha1/groupversion_info.go b/api/seam/v1alpha1/groupversion_info.go new file mode 100644 index 0000000..84be5ee --- /dev/null +++ b/api/seam/v1alpha1/groupversion_info.go @@ -0,0 +1,23 @@ +// Package v1alpha1 contains API types for the seam.ontai.dev/v1alpha1 API group +// as owned by platform. 
TalosCluster is the primary type declared here. +// +// +groupName=seam.ontai.dev +// +kubebuilder:object:generate=true +package v1alpha1 + +import ( + "k8s.io/apimachinery/pkg/runtime/schema" + "sigs.k8s.io/controller-runtime/pkg/scheme" +) + +var ( + // GroupVersion is group + version for all types in this package. + // API group: seam.ontai.dev. INV-008 -- this value is ground truth. + GroupVersion = schema.GroupVersion{Group: "seam.ontai.dev", Version: "v1alpha1"} + + // SchemeBuilder registers Go types with the Kubernetes runtime scheme. + SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion} + + // AddToScheme adds all types in this package to the provided scheme. + AddToScheme = SchemeBuilder.AddToScheme +) diff --git a/api/seam/v1alpha1/taloscluster_types.go b/api/seam/v1alpha1/taloscluster_types.go new file mode 100644 index 0000000..a2a2192 --- /dev/null +++ b/api/seam/v1alpha1/taloscluster_types.go @@ -0,0 +1,255 @@ +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/ontai-dev/seam/pkg/lineage" +) + +// TalosClusterMode declares whether the cluster is bootstrapped or imported. +// +kubebuilder:validation:Enum=bootstrap;import +type TalosClusterMode string + +const ( + TalosClusterModeBootstrap TalosClusterMode = "bootstrap" + TalosClusterModeImport TalosClusterMode = "import" +) + +// TalosClusterRole declares the role of the cluster in the Seam topology. +// Mandatory on mode=import. +// +kubebuilder:validation:Enum=management;tenant +type TalosClusterRole string + +const ( + TalosClusterRoleManagement TalosClusterRole = "management" + TalosClusterRoleTenant TalosClusterRole = "tenant" +) + +// TalosClusterOrigin records how the cluster came to exist. 
+// +kubebuilder:validation:Enum=bootstrapped;imported +type TalosClusterOrigin string + +const ( + TalosClusterOriginBootstrapped TalosClusterOrigin = "bootstrapped" + TalosClusterOriginImported TalosClusterOrigin = "imported" +) + +// InfrastructureProvider declares the infrastructure provider backing a TalosCluster. +// +kubebuilder:validation:Enum=native;capi;screen +type InfrastructureProvider string + +const ( + // InfrastructureProviderNative is the default provider. + InfrastructureProviderNative InfrastructureProvider = "native" + + // InfrastructureProviderCAPI is an explicit alias for the CAPI-backed path. + InfrastructureProviderCAPI InfrastructureProvider = "capi" + + // InfrastructureProviderScreen is reserved for the future Screen operator (INV-021). + InfrastructureProviderScreen InfrastructureProvider = "screen" +) + +// LocalObjectRef is a reference to a Kubernetes object by name and namespace. +type LocalObjectRef struct { + // Name is the object name. + Name string `json:"name"` + + // Namespace is the object namespace. May be empty for cluster-scoped objects. + // +optional + Namespace string `json:"namespace,omitempty"` +} + +// CAPICiliumPackRef is a reference to the cluster-specific Cilium PackDelivery. +// platform-schema.md §2.3. +type CAPICiliumPackRef struct { + // Name is the PackDelivery CR name for the Cilium pack. + Name string `json:"name"` + + // Version is the PackDelivery version string. + Version string `json:"version"` +} + +// CAPIWorkerPool declares a worker node pool for a CAPI-managed target cluster. +type CAPIWorkerPool struct { + // Name is the pool identifier. Used as the MachineDeployment name suffix. + Name string `json:"name"` + + // Replicas is the desired number of worker nodes in this pool. + // +optional + Replicas int32 `json:"replicas,omitempty"` + + // SeamInfrastructureMachineNames lists the SeamInfrastructureMachine CR names + // pre-provisioned for this pool. One per node. 
+ // +optional + SeamInfrastructureMachineNames []string `json:"seamInfrastructureMachineNames,omitempty"` +} + +// CAPIControlPlaneConfig declares the control plane configuration for a CAPI target cluster. +type CAPIControlPlaneConfig struct { + // Replicas is the desired number of control plane nodes. + // +optional + Replicas int32 `json:"replicas,omitempty"` +} + +// CAPIConfig holds CAPI integration settings for a target cluster. +// Only consulted when capi.enabled=true. platform-schema.md §5. +type CAPIConfig struct { + // Enabled determines whether this TalosCluster uses the CAPI path. + Enabled bool `json:"enabled"` + + // TalosVersion is the Talos version to use for TalosConfigTemplate generation. + // +optional + TalosVersion string `json:"talosVersion,omitempty"` + + // KubernetesVersion is the Kubernetes version for TalosControlPlane. + // +optional + KubernetesVersion string `json:"kubernetesVersion,omitempty"` + + // ControlPlane holds control plane configuration. Required when Enabled=true. + // +optional + ControlPlane *CAPIControlPlaneConfig `json:"controlPlane,omitempty"` + + // Workers is the list of worker node pools. + // +optional + Workers []CAPIWorkerPool `json:"workers,omitempty"` + + // CiliumPackRef references the cluster-specific Cilium PackDelivery. + // +optional + CiliumPackRef *CAPICiliumPackRef `json:"ciliumPackRef,omitempty"` +} + +// TalosClusterSpec is the declared desired state of a TalosCluster. +// platform-schema.md §4. +// +kubebuilder:validation:XValidation:rule="self.mode != 'import' || (has(self.role) && self.role != '')",message="role is required when mode is import" +type TalosClusterSpec struct { + // Mode declares whether this cluster is bootstrapped from scratch or imported. + // +kubebuilder:validation:Enum=bootstrap;import + Mode TalosClusterMode `json:"mode"` + + // Role declares the cluster role in the Seam topology. Mandatory on mode=import. 
+ // +kubebuilder:validation:Enum=management;tenant + // +optional + Role TalosClusterRole `json:"role,omitempty"` + + // TalosVersion is the Talos OS version for this cluster. INV-012. + // +optional + TalosVersion string `json:"talosVersion,omitempty"` + + // KubernetesVersion is the Kubernetes version for this cluster. When + // spec.versionUpgrade=true, setting this field drives an UpgradeTypeKubernetes + // UpgradePolicy. Setting both talosVersion and kubernetesVersion drives an + // UpgradeTypeStack policy (sequential Talos then Kubernetes upgrade). + // +optional + KubernetesVersion string `json:"kubernetesVersion,omitempty"` + + // VersionUpgrade, when set to true, triggers a cluster-level rolling upgrade. + // Upgrade type is derived from which version fields are set: + // - talosVersion only: UpgradeTypeTalos + // - kubernetesVersion only: UpgradeTypeKubernetes + // - both: UpgradeTypeStack (sequential Talos then k8s) + // +optional + VersionUpgrade bool `json:"versionUpgrade,omitempty"` + + // ClusterEndpoint is the cluster VIP or primary API endpoint IP. + // +optional + ClusterEndpoint string `json:"clusterEndpoint,omitempty"` + + // NodeAddresses is the list of node IPs for DSNSReconciler A-record population. + // +optional + NodeAddresses []string `json:"nodeAddresses,omitempty"` + + // CAPI holds CAPI integration settings. When absent, direct bootstrap is used. + // +optional + CAPI *CAPIConfig `json:"capi,omitempty"` + + // InfrastructureProvider declares the infrastructure provider backing this cluster. + // +kubebuilder:validation:Enum=native;capi;screen + // +kubebuilder:default=native + // +optional + InfrastructureProvider InfrastructureProvider `json:"infrastructureProvider,omitempty"` + + // KubeconfigSecretRef is the name of the Secret containing the kubeconfig. + // Required on mode=import. Not used when CAPI manages the lifecycle. 
+ // +optional + KubeconfigSecretRef string `json:"kubeconfigSecretRef,omitempty"` + + // TalosconfigSecretRef is the name of the Secret containing the talosconfig. + // +optional + TalosconfigSecretRef string `json:"talosconfigSecretRef,omitempty"` + + // Lineage is the sealed causal chain record. Immutable after creation. + // +optional + Lineage *lineage.SealedCausalChain `json:"lineage,omitempty"` + + // PkiRotationThresholdDays is the days before cert expiry at which a PKIRotation + // CR is auto-created. Default 30. platform-schema.md §13. + // +optional + // +kubebuilder:default=30 + // +kubebuilder:validation:Minimum=1 + PkiRotationThresholdDays int32 `json:"pkiRotationThresholdDays,omitempty"` + + // HardeningProfileRef references a HardeningProfile CR to apply at bootstrap. + // platform-schema.md §11. + // +optional + HardeningProfileRef *LocalObjectRef `json:"hardeningProfileRef,omitempty"` +} + +// TalosClusterStatus is the observed state of a TalosCluster. +type TalosClusterStatus struct { + // ObservedGeneration is the generation most recently reconciled. + // +optional + ObservedGeneration int64 `json:"observedGeneration,omitempty"` + + // Origin records how this cluster came under Seam governance. + // +optional + Origin TalosClusterOrigin `json:"origin,omitempty"` + + // ObservedTalosVersion is the Talos version last confirmed running. + // +optional + ObservedTalosVersion string `json:"observedTalosVersion,omitempty"` + + // CAPIClusterRef is a reference to the owned CAPI Cluster object. + // Only set for CAPI-managed clusters (capi.enabled=true). + // +optional + CAPIClusterRef *LocalObjectRef `json:"capiClusterRef,omitempty"` + + // Conditions is the list of status conditions for this TalosCluster. + // +optional + Conditions []metav1.Condition `json:"conditions,omitempty"` + + // PkiExpiryDate is the earliest certificate expiry across the talosconfig and + // kubeconfig Secrets. Set by the TalosCluster reconciler. platform-schema.md §13. 
+ // +optional + PkiExpiryDate *metav1.Time `json:"pkiExpiryDate,omitempty"` +} + +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status +// +kubebuilder:resource:scope=Namespaced,shortName=tc +// +kubebuilder:printcolumn:name="Mode",type=string,JSONPath=".spec.mode" +// +kubebuilder:printcolumn:name="Role",type=string,JSONPath=".spec.role" +// +kubebuilder:printcolumn:name="Ready",type=string,JSONPath=".status.conditions[?(@.type==\"Ready\")].status" +// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=".metadata.creationTimestamp" + +// TalosCluster is the platform CRD for a Talos cluster under Seam governance. +// platform-schema.md §4. Decision H. seam.ontai.dev/v1alpha1. +type TalosCluster struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec TalosClusterSpec `json:"spec,omitempty"` + Status TalosClusterStatus `json:"status,omitempty"` +} + +// +kubebuilder:object:root=true + +// TalosClusterList contains a list of TalosCluster. 
+type TalosClusterList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []TalosCluster `json:"items"` +} + +func init() { + SchemeBuilder.Register(&TalosCluster{}, &TalosClusterList{}) +} diff --git a/api/seam/v1alpha1/zz_generated.deepcopy.go b/api/seam/v1alpha1/zz_generated.deepcopy.go new file mode 100644 index 0000000..e600476 --- /dev/null +++ b/api/seam/v1alpha1/zz_generated.deepcopy.go @@ -0,0 +1,336 @@ +//go:build !ignore_autogenerated + +package v1alpha1 + +import ( + "github.com/ontai-dev/seam/pkg/lineage" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + runtime "k8s.io/apimachinery/pkg/runtime" +) + +func (in *ClusterLog) DeepCopyInto(out *ClusterLog) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + out.Status = in.Status +} + +func (in *ClusterLog) DeepCopy() *ClusterLog { + if in == nil { + return nil + } + out := new(ClusterLog) + in.DeepCopyInto(out) + return out +} + +func (in *ClusterLog) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +func (in *ClusterLogList) DeepCopyInto(out *ClusterLogList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]ClusterLog, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +func (in *ClusterLogList) DeepCopy() *ClusterLogList { + if in == nil { + return nil + } + out := new(ClusterLogList) + in.DeepCopyInto(out) + return out +} + +func (in *ClusterLogList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +func (in *ClusterLogSpec) DeepCopyInto(out *ClusterLogSpec) { + *out = *in + if in.Operations != nil { + in, out := &in.Operations, &out.Operations + *out = make(map[string]OperationRecord, len(*in)) + for key, val := range *in { + (*out)[key] = 
*val.DeepCopy() + } + } +} + +func (in *ClusterLogSpec) DeepCopy() *ClusterLogSpec { + if in == nil { + return nil + } + out := new(ClusterLogSpec) + in.DeepCopyInto(out) + return out +} + +func (in *ClusterLogStatus) DeepCopyInto(out *ClusterLogStatus) { + *out = *in +} + +func (in *ClusterLogStatus) DeepCopy() *ClusterLogStatus { + if in == nil { + return nil + } + out := new(ClusterLogStatus) + in.DeepCopyInto(out) + return out +} + +func (in *OperationFailureReason) DeepCopyInto(out *OperationFailureReason) { + *out = *in +} + +func (in *OperationFailureReason) DeepCopy() *OperationFailureReason { + if in == nil { + return nil + } + out := new(OperationFailureReason) + in.DeepCopyInto(out) + return out +} + +func (in *OperationRecord) DeepCopyInto(out *OperationRecord) { + *out = *in + if in.StartedAt != nil { + in, out := &in.StartedAt, &out.StartedAt + *out = (*in).DeepCopy() + } + if in.CompletedAt != nil { + in, out := &in.CompletedAt, &out.CompletedAt + *out = (*in).DeepCopy() + } + if in.FailureReason != nil { + in, out := &in.FailureReason, &out.FailureReason + *out = new(OperationFailureReason) + **out = **in + } +} + +func (in *OperationRecord) DeepCopy() *OperationRecord { + if in == nil { + return nil + } + out := new(OperationRecord) + in.DeepCopyInto(out) + return out +} + +func (in *CAPIConfig) DeepCopyInto(out *CAPIConfig) { + *out = *in + if in.ControlPlane != nil { + in, out := &in.ControlPlane, &out.ControlPlane + *out = new(CAPIControlPlaneConfig) + **out = **in + } + if in.Workers != nil { + in, out := &in.Workers, &out.Workers + *out = make([]CAPIWorkerPool, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.CiliumPackRef != nil { + in, out := &in.CiliumPackRef, &out.CiliumPackRef + *out = new(CAPICiliumPackRef) + **out = **in + } +} + +func (in *CAPIConfig) DeepCopy() *CAPIConfig { + if in == nil { + return nil + } + out := new(CAPIConfig) + in.DeepCopyInto(out) + return out +} + +func (in 
*CAPICiliumPackRef) DeepCopyInto(out *CAPICiliumPackRef) { + *out = *in +} + +func (in *CAPICiliumPackRef) DeepCopy() *CAPICiliumPackRef { + if in == nil { + return nil + } + out := new(CAPICiliumPackRef) + in.DeepCopyInto(out) + return out +} + +func (in *CAPIControlPlaneConfig) DeepCopyInto(out *CAPIControlPlaneConfig) { + *out = *in +} + +func (in *CAPIControlPlaneConfig) DeepCopy() *CAPIControlPlaneConfig { + if in == nil { + return nil + } + out := new(CAPIControlPlaneConfig) + in.DeepCopyInto(out) + return out +} + +func (in *CAPIWorkerPool) DeepCopyInto(out *CAPIWorkerPool) { + *out = *in + if in.SeamInfrastructureMachineNames != nil { + in, out := &in.SeamInfrastructureMachineNames, &out.SeamInfrastructureMachineNames + *out = make([]string, len(*in)) + copy(*out, *in) + } +} + +func (in *CAPIWorkerPool) DeepCopy() *CAPIWorkerPool { + if in == nil { + return nil + } + out := new(CAPIWorkerPool) + in.DeepCopyInto(out) + return out +} + +func (in *LocalObjectRef) DeepCopyInto(out *LocalObjectRef) { + *out = *in +} + +func (in *LocalObjectRef) DeepCopy() *LocalObjectRef { + if in == nil { + return nil + } + out := new(LocalObjectRef) + in.DeepCopyInto(out) + return out +} + +func (in *TalosCluster) DeepCopyInto(out *TalosCluster) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +func (in *TalosCluster) DeepCopy() *TalosCluster { + if in == nil { + return nil + } + out := new(TalosCluster) + in.DeepCopyInto(out) + return out +} + +func (in *TalosCluster) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +func (in *TalosClusterList) DeepCopyInto(out *TalosClusterList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]TalosCluster, len(*in)) + for i := range *in { + 
(*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +func (in *TalosClusterList) DeepCopy() *TalosClusterList { + if in == nil { + return nil + } + out := new(TalosClusterList) + in.DeepCopyInto(out) + return out +} + +func (in *TalosClusterList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +func (in *TalosClusterSpec) DeepCopyInto(out *TalosClusterSpec) { + *out = *in + if in.NodeAddresses != nil { + in, out := &in.NodeAddresses, &out.NodeAddresses + *out = make([]string, len(*in)) + copy(*out, *in) + } + if in.CAPI != nil { + in, out := &in.CAPI, &out.CAPI + *out = new(CAPIConfig) + (*in).DeepCopyInto(*out) + } + if in.Lineage != nil { + in, out := &in.Lineage, &out.Lineage + *out = new(lineage.SealedCausalChain) + **out = **in + } + if in.HardeningProfileRef != nil { + in, out := &in.HardeningProfileRef, &out.HardeningProfileRef + *out = new(LocalObjectRef) + **out = **in + } +} + +func (in *TalosClusterSpec) DeepCopy() *TalosClusterSpec { + if in == nil { + return nil + } + out := new(TalosClusterSpec) + in.DeepCopyInto(out) + return out +} + +func (in *TalosClusterStatus) DeepCopyInto(out *TalosClusterStatus) { + *out = *in + if in.CAPIClusterRef != nil { + in, out := &in.CAPIClusterRef, &out.CAPIClusterRef + *out = new(LocalObjectRef) + **out = **in + } + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]v1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.PkiExpiryDate != nil { + in, out := &in.PkiExpiryDate, &out.PkiExpiryDate + *out = (*in).DeepCopy() + } +} + +func (in *TalosClusterStatus) DeepCopy() *TalosClusterStatus { + if in == nil { + return nil + } + out := new(TalosClusterStatus) + in.DeepCopyInto(out) + return out +} diff --git a/api/v1alpha1/clustermaintenance_types.go b/api/v1alpha1/clustermaintenance_types.go index d2c7fe7..56c7694 100644 --- a/api/v1alpha1/clustermaintenance_types.go +++ 
b/api/v1alpha1/clustermaintenance_types.go @@ -3,7 +3,7 @@ package v1alpha1 import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "github.com/ontai-dev/seam-core/pkg/lineage" + "github.com/ontai-dev/seam/pkg/lineage" ) // Condition type and reason constants for ClusterMaintenance. diff --git a/api/v1alpha1/clusterreset_types.go b/api/v1alpha1/clusterreset_types.go index b6d1c4e..f7c07db 100644 --- a/api/v1alpha1/clusterreset_types.go +++ b/api/v1alpha1/clusterreset_types.go @@ -3,7 +3,7 @@ package v1alpha1 import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "github.com/ontai-dev/seam-core/pkg/lineage" + "github.com/ontai-dev/seam/pkg/lineage" ) // Condition type and reason constants for ClusterReset. diff --git a/api/v1alpha1/etcdmaintenance_types.go b/api/v1alpha1/etcdmaintenance_types.go index c3c458f..06703ef 100644 --- a/api/v1alpha1/etcdmaintenance_types.go +++ b/api/v1alpha1/etcdmaintenance_types.go @@ -4,7 +4,7 @@ import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "github.com/ontai-dev/seam-core/pkg/lineage" + "github.com/ontai-dev/seam/pkg/lineage" ) // EtcdMaintenanceOperation declares the etcd lifecycle operation to perform. diff --git a/api/v1alpha1/hardeningprofile_types.go b/api/v1alpha1/hardeningprofile_types.go index 8a9a6c4..4181c36 100644 --- a/api/v1alpha1/hardeningprofile_types.go +++ b/api/v1alpha1/hardeningprofile_types.go @@ -3,7 +3,7 @@ package v1alpha1 import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "github.com/ontai-dev/seam-core/pkg/lineage" + "github.com/ontai-dev/seam/pkg/lineage" ) // Condition type and reason constants for HardeningProfile. 
diff --git a/api/v1alpha1/lineage_conditions.go b/api/v1alpha1/lineage_conditions.go index 20383e5..5ac716d 100644 --- a/api/v1alpha1/lineage_conditions.go +++ b/api/v1alpha1/lineage_conditions.go @@ -6,9 +6,9 @@ package v1alpha1 // // Platform reconcilers reference these via the platformv1alpha1 package alias; // they continue to compile without modification. New code should prefer importing -// github.com/ontai-dev/seam-core/pkg/conditions directly. +// github.com/ontai-dev/seam/pkg/conditions directly. -import "github.com/ontai-dev/seam-core/pkg/conditions" +import "github.com/ontai-dev/seam/pkg/conditions" const ( // ConditionTypeLineageSynced is the reserved condition type for lineage @@ -20,7 +20,7 @@ const ( // 2. InfrastructureLineageController takes ownership on deployment, sets True. // 3. If InfrastructureLineageController is absent, remains False/LineageControllerAbsent. // - // Canonical source: github.com/ontai-dev/seam-core/pkg/conditions. + // Canonical source: github.com/ontai-dev/seam/pkg/conditions. ConditionTypeLineageSynced = conditions.ConditionTypeLineageSynced // ReasonLineageControllerAbsent is set when the reconciler initialises diff --git a/api/v1alpha1/machineconfigbackup_types.go b/api/v1alpha1/machineconfigbackup_types.go index 3d70284..f1115f6 100644 --- a/api/v1alpha1/machineconfigbackup_types.go +++ b/api/v1alpha1/machineconfigbackup_types.go @@ -4,7 +4,7 @@ import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "github.com/ontai-dev/seam-core/pkg/lineage" + "github.com/ontai-dev/seam/pkg/lineage" ) // Condition type and reason constants for TalosMachineConfigBackup. 
diff --git a/api/v1alpha1/machineconfigrestore_types.go b/api/v1alpha1/machineconfigrestore_types.go index 7586045..41bae96 100644 --- a/api/v1alpha1/machineconfigrestore_types.go +++ b/api/v1alpha1/machineconfigrestore_types.go @@ -4,7 +4,7 @@ import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "github.com/ontai-dev/seam-core/pkg/lineage" + "github.com/ontai-dev/seam/pkg/lineage" ) // Condition type and reason constants for TalosMachineConfigRestore. diff --git a/api/v1alpha1/maintenancebundle_types.go b/api/v1alpha1/maintenancebundle_types.go index 5b9f7bf..2220601 100644 --- a/api/v1alpha1/maintenancebundle_types.go +++ b/api/v1alpha1/maintenancebundle_types.go @@ -12,7 +12,7 @@ import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "github.com/ontai-dev/seam-core/pkg/lineage" + "github.com/ontai-dev/seam/pkg/lineage" ) // MaintenanceBundleOperation declares the maintenance operation type. diff --git a/api/v1alpha1/nodemaintenance_types.go b/api/v1alpha1/nodemaintenance_types.go index a2d29f2..c93e552 100644 --- a/api/v1alpha1/nodemaintenance_types.go +++ b/api/v1alpha1/nodemaintenance_types.go @@ -3,7 +3,7 @@ package v1alpha1 import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "github.com/ontai-dev/seam-core/pkg/lineage" + "github.com/ontai-dev/seam/pkg/lineage" ) // NodeMaintenanceOperation declares the node-level operation to perform. diff --git a/api/v1alpha1/nodeoperation_types.go b/api/v1alpha1/nodeoperation_types.go index a8a6dca..23d39ac 100644 --- a/api/v1alpha1/nodeoperation_types.go +++ b/api/v1alpha1/nodeoperation_types.go @@ -3,7 +3,7 @@ package v1alpha1 import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "github.com/ontai-dev/seam-core/pkg/lineage" + "github.com/ontai-dev/seam/pkg/lineage" ) // NodeOperationType declares the node lifecycle operation to perform. 
diff --git a/api/v1alpha1/pkirotation_types.go b/api/v1alpha1/pkirotation_types.go index 2af3829..2f3f87e 100644 --- a/api/v1alpha1/pkirotation_types.go +++ b/api/v1alpha1/pkirotation_types.go @@ -3,7 +3,7 @@ package v1alpha1 import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "github.com/ontai-dev/seam-core/pkg/lineage" + "github.com/ontai-dev/seam/pkg/lineage" ) // Condition type and reason constants for PKIRotation. diff --git a/api/v1alpha1/taloscluster_types.go b/api/v1alpha1/taloscluster_types.go index e0271b4..1bc7b2c 100644 --- a/api/v1alpha1/taloscluster_types.go +++ b/api/v1alpha1/taloscluster_types.go @@ -1,60 +1,60 @@ package v1alpha1 -// TalosCluster types are owned by seam-core (infrastructure.ontai.dev/v1alpha1). +// TalosCluster types are now owned by platform (seam.ontai.dev/v1alpha1). // Platform reconcilers reference these aliases; all field types and constants resolve -// to the seam-core definitions. T-2B-8. +// to the platform/api/seam/v1alpha1 definitions. MIGRATION-3.1. import ( - seamcorev1alpha1 "github.com/ontai-dev/seam-core/api/v1alpha1" - "github.com/ontai-dev/seam-core/pkg/conditions" + seamv1alpha1 "github.com/ontai-dev/platform/api/seam/v1alpha1" + "github.com/ontai-dev/seam/pkg/conditions" ) -// Type aliases -- struct definitions moved to seam-core. These preserve the -// platformv1alpha1 package interface for all reconcilers without source edits. +// Type aliases -- struct definitions live in platform/api/seam/v1alpha1. +// These preserve the platformv1alpha1 package interface for all reconcilers without source edits. 
type ( - TalosCluster = seamcorev1alpha1.InfrastructureTalosCluster - TalosClusterList = seamcorev1alpha1.InfrastructureTalosClusterList - TalosClusterSpec = seamcorev1alpha1.InfrastructureTalosClusterSpec - TalosClusterStatus = seamcorev1alpha1.InfrastructureTalosClusterStatus - TalosClusterMode = seamcorev1alpha1.InfrastructureTalosClusterMode - TalosClusterRole = seamcorev1alpha1.InfrastructureTalosClusterRole - TalosClusterOrigin = seamcorev1alpha1.InfrastructureTalosClusterOrigin - InfrastructureProvider = seamcorev1alpha1.InfrastructureProvider - CAPIConfig = seamcorev1alpha1.InfrastructureCAPIConfig - CAPIControlPlaneConfig = seamcorev1alpha1.InfrastructureCAPIControlPlaneConfig - CAPIWorkerPool = seamcorev1alpha1.InfrastructureCAPIWorkerPool - CAPICiliumPackRef = seamcorev1alpha1.InfrastructureCAPICiliumPackRef - LocalObjectRef = seamcorev1alpha1.InfrastructureLocalObjectRef + TalosCluster = seamv1alpha1.TalosCluster + TalosClusterList = seamv1alpha1.TalosClusterList + TalosClusterSpec = seamv1alpha1.TalosClusterSpec + TalosClusterStatus = seamv1alpha1.TalosClusterStatus + TalosClusterMode = seamv1alpha1.TalosClusterMode + TalosClusterRole = seamv1alpha1.TalosClusterRole + TalosClusterOrigin = seamv1alpha1.TalosClusterOrigin + InfrastructureProvider = seamv1alpha1.InfrastructureProvider + CAPIConfig = seamv1alpha1.CAPIConfig + CAPIControlPlaneConfig = seamv1alpha1.CAPIControlPlaneConfig + CAPIWorkerPool = seamv1alpha1.CAPIWorkerPool + CAPICiliumPackRef = seamv1alpha1.CAPICiliumPackRef + LocalObjectRef = seamv1alpha1.LocalObjectRef ) // Mode constants. const ( - TalosClusterModeBootstrap = seamcorev1alpha1.InfrastructureTalosClusterModeBootstrap - TalosClusterModeImport = seamcorev1alpha1.InfrastructureTalosClusterModeImport + TalosClusterModeBootstrap = seamv1alpha1.TalosClusterModeBootstrap + TalosClusterModeImport = seamv1alpha1.TalosClusterModeImport ) // Role constants. 
const ( - TalosClusterRoleManagement = seamcorev1alpha1.InfrastructureTalosClusterRoleManagement - TalosClusterRoleTenant = seamcorev1alpha1.InfrastructureTalosClusterRoleTenant + TalosClusterRoleManagement = seamv1alpha1.TalosClusterRoleManagement + TalosClusterRoleTenant = seamv1alpha1.TalosClusterRoleTenant ) // Origin constants. const ( - TalosClusterOriginBootstrapped = seamcorev1alpha1.InfrastructureTalosClusterOriginBootstrapped - TalosClusterOriginImported = seamcorev1alpha1.InfrastructureTalosClusterOriginImported + TalosClusterOriginBootstrapped = seamv1alpha1.TalosClusterOriginBootstrapped + TalosClusterOriginImported = seamv1alpha1.TalosClusterOriginImported ) // InfrastructureProvider constants. const ( - InfrastructureProviderNative = seamcorev1alpha1.InfrastructureProviderNative - InfrastructureProviderCAPI = seamcorev1alpha1.InfrastructureProviderCAPI - InfrastructureProviderScreen = seamcorev1alpha1.InfrastructureProviderScreen + InfrastructureProviderNative = seamv1alpha1.InfrastructureProviderNative + InfrastructureProviderCAPI = seamv1alpha1.InfrastructureProviderCAPI + InfrastructureProviderScreen = seamv1alpha1.InfrastructureProviderScreen ) -// Condition type constants for TalosCluster -- re-exported from seam-core/pkg/conditions. +// Condition type constants for TalosCluster -- re-exported from seam/pkg/conditions. // Platform reconcilers reference these via the platformv1alpha1 alias; new code should -// import github.com/ontai-dev/seam-core/pkg/conditions directly. +// import github.com/ontai-dev/seam/pkg/conditions directly. const ( ConditionTypeReady = conditions.ConditionTypeReady ConditionTypeBootstrapping = conditions.ConditionTypeBootstrapping @@ -75,28 +75,28 @@ const ( -// Reason constants for TalosCluster -- re-exported from seam-core/pkg/conditions. +// Reason constants for TalosCluster -- re-exported from seam/pkg/conditions. 
const ( - ReasonBootstrapJobSubmitted = conditions.ReasonBootstrapJobSubmitted - ReasonBootstrapJobComplete = conditions.ReasonBootstrapJobComplete - ReasonBootstrapJobFailed = conditions.ReasonBootstrapJobFailed - ReasonCAPIObjectsCreated = conditions.ReasonCAPIObjectsCreated - ReasonCAPIClusterRunning = conditions.ReasonCAPIClusterRunning - ReasonCiliumPackPending = conditions.ReasonCiliumPackPending - ReasonCiliumPackReady = conditions.ReasonCiliumPackReady - ReasonClusterReady = conditions.ReasonClusterReady - ReasonImportComplete = conditions.ReasonImportComplete - ReasonDegraded = conditions.ReasonDegraded - ReasonControlPlaneNodeUnreachable = conditions.ReasonControlPlaneNodeUnreachable - ReasonWorkerNodeUnreachable = conditions.ReasonWorkerNodeUnreachable + ReasonBootstrapJobSubmitted = conditions.ReasonBootstrapJobSubmitted + ReasonBootstrapJobComplete = conditions.ReasonBootstrapJobComplete + ReasonBootstrapJobFailed = conditions.ReasonBootstrapJobFailed + ReasonCAPIObjectsCreated = conditions.ReasonCAPIObjectsCreated + ReasonCAPIClusterRunning = conditions.ReasonCAPIClusterRunning + ReasonCiliumPackPending = conditions.ReasonCiliumPackPending + ReasonCiliumPackReady = conditions.ReasonCiliumPackReady + ReasonClusterReady = conditions.ReasonClusterReady + ReasonImportComplete = conditions.ReasonImportComplete + ReasonDegraded = conditions.ReasonDegraded + ReasonControlPlaneNodeUnreachable = conditions.ReasonControlPlaneNodeUnreachable + ReasonWorkerNodeUnreachable = conditions.ReasonWorkerNodeUnreachable ReasonConductorBootstrapComplete = conditions.ReasonConductorBootstrapComplete ReasonConductorBootstrapPending = conditions.ReasonConductorBootstrapPending - ReasonScreenNotImplemented = conditions.ReasonScreenNotImplemented - ReasonTalosVersionRequired = conditions.ReasonTalosVersionRequired - ReasonTalosConfigSecretAbsent = conditions.ReasonTalosConfigSecretAbsent - ReasonVersionUpgradeRequested = conditions.ReasonVersionUpgradeRequested - 
ReasonVersionUpgradeSubmitted = conditions.ReasonVersionUpgradeSubmitted - ReasonVersionUpgradeComplete = conditions.ReasonVersionUpgradeComplete - ReasonVersionRegressionAttempted = conditions.ReasonVersionRegressionAttempted - ReasonHardeningApplied = conditions.ReasonHardeningApplied - ReasonHardeningPending = conditions.ReasonHardeningPending - ReasonHardeningProfileNotValid = conditions.ReasonHardeningProfileNotValid + ReasonScreenNotImplemented = conditions.ReasonScreenNotImplemented + ReasonTalosVersionRequired = conditions.ReasonTalosVersionRequired + ReasonTalosConfigSecretAbsent = conditions.ReasonTalosConfigSecretAbsent + ReasonVersionUpgradeRequested = conditions.ReasonVersionUpgradeRequested + ReasonVersionUpgradeSubmitted = conditions.ReasonVersionUpgradeSubmitted + ReasonVersionUpgradeComplete = conditions.ReasonVersionUpgradeComplete + ReasonVersionRegressionAttempted = conditions.ReasonVersionRegressionAttempted + ReasonHardeningApplied = conditions.ReasonHardeningApplied + ReasonHardeningPending = conditions.ReasonHardeningPending + ReasonHardeningProfileNotValid = conditions.ReasonHardeningProfileNotValid ) diff --git a/api/v1alpha1/upgradepolicy_types.go b/api/v1alpha1/upgradepolicy_types.go index 5339428..ca113b3 100644 --- a/api/v1alpha1/upgradepolicy_types.go +++ b/api/v1alpha1/upgradepolicy_types.go @@ -3,7 +3,7 @@ package v1alpha1 import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "github.com/ontai-dev/seam-core/pkg/lineage" + "github.com/ontai-dev/seam/pkg/lineage" ) // UpgradeType declares the type of upgrade to perform. 
diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 68010a1..dc5244f 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -5,8 +5,7 @@ package v1alpha1 import ( - apiv1alpha1 "github.com/ontai-dev/seam-core/api/v1alpha1" - "github.com/ontai-dev/seam-core/pkg/lineage" + "github.com/ontai-dev/seam/pkg/lineage" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" @@ -656,7 +655,7 @@ func (in *NodeMaintenanceSpec) DeepCopyInto(out *NodeMaintenanceSpec) { } if in.HardeningProfileRef != nil { in, out := &in.HardeningProfileRef, &out.HardeningProfileRef - *out = new(apiv1alpha1.InfrastructureLocalObjectRef) + *out = new(LocalObjectRef) **out = **in } if in.Lineage != nil { diff --git a/cmd/platform/main.go b/cmd/platform/main.go index fbcac81..4799a37 100644 --- a/cmd/platform/main.go +++ b/cmd/platform/main.go @@ -19,7 +19,8 @@ import ( infrav1alpha1 "github.com/ontai-dev/platform/api/infrastructure/v1alpha1" platformv1alpha1 "github.com/ontai-dev/platform/api/v1alpha1" - seamcorev1alpha1 "github.com/ontai-dev/seam-core/api/v1alpha1" + seamplatformv1alpha1 "github.com/ontai-dev/platform/api/seam/v1alpha1" + seamcorev1alpha1 "github.com/ontai-dev/seam/api/v1alpha1" "github.com/ontai-dev/platform/internal/controller" ) @@ -29,8 +30,9 @@ func init() { utilruntime.Must(clientgoscheme.AddToScheme(scheme)) utilruntime.Must(platformv1alpha1.AddToScheme(scheme)) utilruntime.Must(infrav1alpha1.AddToScheme(scheme)) - // TalosCluster and RunnerConfig types are now owned by seam-core. - // infrastructure.ontai.dev/v1alpha1. T-2B-8. + // TalosCluster and ClusterLog are owned by platform (seam.ontai.dev/v1alpha1). MIGRATION-3.1, MIGRATION-3.2. + utilruntime.Must(seamplatformv1alpha1.AddToScheme(scheme)) + // RunnerConfig and DriftSignal remain in the seam module (github.com/ontai-dev/seam/api/v1alpha1), imported under the retained seamcorev1alpha1 alias. 
utilruntime.Must(seamcorev1alpha1.AddToScheme(scheme)) } diff --git a/config/crd/platform.ontai.dev_talosetcdbackupschedules.yaml b/config/crd/platform.ontai.dev_talosetcdbackupschedules.yaml new file mode 100644 index 0000000..4c918ab --- /dev/null +++ b/config/crd/platform.ontai.dev_talosetcdbackupschedules.yaml @@ -0,0 +1,221 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.1 + name: talosetcdbackupschedules.platform.ontai.dev +spec: + group: platform.ontai.dev + names: + kind: TalosEtcdBackupSchedule + listKind: TalosEtcdBackupScheduleList + plural: talosetcdbackupschedules + shortNames: + - etcdbs + singular: talosetcdbackupschedule + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .spec.clusterRef.name + name: Cluster + type: string + - jsonPath: .spec.schedule + name: Schedule + type: string + - jsonPath: .status.nextRunAt + name: NextRun + type: date + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + TalosEtcdBackupSchedule creates EtcdMaintenance CRs with operation=backup on a + repeating interval. The schedule field accepts Go duration strings (e.g. "24h"). + platform-schema.md §10. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: TalosEtcdBackupScheduleSpec defines the desired state of + TalosEtcdBackupSchedule. + properties: + clusterRef: + description: ClusterRef references the TalosCluster to back up on + schedule. + properties: + name: + description: Name is the object name. + type: string + namespace: + description: Namespace is the object namespace. May be empty for + cluster-scoped objects. + type: string + required: + - name + type: object + etcdBackupS3SecretRef: + description: |- + EtcdBackupS3SecretRef references a Secret containing S3 backup credentials. + Falls back to seam-etcd-backup-config in seam-system when absent. + platform-schema.md §10. + properties: + name: + description: name is unique within a namespace to reference a + secret resource. + type: string + namespace: + description: namespace defines the space within which the secret + name must be unique. + type: string + type: object + x-kubernetes-map-type: atomic + s3Destination: + description: S3Destination is the S3 location to write etcd snapshots + to. + properties: + bucket: + description: Bucket is the S3 bucket name. + type: string + credentialsSecretRef: + description: |- + CredentialsSecretRef references the Secret containing S3 credentials. + The Secret must be in ont-system. + properties: + name: + description: Name is the Secret name. + type: string + namespace: + description: |- + Namespace is the Secret namespace. When empty, the consuming object's + own namespace is used unless the schema specifies otherwise. + type: string + required: + - name + type: object + key: + description: Key is the S3 object key path. + type: string + required: + - bucket + - credentialsSecretRef + - key + type: object + schedule: + description: |- + Schedule is the backup interval as a Go duration string (e.g., "24h", "6h"). 
+ The reconciler creates a new EtcdMaintenance CR with operation=backup each time + the interval elapses. + type: string + required: + - clusterRef + - s3Destination + - schedule + type: object + status: + description: TalosEtcdBackupScheduleStatus defines the observed state + of TalosEtcdBackupSchedule. + properties: + conditions: + description: Conditions is the list of status conditions. + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. 
+ enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + lastBackupName: + description: LastBackupName is the name of the most recently created + EtcdMaintenance CR. + type: string + lastRunAt: + description: LastRunAt is the time the most recent EtcdMaintenance + CR was created. + format: date-time + type: string + nextRunAt: + description: NextRunAt is the time the next EtcdMaintenance CR will + be created. + format: date-time + type: string + observedGeneration: + description: ObservedGeneration is the generation of the spec last + reconciled. + format: int64 + type: integer + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/config/crd/platform.ontai.dev_talosmachineconfigbackups.yaml b/config/crd/platform.ontai.dev_talosmachineconfigbackups.yaml new file mode 100644 index 0000000..04f5d3f --- /dev/null +++ b/config/crd/platform.ontai.dev_talosmachineconfigbackups.yaml @@ -0,0 +1,283 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.1 + name: talosmachineconfigbackups.platform.ontai.dev +spec: + group: platform.ontai.dev + names: + kind: TalosMachineConfigBackup + listKind: TalosMachineConfigBackupList + plural: talosmachineconfigbackups + shortNames: + - mcb + singular: talosmachineconfigbackup + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .spec.clusterRef.name + name: Cluster + type: string + - jsonPath: .status.conditions[?(@.type=="Ready")].status + name: Ready + 
type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + TalosMachineConfigBackup triggers a machine config backup for all nodes of a target + cluster. The Conductor executor reads each node's running config via GetMachineConfig + and uploads it to S3 at {cluster}/machineconfigs/{TIMESTAMP}/{hostname}.yaml. + Named Conductor capability: machineconfig-backup. platform-schema.md §11. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: TalosMachineConfigBackupSpec defines the desired state of + TalosMachineConfigBackup. + properties: + clusterRef: + description: ClusterRef references the TalosCluster whose node machine + configs are to be backed up. + properties: + name: + description: Name is the object name. + type: string + namespace: + description: Namespace is the object namespace. May be empty for + cluster-scoped objects. + type: string + required: + - name + type: object + lineage: + description: |- + Lineage is the sealed causal chain record for this root declaration. + Authored once at object creation time and immutable thereafter. + seam-core-schema.md §5, CLAUDE.md §14 Decision 1. 
+ properties: + creatingOperator: + description: |- + CreatingOperator identifies the Seam Operator that created this object. + This is a structured identity carrying the operator name and its deployed + version at creation time. + properties: + name: + description: |- + Name is the canonical name of the Seam Operator (e.g., platform, guardian, + wrapper, conductor). + type: string + version: + description: |- + Version is the deployed version of the operator at the time the object was + created (e.g., v1.26.5-r3). This allows audit tooling to correlate objects + with the operator version that produced them. + type: string + required: + - name + - version + type: object + creationRationale: + description: |- + CreationRationale is the reason this object was created, drawn from the + Seam Core controlled vocabulary defined in rationale.go. It is not a + free-text field. + enum: + - ClusterProvision + - ClusterDecommission + - SecurityEnforcement + - PackExecution + - VirtualizationFulfillment + - ConductorAssignment + - VortexBinding + type: string + rootGenerationAtCreation: + description: |- + RootGenerationAtCreation is the metadata.generation of the root declaration + at the time this object was created. Together with RootUID, it provides a + complete temporal anchor for the derivation record. + format: int64 + type: integer + rootKind: + description: |- + RootKind is the kind of the root declaration that caused this object to + exist (e.g., TalosCluster, PackExecution, RBACPolicy). + type: string + rootName: + description: RootName is the name of the root declaration. + type: string + rootNamespace: + description: RootNamespace is the namespace of the root declaration. + type: string + rootUID: + description: |- + RootUID is the UID of the root declaration at the time this object was + created. Used to verify that no root declaration replacement has occurred. 
+ type: string + required: + - creatingOperator + - creationRationale + - rootGenerationAtCreation + - rootKind + - rootName + - rootNamespace + - rootUID + type: object + s3BackupSecretRef: + description: |- + S3BackupSecretRef references a Secret containing S3 backup credentials for this + operation. Takes precedence over the cluster-wide seam-etcd-backup-config Secret + in seam-system. platform-schema.md §10. + properties: + name: + description: name is unique within a namespace to reference a + secret resource. + type: string + namespace: + description: namespace defines the space within which the secret + name must be unique. + type: string + type: object + x-kubernetes-map-type: atomic + s3Destination: + description: |- + S3Destination is the S3 location to write node machine configs to. + The bucket is required. The key prefix is auto-generated as: + {cluster}/machineconfigs/{TIMESTAMP}/{hostname}.yaml + properties: + bucket: + description: Bucket is the S3 bucket name. + type: string + credentialsSecretRef: + description: |- + CredentialsSecretRef references the Secret containing S3 credentials. + The Secret must be in ont-system. + properties: + name: + description: Name is the Secret name. + type: string + namespace: + description: |- + Namespace is the Secret namespace. When empty, the consuming object's + own namespace is used unless the schema specifies otherwise. + type: string + required: + - name + type: object + key: + description: Key is the S3 object key path. + type: string + required: + - bucket + - credentialsSecretRef + - key + type: object + required: + - clusterRef + - s3Destination + type: object + status: + description: TalosMachineConfigBackupStatus defines the observed state + of TalosMachineConfigBackup. + properties: + conditions: + description: |- + Conditions is the list of status conditions for this TalosMachineConfigBackup. + Condition types: Ready, Running, Degraded, S3DestinationAbsent, LineageSynced. 
+ items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. 
+ maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + jobName: + description: JobName is the name of the most recently submitted Conductor + executor Job. + type: string + observedGeneration: + description: ObservedGeneration is the generation of the spec last + reconciled. + format: int64 + type: integer + operationResult: + description: OperationResult is the message from the Conductor OperationResult + ConfigMap. + type: string + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/config/crd/platform.ontai.dev_talosmachineconfigbackupschedules.yaml b/config/crd/platform.ontai.dev_talosmachineconfigbackupschedules.yaml new file mode 100644 index 0000000..c438a6e --- /dev/null +++ b/config/crd/platform.ontai.dev_talosmachineconfigbackupschedules.yaml @@ -0,0 +1,219 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.1 + name: talosmachineconfigbackupschedules.platform.ontai.dev +spec: + group: platform.ontai.dev + names: + kind: TalosMachineConfigBackupSchedule + listKind: TalosMachineConfigBackupScheduleList + plural: talosmachineconfigbackupschedules + shortNames: + - mcbs + singular: talosmachineconfigbackupschedule + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .spec.clusterRef.name + name: Cluster + type: string + - jsonPath: .spec.schedule + name: Schedule + type: string + - jsonPath: .status.nextRunAt + name: NextRun + type: date + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + TalosMachineConfigBackupSchedule creates 
TalosMachineConfigBackup CRs on a repeating + interval. The schedule field accepts Go duration strings (e.g. "24h"). + platform-schema.md §11. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: TalosMachineConfigBackupScheduleSpec defines the desired + state of TalosMachineConfigBackupSchedule. + properties: + clusterRef: + description: ClusterRef references the TalosCluster to back up on + schedule. + properties: + name: + description: Name is the object name. + type: string + namespace: + description: Namespace is the object namespace. May be empty for + cluster-scoped objects. + type: string + required: + - name + type: object + s3BackupSecretRef: + description: |- + S3BackupSecretRef references a Secret containing S3 backup credentials. + Falls back to seam-etcd-backup-config in seam-system when absent. + platform-schema.md §10. + properties: + name: + description: name is unique within a namespace to reference a + secret resource. + type: string + namespace: + description: namespace defines the space within which the secret + name must be unique. + type: string + type: object + x-kubernetes-map-type: atomic + s3Destination: + description: |- + S3Destination is the S3 location to write node machine configs to. + The bucket is required. 
+ properties: + bucket: + description: Bucket is the S3 bucket name. + type: string + credentialsSecretRef: + description: |- + CredentialsSecretRef references the Secret containing S3 credentials. + The Secret must be in ont-system. + properties: + name: + description: Name is the Secret name. + type: string + namespace: + description: |- + Namespace is the Secret namespace. When empty, the consuming object's + own namespace is used unless the schema specifies otherwise. + type: string + required: + - name + type: object + key: + description: Key is the S3 object key path. + type: string + required: + - bucket + - credentialsSecretRef + - key + type: object + schedule: + description: |- + Schedule is the backup interval as a Go duration string (e.g., "24h", "6h", "1h"). + The reconciler creates a new TalosMachineConfigBackup CR each time the interval elapses. + type: string + required: + - clusterRef + - s3Destination + - schedule + type: object + status: + description: TalosMachineConfigBackupScheduleStatus defines the observed + state of TalosMachineConfigBackupSchedule. + properties: + conditions: + description: Conditions is the list of status conditions. + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. 
+ For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + lastBackupName: + description: LastBackupName is the name of the most recently created + TalosMachineConfigBackup CR. + type: string + lastRunAt: + description: LastRunAt is the time the most recent backup CR was created. + format: date-time + type: string + nextRunAt: + description: NextRunAt is the time the next backup CR will be created. + format: date-time + type: string + observedGeneration: + description: ObservedGeneration is the generation of the spec last + reconciled. 
+ format: int64 + type: integer + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/config/crd/platform.ontai.dev_talosmachineconfigrestores.yaml b/config/crd/platform.ontai.dev_talosmachineconfigrestores.yaml new file mode 100644 index 0000000..620b61d --- /dev/null +++ b/config/crd/platform.ontai.dev_talosmachineconfigrestores.yaml @@ -0,0 +1,285 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.1 + name: talosmachineconfigrestores.platform.ontai.dev +spec: + group: platform.ontai.dev + names: + kind: TalosMachineConfigRestore + listKind: TalosMachineConfigRestoreList + plural: talosmachineconfigrestores + shortNames: + - mcr + singular: talosmachineconfigrestore + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .spec.clusterRef.name + name: Cluster + type: string + - jsonPath: .spec.backupTimestamp + name: Timestamp + type: string + - jsonPath: .status.phase + name: Phase + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + TalosMachineConfigRestore triggers a machine config restore for target nodes of a + cluster. The Conductor executor downloads each node's config from S3 at + {cluster}/machineconfigs/{backupTimestamp}/{hostname}.yaml and applies it via + ApplyConfiguration. Named Conductor capability: machineconfig-restore. + platform-schema.md §11. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: TalosMachineConfigRestoreSpec defines the desired state of + TalosMachineConfigRestore. + properties: + backupTimestamp: + description: |- + BackupTimestamp identifies which backup to restore from. Must match the + timestamp component of the S3 path written by a prior machineconfig-backup + operation: {cluster}/machineconfigs/{backupTimestamp}/{hostname}.yaml. + Format: 20060102T150405Z (UTC). + type: string + clusterRef: + description: |- + ClusterRef references the TalosCluster whose nodes will have their machine + config restored. + properties: + name: + description: Name is the object name. + type: string + namespace: + description: Namespace is the object namespace. May be empty for + cluster-scoped objects. + type: string + required: + - name + type: object + lineage: + description: Lineage is the sealed causal chain record for this root + declaration. + properties: + creatingOperator: + description: |- + CreatingOperator identifies the Seam Operator that created this object. + This is a structured identity carrying the operator name and its deployed + version at creation time. + properties: + name: + description: |- + Name is the canonical name of the Seam Operator (e.g., platform, guardian, + wrapper, conductor). + type: string + version: + description: |- + Version is the deployed version of the operator at the time the object was + created (e.g., v1.26.5-r3). 
This allows audit tooling to correlate objects + with the operator version that produced them. + type: string + required: + - name + - version + type: object + creationRationale: + description: |- + CreationRationale is the reason this object was created, drawn from the + Seam Core controlled vocabulary defined in rationale.go. It is not a + free-text field. + enum: + - ClusterProvision + - ClusterDecommission + - SecurityEnforcement + - PackExecution + - VirtualizationFulfillment + - ConductorAssignment + - VortexBinding + type: string + rootGenerationAtCreation: + description: |- + RootGenerationAtCreation is the metadata.generation of the root declaration + at the time this object was created. Together with RootUID, it provides a + complete temporal anchor for the derivation record. + format: int64 + type: integer + rootKind: + description: |- + RootKind is the kind of the root declaration that caused this object to + exist (e.g., TalosCluster, PackExecution, RBACPolicy). + type: string + rootName: + description: RootName is the name of the root declaration. + type: string + rootNamespace: + description: RootNamespace is the namespace of the root declaration. + type: string + rootUID: + description: |- + RootUID is the UID of the root declaration at the time this object was + created. Used to verify that no root declaration replacement has occurred. + type: string + required: + - creatingOperator + - creationRationale + - rootGenerationAtCreation + - rootKind + - rootName + - rootNamespace + - rootUID + type: object + s3BackupSecretRef: + description: |- + S3BackupSecretRef references a Secret containing S3 credentials. + Falls back to seam-etcd-backup-config in seam-system when absent. + platform-schema.md §10. + properties: + name: + description: name is unique within a namespace to reference a + secret resource. + type: string + namespace: + description: namespace defines the space within which the secret + name must be unique. 
+ type: string + type: object + x-kubernetes-map-type: atomic + s3SourceBucket: + description: |- + S3SourceBucket is the S3 bucket containing the backup objects. Must match + the bucket used during the original machineconfig-backup operation. + type: string + targetNodes: + description: |- + TargetNodes is the optional list of node hostnames to restore. When empty + all nodes in the cluster are restored. When set only the listed hostnames + are restored. + items: + type: string + type: array + required: + - backupTimestamp + - clusterRef + - s3SourceBucket + type: object + status: + description: TalosMachineConfigRestoreStatus defines the observed state + of TalosMachineConfigRestore. + properties: + conditions: + description: |- + Conditions is the list of status conditions for this TalosMachineConfigRestore. + Condition types: Ready, Running, Degraded, S3SourceAbsent, LineageSynced. + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. 
+ format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + jobName: + description: JobName is the name of the most recently submitted Conductor + executor Job. + type: string + observedGeneration: + description: ObservedGeneration is the generation of the spec last + reconciled. + format: int64 + type: integer + operationResult: + description: OperationResult is the message from the Conductor OperationResult + ConfigMap. + type: string + phase: + description: |- + Phase is the current phase of the restore operation. + One of: Pending, Running, Succeeded, Failed, PartiallyFailed. + type: string + restoredNodes: + description: RestoredNodes is the list of node hostnames successfully + restored. 
+ items: + type: string + type: array + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/config/crd/seam.ontai.dev_clusterlogs.yaml b/config/crd/seam.ontai.dev_clusterlogs.yaml new file mode 100644 index 0000000..856e68c --- /dev/null +++ b/config/crd/seam.ontai.dev_clusterlogs.yaml @@ -0,0 +1,194 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.1 + name: clusterlogs.seam.ontai.dev +spec: + group: seam.ontai.dev + names: + kind: ClusterLog + listKind: ClusterLogList + plural: clusterlogs + shortNames: + - clog + singular: clusterlog + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .spec.clusterRef + name: Cluster + type: string + - jsonPath: .spec.talosVersion + name: TalosVersion + type: string + - jsonPath: .spec.revision + name: Revision + type: integer + - jsonPath: .spec.operationCount + name: Ops + type: integer + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + ClusterLog accumulates the day-2 operation history for one cluster. One CR per + cluster, created when the platform operator provisions the cluster tenant namespace. + Operations are appended by the Conductor execute-mode Job. On talosVersion upgrade, + the current revision is archived to the GraphQuery DB and a new revision epoch begins. + + Named by the cluster name. Lives in seam-tenant-{clusterRef}. + conductor-schema.md §8. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: |- + ClusterLogSpec is the accumulated day-2 operation history for one cluster, + scoped to the current talosVersion revision. + + One CR per cluster. Created by the platform operator when the cluster tenant + namespace is provisioned. Named by the cluster name. Lives in seam-tenant-{clusterRef}. + + When the cluster talosVersion is upgraded, the current revision is archived to + the GraphQuery DB and a new revision begins: Revision increments, TalosVersion + is updated, and Operations is cleared. + + conductor-schema.md §8. + properties: + clusterRef: + description: ClusterRef is the name of the TalosCluster this log accumulates. + type: string + operationCount: + description: |- + OperationCount is the count of records in Operations for the current revision. + Maintained by the writer alongside Operations so kubectl can display it + as an integer column. Updated atomically with every Operations write. + json tag intentionally omits omitempty so the writer always serializes 0. + format: int64 + type: integer + operations: + additionalProperties: + description: |- + OperationRecord is a single day-2 operation record within one + talosVersion revision. Multiple records accumulate in the parent ClusterLog as + operations are performed against the cluster. + properties: + capability: + description: Capability is the conductor capability that produced + this record. 
+ type: string + completedAt: + description: CompletedAt is the time the capability execution + finished. + format: date-time + type: string + failureReason: + description: FailureReason is populated when Status is Failed. + Nil on success. + properties: + category: + description: Category classifies the failure domain. + enum: + - ValidationFailure + - CapabilityUnavailable + - ExecutionFailure + - ExternalDependencyFailure + - InvariantViolation + type: string + reason: + description: Reason is a human-readable description of the + failure. + type: string + required: + - category + - reason + type: object + jobRef: + description: |- + JobRef is the Kubernetes Job name that produced this record. + The platform reconciler uses this to correlate the record with the Job it submitted. + type: string + message: + description: Message provides a human-readable summary of the + outcome. + type: string + startedAt: + description: StartedAt is the time the capability execution + began. + format: date-time + type: string + status: + allOf: + - enum: + - Succeeded + - Failed + - enum: + - Succeeded + - Failed + description: Status is the terminal status of the capability + execution. + type: string + required: + - capability + - jobRef + - status + type: object + description: |- + Operations is the map of day-2 operation records for the current revision, + keyed by Kubernetes Job name. Map keying enables O(1) lookup by the platform + reconciler and clean serialization when archiving the revision to the GraphQuery DB. + type: object + revision: + default: 1 + description: |- + Revision is the monotonic revision counter. Starts at 1. Increments on each + talosVersion upgrade. Each revision holds the operations performed during that + version epoch. Archived revisions are stored in the GraphQuery DB. + format: int64 + type: integer + talosVersion: + description: |- + TalosVersion is the cluster talosVersion for the current active revision. 
+ Matches TalosCluster.spec.talosVersion at the time this revision began. + type: string + required: + - clusterRef + - revision + - talosVersion + type: object + status: + description: |- + ClusterLogStatus is the observed state. + Currently carries only observedGeneration; conditions are reserved for future use. + properties: + observedGeneration: + description: ObservedGeneration is the last generation observed by + any consumer. + format: int64 + type: integer + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/config/crd/seam.ontai.dev_talosclusters.yaml b/config/crd/seam.ontai.dev_talosclusters.yaml new file mode 100644 index 0000000..209ea4e --- /dev/null +++ b/config/crd/seam.ontai.dev_talosclusters.yaml @@ -0,0 +1,404 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.1 + labels: + infrastructure.ontai.dev/lineage-root: "true" + name: talosclusters.seam.ontai.dev +spec: + group: seam.ontai.dev + names: + kind: TalosCluster + listKind: TalosClusterList + plural: talosclusters + shortNames: + - tc + singular: taloscluster + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .spec.mode + name: Mode + type: string + - jsonPath: .spec.role + name: Role + type: string + - jsonPath: .status.conditions[?(@.type=="Ready")].status + name: Ready + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + TalosCluster is the platform CRD for a Talos cluster under Seam governance. + platform-schema.md §3. Decision H. seam.ontai.dev/v1alpha1. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values.
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: |- + TalosClusterSpec is the declared desired state of a TalosCluster. + platform-schema.md §3. + properties: + capi: + description: CAPI holds CAPI integration settings. When absent, the + cluster uses direct bootstrap. + properties: + ciliumPackRef: + description: |- + CiliumPackRef references the cluster-specific Cilium PackDelivery. + Applied as the first pack after the CAPI cluster reaches Running state. + platform-schema.md §2.3. + properties: + name: + description: Name is the PackDelivery CR name for the Cilium + pack. + type: string + version: + description: Version is the PackDelivery version string. + type: string + required: + - name + - version + type: object + controlPlane: + description: ControlPlane holds control plane configuration. Required + when Enabled=true. + properties: + replicas: + description: Replicas is the desired number of control plane + nodes. + format: int32 + type: integer + type: object + enabled: + description: Enabled determines whether this TalosCluster uses the CAPI path. + type: boolean + kubernetesVersion: + description: KubernetesVersion is the Kubernetes version for TalosControlPlane. + type: string + talosVersion: + description: |- + TalosVersion is the Talos version to use for TalosConfigTemplate generation. + type: string + workers: + description: Workers is the list of worker node pools. + items: + description: CAPIWorkerPool declares a worker node pool for a CAPI-managed target cluster.
+ properties: + name: + description: Name is the pool identifier. Used as the MachineDeployment + name suffix. + type: string + replicas: + description: Replicas is the desired number of worker nodes + in this pool. + format: int32 + type: integer + seamInfrastructureMachineNames: + description: |- + SeamInfrastructureMachineNames lists the SeamInfrastructureMachine CR names + pre-provisioned for this pool. One per node. + items: + type: string + type: array + required: + - name + type: object + type: array + required: + - enabled + type: object + clusterEndpoint: + description: ClusterEndpoint is the cluster VIP or primary API endpoint IP. + type: string + hardeningProfileRef: + description: |- + HardeningProfileRef references a HardeningProfile CR to apply at bootstrap. + platform-schema.md §11. + properties: + name: + description: Name is the object name. + type: string + namespace: + description: Namespace is the object namespace. May be empty for + cluster-scoped objects. + type: string + required: + - name + type: object + infrastructureProvider: + allOf: + - enum: + - native + - capi + - screen + - enum: + - native + - capi + - screen + default: native + description: |- + InfrastructureProvider declares the infrastructure provider backing this cluster. + Defaults to native when absent. The only reserved future value is screen (INV-021). + type: string + kubeconfigSecretRef: + description: |- + KubeconfigSecretRef is the name of the Secret containing the kubeconfig for this cluster. + Required on mode=import. Not used when CAPI manages the cluster lifecycle. + type: string + kubernetesVersion: + description: |- + KubernetesVersion is the Kubernetes version for this cluster. When + spec.versionUpgrade=true, setting this field drives an UpgradeTypeKubernetes + UpgradePolicy. Setting both talosVersion and kubernetesVersion drives an + UpgradeTypeStack policy (sequential Talos then Kubernetes upgrade). 
+ type: string + lineage: + description: Lineage is the sealed causal chain record for this root + declaration. Immutable after creation. + properties: + creatingOperator: + description: |- + CreatingOperator identifies the Seam Operator that created this object. + This is a structured identity carrying the operator name and its deployed + version at creation time. + properties: + name: + description: |- + Name is the canonical name of the Seam Operator (e.g., platform, guardian, + wrapper, conductor). + type: string + version: + description: |- + Version is the deployed version of the operator at the time the object was + created (e.g., v1.26.5-r3). This allows audit tooling to correlate objects + with the operator version that produced them. + type: string + required: + - name + - version + type: object + creationRationale: + description: |- + CreationRationale is the reason this object was created, drawn from the + Seam Core controlled vocabulary defined in rationale.go. It is not a + free-text field. + enum: + - ClusterProvision + - ClusterDecommission + - SecurityEnforcement + - PackExecution + - VirtualizationFulfillment + - ConductorAssignment + - VortexBinding + type: string + rootGenerationAtCreation: + description: |- + RootGenerationAtCreation is the metadata.generation of the root declaration + at the time this object was created. Together with RootUID, it provides a + complete temporal anchor for the derivation record. + format: int64 + type: integer + rootKind: + description: |- + RootKind is the kind of the root declaration that caused this object to + exist (e.g., TalosCluster, PackExecution, RBACPolicy). + type: string + rootName: + description: RootName is the name of the root declaration. + type: string + rootNamespace: + description: RootNamespace is the namespace of the root declaration. + type: string + rootUID: + description: |- + RootUID is the UID of the root declaration at the time this object was + created. 
Used to verify that no root declaration replacement has occurred. + type: string + required: + - creatingOperator + - creationRationale + - rootGenerationAtCreation + - rootKind + - rootName + - rootNamespace + - rootUID + type: object + mode: + allOf: + - enum: + - bootstrap + - import + - enum: + - bootstrap + - import + description: Mode declares whether this cluster is bootstrapped from + scratch or imported. + type: string + nodeAddresses: + description: NodeAddresses is the list of node IPs for DSNSReconciler A-record population. + items: + type: string + type: array + pkiRotationThresholdDays: + default: 30 + description: |- + PkiRotationThresholdDays is the days before cert expiry at which a PKIRotation + CR is auto-created. Default 30. platform-schema.md §13. + format: int32 + minimum: 1 + type: integer + role: + allOf: + - enum: + - management + - tenant + - enum: + - management + - tenant + description: Role declares the cluster role in the Seam topology. + Mandatory on mode=import. + type: string + talosVersion: + description: TalosVersion is the Talos OS version for this cluster. INV-012. + type: string + talosconfigSecretRef: + description: TalosconfigSecretRef is the name of the Secret containing + the talosconfig for this cluster. + type: string + versionUpgrade: + description: |- + VersionUpgrade, when set to true, triggers a cluster-level rolling upgrade. + Upgrade type is derived from which version fields are set: + talosVersion only: UpgradeTypeTalos + kubernetesVersion only: UpgradeTypeKubernetes + both: UpgradeTypeStack (sequential Talos then k8s) + type: boolean + required: + - mode + type: object + x-kubernetes-validations: + - message: role is required when mode is import + rule: self.mode != 'import' || (has(self.role) && self.role != '') + status: + description: TalosClusterStatus is the observed state of a TalosCluster. + properties: + capiClusterRef: + description: |- + CAPIClusterRef is a reference to the owned CAPI Cluster object. 
+ Only set for CAPI-managed clusters (capi.enabled=true). + properties: + name: + description: Name is the object name. + type: string + namespace: + description: Namespace is the object namespace. May be empty for + cluster-scoped objects. + type: string + required: + - name + type: object + conditions: + description: Conditions is the list of status conditions for this + TalosCluster. + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. 
+ enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + observedGeneration: + description: ObservedGeneration is the generation most recently reconciled. + format: int64 + type: integer + observedTalosVersion: + description: ObservedTalosVersion is the Talos version last confirmed running. + type: string + origin: + description: Origin records how this cluster came under Seam governance. + enum: + - bootstrapped + - imported + type: string + pkiExpiryDate: + description: |- + PkiExpiryDate is the earliest certificate expiry across the talosconfig and + kubeconfig Secrets. Set by the TalosCluster reconciler. platform-schema.md §13. + format: date-time + type: string + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/docs/platform-schema.md b/docs/platform-schema.md index 341a262..25945c3 100644 --- a/docs/platform-schema.md +++ b/docs/platform-schema.md @@ -1,794 +1,635 @@ # platform-schema -> API Group: platform.ontai.dev (operational CRDs: TalosControlPlane, TalosWorkerConfig, EtcdMaintenance, NodeMaintenance, PKIRotation, ClusterReset, HardeningProfile, UpgradePolicy, NodeOperation, ClusterMaintenance, PlatformTenant, QueueProfile, MaintenanceBundle) -> InfrastructureTalosCluster: infrastructure.ontai.dev/v1alpha1 -- schema owned by seam-core (Decision G). Platform reconciles it; does not define it. -> Operator: Platform -> CAPI Providers: cluster.x-k8s.io, bootstrap.cluster.x-k8s.io, infrastructure.cluster.x-k8s.io -> Amended: 2026-03-30 - CAPI adopted for target cluster lifecycle. Management cluster -> bootstrap unchanged. SeamInfrastructureMachine CRD introduced. 
Kueue scoped to -> Wrapper quota profile. Operational CRDs retained where CAPI has no equivalent. +> API Group: seam.ontai.dev/v1alpha1 (TalosCluster, ClusterLog -- cross-operator CRDs) +> API Group: platform.ontai.dev/v1alpha1 (all day-2 operational CRDs) +> API Group: infrastructure.cluster.x-k8s.io (CAPI types -- frozen, out of scope) +> Operator: platform +> Schema authority: this file (primary). ~/ontai/seam/docs/seam-schema.md (RunnerConfig). ~/ontai/conductor/docs/conductor-schema.md (capabilities). ~/ontai/guardian/docs/guardian-schema.md (RBACProfile gate). ~/ontai/dispatcher/docs/dispatcher-schema.md (PackInstalled gate for Cilium). --- ## 1. Domain Boundary -Platform owns the complete lifecycle of Talos clusters and all tenant -coordination. It does this by composing CAPI primitives for target cluster -lifecycle while preserving Seam's governing principles - declarative, versioned, -auditable, and security-first. - -Platform is the CAPI management plane operator. It creates and owns CAPI -objects (Cluster, TalosControlPlane, MachineDeployment, SeamInfrastructureMachine) -as children of the ONT TalosCluster CR. CAPI controllers reconcile those objects -to actual cluster state through the Seam Infrastructure Provider and CABPT. - -**What changes with CAPI adoption:** -- Target cluster lifecycle (bootstrap, upgrade, scale, health) is delegated to CAPI. -- The Seam Infrastructure Provider (part of Platform) delivers machineconfigs - to nodes on port 50000 - it is the Talos-specific infrastructure layer. -- Kueue Jobs are no longer used for cluster lifecycle operations. -- Kueue is retained as a prerequisite exclusively for Wrapper pack-deploy Jobs. -- CAPI provides the observability (Machine status, Cluster conditions, events) - that Kueue Jobs previously provided for cluster operations. - -**What does not change:** -- Management cluster bootstrap remains Seam-native. CAPI cannot bootstrap the - cluster it runs on. 
See Section 3 for the unchanged management cluster path. -- All Seam security plane rules. CAPI's RBAC goes through Guardian intake. -- Guardian deploys before CAPI. CAPI is installed in the enable phase after - Guardian is operational. -- TalosCluster is still the Seam root CR for every cluster. CAPI objects are - children of TalosCluster, not the other way around. -- Operational CRDs with no CAPI equivalent remain and use Conductor capabilities - invoked via direct controller reconciliation. -- Platform creates tenant namespaces. Sole namespace authority unchanged. +Platform owns the complete lifecycle of Talos clusters and all day-2 operational coordination. It does this by composing CAPI primitives for target cluster lifecycle while preserving Seam governing principles: declarative, versioned, auditable, and security-first. + +Platform is the CAPI management plane operator. It creates and owns CAPI objects (SeamInfrastructureCluster, cluster.x-k8s.io/Cluster, TalosControlPlane, MachineDeployment, TalosConfigTemplate, SeamInfrastructureMachineTemplate) as children of TalosCluster for target clusters. CAPI controllers reconcile those objects to actual cluster state through the Seam Infrastructure Provider and CABPT. + +What does not change from the pre-CAPI model: + +- Management cluster bootstrap remains Seam-native. CAPI cannot bootstrap the cluster it runs on. +- TalosCluster is still the Seam root CR for every cluster. CAPI objects are children of TalosCluster, not the other way around. +- Operational CRDs with no CAPI equivalent remain and use Conductor capabilities via direct controller reconciliation. +- Platform creates tenant namespaces. CP-INV-004 applies without exception. +- Guardian deploys before platform. Platform starts only after Guardian RBACProfile reaches provisioned=true (CP-INV-012). --- -## 2. CAPI Provider Architecture +## 2. 
Master GVK Reference -### 2.1 Providers Installed on Management Cluster +### seam.ontai.dev/v1alpha1 -**CAPI Core** (cluster.x-k8s.io) - Cluster, Machine, MachineDeployment, -MachineSet, MachineHealthCheck controllers. These are the battle-tested cluster -lifecycle primitives. Installed via OperatorManifest in the enable phase, after -Guardian. +These types are defined in platform/api/seam/v1alpha1/ and are schema-shared across the platform and seam modules. Platform reconciles them; seam is the canonical source of the type definitions for cross-operator consumption. -**CABPT** (bootstrap.cluster.x-k8s.io) - Cluster API Bootstrap Provider Talos. -Generates TalosConfig and renders machineconfigs per Machine. Patches TalosConfig -with cluster-specific CNI=none and kernel parameters needed for Cilium. CABPT is -the source of rendered machineconfig data that the Seam Infrastructure Provider -delivers to nodes. +| Kind | Short | Scope | Namespace | +|------|-------|-------|-----------| +| TalosCluster | tc | Namespaced | seam-system (management), seam-tenant-{cluster-name} (target) | +| ClusterLog | clog | Namespaced | seam-tenant-{cluster-name} | -**Seam Infrastructure Provider** - a purpose-built Platform component that -implements the CAPI InfrastructureCluster and InfrastructureMachine contracts. -It does not call any cloud API. It watches SeamInfrastructureCluster and -SeamInfrastructureMachine objects and delivers machineconfigs to pre-provisioned -Talos nodes on port 50000 using the talos goclient embedded in the provider binary. -This is the only place in Platform that uses the talos goclient after bootstrap. -The provider is a distroless Go binary - talos goclient + kube goclient only. +### platform.ontai.dev/v1alpha1 -### 2.2 CAPI Object Ownership +All day-2 operational CRDs are owned exclusively by platform. 
-Platform's TalosCluster reconciler creates and owns: -- SeamInfrastructureCluster (infra reference for the CAPI Cluster) -- cluster.x-k8s.io/Cluster (owns TalosControlPlane and MachineDeployments) -- TalosControlPlane (CACPPT - control plane management) -- MachineDeployment per node role (control plane, worker) -- TalosConfigTemplate (CABPT - machineconfig generation template with CNI patches) -- SeamInfrastructureMachineTemplate (template for SeamInfrastructureMachine per node) +| Kind | Short | Scope | Conductor capabilities | +|------|-------|-------|------------------------| +| EtcdMaintenance | em | Namespaced | etcd-backup, etcd-restore, etcd-defrag | +| TalosEtcdBackupSchedule | etcdbs | Namespaced | (schedule controller; creates EtcdMaintenance CRs) | +| NodeMaintenance | nm | Namespaced | node-patch, hardening-apply, credential-rotate | +| NodeOperation | nop | Namespaced | node-scale-up, node-decommission, node-reboot (non-CAPI path only) | +| PKIRotation | pkir | Namespaced | pki-rotate | +| ClusterReset | crst | Namespaced | cluster-reset | +| ClusterMaintenance | cmaint | Namespaced | (no Job; CAPI pause or Conductor gate) | +| UpgradePolicy | upgp | Namespaced | talos-upgrade, kube-upgrade, stack-upgrade (non-CAPI path only) | +| HardeningProfile | hp | Namespaced | (configuration CR; no Job submission) | +| MaintenanceBundle | mb | Namespaced | drain, upgrade, etcd-backup, machineconfig-rotation | +| TalosMachineConfigBackup | mcb | Namespaced | machineconfig-backup | +| TalosMachineConfigBackupSchedule | mcbs | Namespaced | (schedule controller; creates TalosMachineConfigBackup CRs) | +| TalosMachineConfigRestore | mcr | Namespaced | machineconfig-restore | -These are all created in the tenant-{cluster-name} namespace and owned by the -TalosCluster CR via ownerReference. Deleting TalosCluster cascades to all owned -CAPI objects through Kubernetes garbage collection, which triggers CAPI's own -deletion reconciliation. 
Seam finalizers on TalosCluster gate this to ensure -security plane cleanup happens before cascade. +### infrastructure.cluster.x-k8s.io (CAPI -- frozen) -### 2.3 Cilium CNI Integration +| Kind | Short | Purpose | +|------|-------|---------| +| SeamInfrastructureCluster | sic | Cluster-level CAPI InfrastructureCluster implementation | +| SeamInfrastructureMachine | sim | Per-node CAPI InfrastructureMachine implementation | -Every TalosConfigTemplate created by Platform includes: -- cluster.network.cni.name: none (disables default CNI, required for Cilium) -- BPF kernel parameters in machine config patches -- Cilium-required sysctl values +CAPI types are frozen. Platform implements the CAPI contracts for these types through the Seam Infrastructure Provider but does not modify their schemas. -After CAPI bootstraps the cluster (nodes reach Running state but are NotReady -because no CNI is present), Platform triggers a PackExecution for the Cilium -ClusterPack referenced by spec.capi.ciliumPackRef. This is the first pack deployed -to every cluster. Nodes transition to Ready only after Cilium is up. +--- -The CAPI MachineHealthCheck is configured with a tolerance window for the CNI -installation period - nodes are not remediated during this window. +## 3. TalosCluster (seam.ontai.dev/v1alpha1) -The Cilium ClusterPack is compiled per-cluster on the workstation with values -specific to the cluster endpoint, IPAM mode, L2 announcement configuration, MTU, -and routing mode. It is not a generic pack - it carries the cluster endpoint -address at compile time. +Scope: Namespaced -- seam-system (management cluster) or seam-tenant-{cluster-name} (target clusters) +Short name: tc +Print columns: Mode, Role, Ready, Age + +The Seam root CR for every cluster. For target clusters, TalosCluster owns all CAPI objects as children via ownerReference (CP-INV-008). For the management cluster, TalosCluster has no CAPI children. 
+ +Deletion of a TalosCluster CR never triggers physical cluster destruction (INV-015). ClusterReset is the only destruction path. + +### spec fields + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| mode | string (bootstrap, import) | yes | bootstrap: cluster is formed from scratch. import: existing cluster brought under Seam governance. | +| role | string (management, tenant) | required when mode=import | Declares cluster role in Seam topology. | +| talosVersion | string | no | Talos OS version for this cluster. Must match RunnerConfig.agentImage tag (INV-012). | +| kubernetesVersion | string | no | Kubernetes version for this cluster. When versionUpgrade=true, drives an UpgradeTypeKubernetes policy. | +| versionUpgrade | bool | no | When true, triggers a cluster-level rolling upgrade. Upgrade type derived from which version fields are set: talosVersion only = UpgradeTypeTalos; kubernetesVersion only = UpgradeTypeKubernetes; both = UpgradeTypeStack. | +| clusterEndpoint | string | no | Cluster VIP or primary API endpoint IP. | +| nodeAddresses | []string | no | Node IPs for DNS A-record population. | +| capi | CAPIConfig | no | CAPI integration settings. When absent, direct bootstrap path is used. | +| infrastructureProvider | string (native, capi, screen) | no | Default: native. screen is reserved (INV-021). | +| kubeconfigSecretRef | string | no | Name of the Secret containing the kubeconfig. Required on mode=import. Not used when CAPI manages lifecycle. | +| talosconfigSecretRef | string | no | Name of the Secret containing the talosconfig. | +| lineage | SealedCausalChain | no | Sealed causal chain record. Immutable after creation (Decision 1). | +| pkiRotationThresholdDays | int32 | no | Days before cert expiry at which a PKIRotation CR is auto-created. Default 30, minimum 1. | +| hardeningProfileRef | LocalObjectRef | no | HardeningProfile CR to apply at bootstrap. 
| + +### spec.capi fields (CAPIConfig) + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| enabled | bool | yes (within capi block) | True for all target clusters. False for management cluster. | +| talosVersion | string | no | Talos version for TalosConfigTemplate generation. | +| kubernetesVersion | string | no | Kubernetes version for TalosControlPlane. | +| controlPlane | CAPIControlPlaneConfig | no | Control plane configuration. Required when enabled=true. | +| controlPlane.replicas | int32 | no | Desired number of control plane nodes. | +| workers | []CAPIWorkerPool | no | Worker node pools. | +| workers[].name | string | yes | Pool identifier. Used as MachineDeployment name suffix. | +| workers[].replicas | int32 | no | Desired number of worker nodes in this pool. | +| workers[].seamInfrastructureMachineNames | []string | no | SeamInfrastructureMachine CR names pre-provisioned for this pool. | +| ciliumPackRef | CAPICiliumPackRef | no | PackDelivery name and version for the Cilium pack. Platform triggers a PackExecution for this pack when the CAPI Cluster reaches Running state. | + +### status fields + +| Field | Type | Description | +|-------|------|-------------| +| observedGeneration | int64 | Generation most recently reconciled. | +| origin | string (bootstrapped, imported) | How this cluster came under Seam governance. | +| observedTalosVersion | string | Talos version last confirmed running. | +| capiClusterRef | LocalObjectRef | Reference to the owned CAPI Cluster object. Only set for capi.enabled=true. | +| conditions | []metav1.Condition | Status conditions. | +| pkiExpiryDate | *metav1.Time | Earliest certificate expiry across talosconfig and kubeconfig Secrets. | + +### Status condition types + +| Condition | Meaning | +|-----------|---------| +| Ready | Cluster is fully operational. | +| Bootstrapping | Bootstrap Job submitted and running. | +| Bootstrapped | Bootstrap sequence complete. 
| +| Importing | Import sequence in progress. | +| Degraded | Cluster has entered a degraded state. | +| CiliumPending | CAPI cluster Running but Cilium PackInstalled not yet Ready. Not a degraded state (CP-INV-013). | +| ControlPlaneUnreachable | Control plane API is not responding. | +| PartialWorkerAvailability | One or more worker nodes are not Ready. | +| ConductorReady | Conductor agent Deployment is running on the tenant cluster. | +| VersionUpgradePending | versionUpgrade=true and upgrade is queued. | +| VersionRegressionBlocked | A version downgrade was attempted and blocked. | +| HardeningApplied | HardeningProfile has been applied at bootstrap. | --- -## 3. Management Cluster Bootstrap - Unchanged +## 4. ClusterLog (seam.ontai.dev/v1alpha1) + +Scope: Namespaced -- seam-tenant-{clusterRef} +Short name: clog +Print columns: Cluster, TalosVersion, Revision, Ops, Age + +Accumulates the day-2 operation history for one cluster, scoped to the current talosVersion revision. One CR per cluster. Created by platform when the cluster tenant namespace is provisioned. Named by the cluster name. + +When the cluster talosVersion is upgraded, the current revision is archived to the GraphQuery DB and a new revision begins: Revision increments, TalosVersion is updated, and Operations is cleared. + +Operations are appended by the Conductor execute-mode Job. The platform reconciler uses the JobRef field to correlate each record with the Job it submitted.
-Human runs Compiler compile mode → generates machineconfigs, SOPS-encrypts -secrets → secrets committed to git → TalosCluster CR (mode: bootstrap) committed -to git → GitOps applies to a temporary Kubernetes context (or direct kubectl) → -Platform generates a bootstrap Job using compiler directly → conductor pushes -machineconfigs to seed nodes on port 50000 → etcd initializes → Kubernetes API -comes up → enable phase installs Guardian first, then CAPI providers and -remaining prerequisites, then other operators. +### spec fields -After the management cluster exists, CAPI is installed and manages only target -clusters. The management cluster's own TalosCluster CR in seam-system has -mode: bootstrap and no CAPI children - management cluster lifecycle is not -CAPI-managed. +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| clusterRef | string | yes | Name of the TalosCluster this log accumulates. | +| talosVersion | string | yes | Talos version for the current active revision. Matches TalosCluster.spec.talosVersion at revision start. | +| revision | int64 | yes | Monotonic revision counter. Starts at 1. Increments on each talosVersion upgrade. | +| operations | map[string]OperationRecord | no | Day-2 operation records for the current revision, keyed by Kubernetes Job name. | +| operationCount | int64 | no | Count of records in operations. Maintained alongside operations for kubectl display. | + +### OperationRecord fields + +| Field | Type | Description | +|-------|------|-------------| +| capability | string | Conductor capability that produced this record. | +| jobRef | string | Kubernetes Job name that produced this record. | +| status | string (Succeeded, Failed) | Terminal status of the capability execution. | +| message | string | Human-readable summary of the outcome. | +| startedAt | *metav1.Time | Time the capability execution began. | +| completedAt | *metav1.Time | Time the capability execution finished. 
| +| failureReason | *OperationFailureReason | Populated when status is Failed. | + +### OperationFailureReason fields + +| Field | Values | Description | +|-------|--------|-------------| +| category | ValidationFailure, CapabilityUnavailable, ExecutionFailure, ExternalDependencyFailure, InvariantViolation | Failure domain classification. | +| reason | string | Human-readable failure description. | --- -## 4. Seam Infrastructure CRDs +## 5. Operational CRD Catalog (platform.ontai.dev/v1alpha1) -### SeamInfrastructureMachine +All operational CRDs live in seam-tenant-{cluster-name} namespaces. All Conductor capabilities referenced here must be verified against conductor-schema.md before any implementation work begins. -Scope: Namespaced - tenant-{cluster-name} -Short name: sim -API group: infrastructure.cluster.x-k8s.io (CAPI infrastructure contract) +### EtcdMaintenance (shortName: em) -Wraps a pre-provisioned node IP address and its connection parameters. This is the -Seam-native implementation of the CAPI InfrastructureMachine contract. One -SeamInfrastructureMachine per node in the cluster. +Covers all etcd lifecycle operations. CAPI has no etcd concept. Always submits a direct Conductor executor Job regardless of the owning TalosCluster's capi.enabled. -The human (or GitOps) declares the available node IPs as SeamInfrastructureMachine -objects in the tenant namespace before the cluster is bootstrapped. The Seam -Infrastructure Provider watches for CAPI Machine objects that reference these and -delivers the CABPT-rendered machineconfig to the declared IP on port 50000. +Named Conductor capabilities: etcd-backup, etcd-restore, etcd-defrag. Key spec fields: -- address: the pre-provisioned node IP address reachable on port 50000. -- port: Talos maintenance API port. Default 50000. -- talosConfigSecretRef: reference to the talosconfig secret in ont-system that - the provider uses to authenticate the ApplyConfiguration call. -- nodeRole: controlplane or worker. 
Must match the MachineDeployment role. - -Status fields (set by the Seam Infrastructure Provider): -- ready: bool. Set to true after machineconfig is applied and the node transitions - out of maintenance mode. -- machineConfigApplied: bool. -- providerID: the provider ID string written back to the CAPI Machine object. - Format: talos://{cluster-name}/{node-ip} - -CAPI contract compliance: SeamInfrastructureMachine implements the InfrastructureMachine -contract by setting status.ready=true when the machine is provisioned, and writing -spec.providerID back to the owning Machine object. + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| clusterRef | LocalObjectRef | yes | TalosCluster this operation targets. | +| operation | string (backup, restore, defrag) | yes | Etcd lifecycle operation to perform. | +| etcdBackupS3SecretRef | *corev1.SecretReference | no | S3 credentials Secret. Takes precedence over cluster-wide seam-etcd-backup-config. Required for backup when no cluster default is configured. See S3 resolution hierarchy in section 8. | +| s3Destination | *S3Ref | no | S3 location to write the snapshot to. Required when operation=backup. | +| s3SnapshotPath | *S3Ref | no | S3 location of snapshot to restore from. Required when operation=restore. | +| targetNodes | []string | no | Nodes to target for restore. All etcd members when empty. | +| pvcFallbackEnabled | bool | no | Instructs reconciler to proceed with PVC-backed backup when no S3 destination is configured (degraded mode). See section 8. | +| schedule | string | no | Cron expression for recurring backup operations. | + +Status condition types: Ready, Running, Degraded. --- -### SeamInfrastructureCluster +### TalosEtcdBackupSchedule (shortName: etcdbs) -Scope: Namespaced - tenant-{cluster-name} -Short name: sic -API group: infrastructure.cluster.x-k8s.io +Schedule controller. Creates EtcdMaintenance CRs with operation=backup on a repeating interval. 
The schedule field accepts Go duration strings (e.g. "24h", "6h"). -The cluster-level CAPI infrastructure reference. Holds the cluster endpoint and -any cluster-wide infrastructure parameters. One per cluster. Owned by the CAPI -Cluster object. +No Conductor Job submitted directly. All actual work is delegated to the EtcdMaintenance CRs this controller creates. -Key spec fields: -- controlPlaneEndpoint.host: the VIP or first control plane IP. Written into - the CAPI Cluster object and into all generated machineconfigs via CABPT. -- controlPlaneEndpoint.port: Kubernetes API port. Default 6443. +Key spec fields: clusterRef, schedule, s3Destination, etcdBackupS3SecretRef. -Status fields: -- ready: bool. Set to true after all control plane SeamInfrastructureMachine - objects have status.ready=true. +Status fields: nextRunAt, lastRunAt, lastBackupName. --- -### TalosControlPlane +### NodeMaintenance (shortName: nm) -Scope: Namespaced - tenant-{cluster-name} -Short name: tcpl -API group: platform.ontai.dev +Targeted node-level operations that CAPI has no equivalent for. Applies to both management and target clusters via direct Conductor executor Job regardless of capi.enabled. -Dual-mode CRD. At compile time it serves as a command contract: Compiler reads -TalosControlPlane spec to generate management cluster bootstrap configuration -before any live cluster exists. At cluster runtime it is a live CR reconciled by -Platform. Carries the admin's complete control plane configuration intent. +Named Conductor capabilities: node-patch, hardening-apply, credential-rotate. -Must never be merged with TalosWorkerConfig - they evolve independently and a -combined CRD would risk CRD size limits. +Key spec fields: -Key spec fields: replicas, talosVersion, kubernetesVersion, machineConfigPatches, -hardeningProfileRef, endpointVIP, installerImage. 
+| Field | Type | Required | Description | +|-------|------|----------|-------------| +| clusterRef | LocalObjectRef | yes | TalosCluster this operation targets. | +| operation | string (patch, hardening-apply, credential-rotate) | yes | Node-level operation to perform. | +| targetNodes | []string | no | Node names or IPs to target. All nodes when empty. | +| patchSecretRef | *SecretRef | no | Secret containing the machine config patch YAML. Required when operation=patch. | +| hardeningProfileRef | *LocalObjectRef | no | HardeningProfile CR to apply. Required when operation=hardening-apply. | +| rotateServiceAccountKeys | bool | no | Rotate service account signing keys. Applies when operation=credential-rotate. | +| rotateOIDCCredentials | bool | no | Rotate OIDC credentials. Applies when operation=credential-rotate. | -TalosCluster references TalosControlPlane by name in its spec. +Status condition types: Ready, Degraded. --- -### TalosWorkerConfig +### NodeOperation (shortName: nop) + +Node lifecycle operations. Dual-path CRD governed by capi.enabled on the owning TalosCluster. + +CAPI path (capi.enabled=true): modifies MachineDeployment replicas for scale-up, deletes specific Machine objects for decommission, or sets the Machine reboot annotation. All handled natively by CAPI. No Conductor Job submitted. + +Non-CAPI path (capi.enabled=false): submits node-scale-up, node-decommission, or node-reboot Conductor executor Job. -Scope: Namespaced - tenant-{cluster-name} -Short name: twc -API group: platform.ontai.dev +Named Conductor capabilities (non-CAPI path only): node-scale-up, node-decommission, node-reboot. -Dual-mode CRD. Same dual-mode pattern as TalosControlPlane - compile-time command -contract and live cluster CR. Carries worker node machine configuration intent per -pool. Must never be merged with TalosControlPlane. 
+Key spec fields: -Key spec fields: pools (each with name, replicas, machineConfigPatches, nodeLabels, -nodeTaints), talosVersion, installerImage, hardeningProfileRef. +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| clusterRef | LocalObjectRef | yes | TalosCluster this operation targets. | +| operation | string (scale-up, decommission, reboot) | yes | Node lifecycle operation to perform. | +| targetNodes | []string | no | Node names for decommission or reboot. Required when operation=decommission or reboot. | +| replicaCount | int32 | no | Desired worker replicas after scale-up. Required when operation=scale-up. | -TalosCluster references TalosWorkerConfig by name in its spec. +Status condition types: Ready, Degraded, CAPIDelegated. --- -**Dual-mode pattern:** Both TalosControlPlane and TalosWorkerConfig operate in two -modes. In compile mode, Compiler reads them as command contracts to generate bootstrap -artifacts before any cluster exists. In runtime mode, Platform reconciles them as live -CRs on running clusters, creating TalosConfigTemplate and CABPT objects from their -specs. +### PKIRotation (shortName: pkir) + +Cluster PKI certificate rotation. Always submits a direct Conductor executor Job via the pki-rotate named capability regardless of capi.enabled. CAPI has no PKI rotation equivalent. + +Named Conductor capability: pki-rotate. + +Key spec fields: clusterRef. + +Status fields: jobName, operationResult. +Status condition types: Ready, Degraded. + +PKI rotation automation: TalosCluster reconciler monitors pkiExpiryDate and auto-creates a PKIRotation CR when expiry is within pkiRotationThresholdDays days. On-demand rotation is triggered by applying the `platform.ontai.dev/rotate-pki=true` annotation to the TalosCluster CR. --- -## 5. CRDs - Platform-Owned +### ClusterReset (shortName: crst) -These CRDs are owned by Platform. 
They are not delegated to CAPI because CAPI has -no equivalent concept, or because they represent dual-path operations where the -management cluster path requires a direct conductor Job while CAPI handles the target -cluster path natively. +Destructive factory reset. HUMAN GATE REQUIRED: the `ontai.dev/reset-approved=true` annotation must be present before any reconciliation proceeds (CP-INV-006, INV-007). The reconciler holds at PendingApproval and emits an event if the annotation is absent. -### InfrastructureTalosCluster +CAPI path (capi.enabled=true): deletes CAPI Cluster object first, waits for all Machine objects to reach Deleted phase through the Seam Infrastructure Provider, then submits the cluster-reset Conductor Job. -Kind: InfrastructureTalosCluster. API group: infrastructure.ontai.dev/v1alpha1. Schema owned by seam-core (Decision G). Supersedes platform.ontai.dev/TalosCluster (Phase 2B, 2026-04-25). -Platform reconciles this type but does not own its CRD definition. Condition constants are imported from seam-core/pkg/conditions, not defined locally in platform. -Scope: Namespaced - seam-system (management), seam-tenant-{cluster-name} (target) -Short name: tc -Lives in: git and management cluster. - -The Seam root CR for every cluster. For target clusters, InfrastructureTalosCluster owns all -CAPI objects as children. For the management cluster, InfrastructureTalosCluster has no CAPI -children - it is the bootstrap record and operational anchor. - -spec.mode (v1alpha1 only): bootstrap or import. As before. - -Fields introduced with CAPI adoption: -- capi.enabled: bool. True for all target clusters. False for management cluster. - When true, the TalosCluster reconciler creates CAPI objects. When false, it - follows the direct bootstrap path. -- capi.talosVersion: Talos version to pass to TalosConfigTemplate and CABPT. -- capi.kubernetesVersion: Kubernetes version for TalosControlPlane. -- capi.controlPlane.replicas: number of control plane nodes. 
-- capi.workers: list of worker pools, each with a name, replica count, and - list of SeamInfrastructureMachine names pre-provisioned for that pool. -- capi.ciliumPackRef: the ClusterPack name and version for Cilium. Platform - triggers a PackExecution for this pack when the cluster reaches CAPI Running - state, before marking the cluster Ready. - -status.origin: bootstrapped or imported. Unchanged. -status.capiClusterRef: reference to the owned CAPI Cluster object. -Status conditions: Ready, Bootstrapping, Importing, Degraded, CiliumPending. - -CiliumPending is set when the cluster reaches CAPI Running state but the Cilium -ClusterPack has not yet reached PackInstance.Ready. Nodes are NotReady during -this window. This is expected and not a degraded state. +Non-CAPI path (capi.enabled=false): submits cluster-reset Conductor Job directly. + +Named Conductor capability: cluster-reset. + +Key spec fields: + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| clusterRef | LocalObjectRef | yes | TalosCluster to reset. | +| drainGracePeriodSeconds | int32 | no | Seconds to wait for node drain before forcing the reset. Default 300. | +| wipeDisks | bool | no | Whether to call the Talos reset API with wipeDisks=true. Default false. | + +Status condition types: PendingApproval, Ready, Degraded. --- -### EtcdMaintenance +### ClusterMaintenance (shortName: cmaint) + +Maintenance window gate. Dual-path CRD governed by capi.enabled on the owning TalosCluster. -Scope: Namespaced - tenant-{cluster-name} -Short name: em -Named conductor capabilities: etcd-backup, etcd-restore, etcd-defrag +CAPI path (capi.enabled=true): sets `cluster.x-k8s.io/paused=true` on the CAPI Cluster when no active window exists and blockOutsideWindows=true. Pause halts all CAPI reconciliation until the window opens and the annotation is lifted. -Absorbs TalosBackup, TalosEtcdMaintenance, TalosRecovery. 
Covers all etcd -lifecycle operations for both management and target clusters. CAPI has no etcd -concept. Always a direct conductor (mode: execute) Job regardless of spec.capi.enabled on TalosCluster. +Non-CAPI path (capi.enabled=false): closes the Conductor Job admission gate for the cluster during restricted periods. + +No Conductor Job is submitted by this CRD. + +Key spec fields: -Key spec fields: clusterRef, operation (backup, restore, defrag), s3Destination (for -backup), s3SnapshotPath (for restore), targetNodes (for restore), schedule (for -recurring backup). +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| clusterRef | LocalObjectRef | yes | TalosCluster this maintenance gate controls. | +| windows | []MaintenanceWindow | no | Maintenance windows during which operations are permitted. | +| windows[].name | string | no | Optional label for this window. | +| windows[].start | string | yes | Window start time in cron format (e.g. "0 2 * * 6" for 02:00 every Saturday UTC). | +| windows[].durationMinutes | int32 | yes | Length of the maintenance window in minutes. | +| windows[].timezone | string | no | IANA timezone for interpreting the cron schedule. Default UTC. | +| blockOutsideWindows | bool | no | Block operations when no active window exists. Default false. | + +Status fields: activeWindowName. +Status condition types: Paused, WindowActive. --- -### NodeMaintenance +### UpgradePolicy (shortName: upgp) + +Governs Talos OS, Kubernetes, or combined stack upgrades. Dual-path CRD governed by capi.enabled on the owning TalosCluster. + +CAPI path (capi.enabled=true): updates TalosControlPlane version and MachineDeployment rolling upgrade settings natively through CAPI machinery. No Conductor Job submitted. + +Non-CAPI path (capi.enabled=false): submits talos-upgrade, kube-upgrade, or stack-upgrade Conductor executor Job. + +Named Conductor capabilities (non-CAPI path only): talos-upgrade, kube-upgrade, stack-upgrade.
-Scope: Namespaced - tenant-{cluster-name} -Short name: nm -Named conductor capabilities: node-patch, hardening-apply, credential-rotate +Key spec fields: -Absorbs TalosNodePatch, TalosHardeningApply, TalosCredentialRotation. Covers -targeted node-level operations CAPI has no equivalent for. Applies to both -management and target clusters via direct conductor(mode: execute) Job regardless of spec.capi.enabled. +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| clusterRef | LocalObjectRef | yes | TalosCluster this upgrade targets. | +| upgradeType | string (talos, kubernetes, stack) | yes | Type of upgrade to perform. | +| targetTalosVersion | string | no | Target Talos version. Required when upgradeType=talos or stack. | +| targetKubernetesVersion | string | no | Target Kubernetes version. Required when upgradeType=kubernetes or stack. | +| rollingStrategy | string (sequential, parallel) | no | Order in which nodes are upgraded. Default sequential. | +| healthGateConditions | []string | no | Kubernetes condition types that must be True on each node before upgrade proceeds to the next node. | -Key spec fields: clusterRef, operation (patch, hardening-apply, credential-rotate), -targetNodes, patchSecretRef (for patch), hardeningProfileRef (for hardening-apply), -rotateServiceAccountKeys and rotateOIDCCredentials (for credential-rotate). +Status condition types: Ready, Degraded, CAPIDelegated. --- -### PKIRotation +### HardeningProfile (shortName: hp) -Scope: Namespaced - tenant-{cluster-name} -Short name: pkir -Named Conductor capability: pki-rotate +Reusable hardening ruleset. Configuration CR only. Does not directly trigger a Conductor Job. Jobs are submitted by NodeMaintenance (operation=hardening-apply) when it references this profile. Referenced by TalosCluster.spec.hardeningProfileRef for bootstrap-time hardening application. -Absorbs TalosPKIRotation. Single-purpose. 
Applies to both management and target -clusters via direct conductor(mode: execute) Job. CAPI has no PKI rotation equivalent. +Key spec fields: -Key spec fields: clusterRef. +| Field | Type | Description | +|-------|------|-------------| +| machineConfigPatches | []string | JSON Patch operations applied to the rendered machineconfig. | +| sysctlParams | map[string]string | Sysctl key/value pairs merged into the machineconfig sysctl section. | +| description | string | Human-readable description. | + +Status condition types: Valid. --- -### ClusterReset +### MaintenanceBundle (shortName: mb) + +Pre-compiled scheduling artifact produced by `compiler maintenance`. Carries pre-resolved scheduling context so neither Platform nor Conductor needs to perform cluster queries at execution time. -Scope: Namespaced - tenant-{cluster-name} -Short name: crst -Named conductor capability: cluster-reset +The reconciler is a stub (F-P5 milestone). The type definition is delivered; reconciler implementation is deferred. -Absorbs TalosClusterReset. Destructive factory reset. Human gate required: -ontai.dev/reset-approved=true annotation must be present before any reconciliation -proceeds. +Named Conductor capabilities: drain, upgrade, etcd-backup, machineconfig-rotation. -For CAPI-managed clusters (spec.capi.enabled=true): deletes CAPI Cluster object -first, waits for all Machine objects to reach Deleted phase through the Seam -Infrastructure Provider, then submits cluster-reset conductor(mode: execute) Job. +Key spec fields: -For management cluster (spec.capi.enabled=false): submits cluster-reset Conductor(mode: execute) Job directly. +| Field | Type | Description | +|-------|------|-------------| +| clusterRef | LocalObjectRef | TalosCluster this bundle targets. | +| operation | string (drain, upgrade, etcd-backup, machineconfig-rotation) | Maintenance operation type.
| +| maintenanceTargetNodes | []string | Pre-resolved list of target nodes, validated against the live cluster at compile time. | +| operatorLeaderNode | string | Node hosting the platform operator leader pod at compile time. | +| s3ConfigSecretRef | *corev1.SecretReference | Pre-resolved S3 configuration Secret. Never absent when the operation requires it. | -Key spec fields: clusterRef, drainGracePeriodSeconds, wipeDisks. +Status condition types: Ready, Pending, Degraded. --- -### HardeningProfile +### TalosMachineConfigBackup (shortName: mcb) -Scope: Namespaced -Short name: hp +Triggers a machine config backup for all nodes of a cluster. The Conductor executor reads each node's running config via GetMachineConfig and uploads it to S3 at `{cluster}/machineconfigs/{TIMESTAMP}/{hostname}.yaml`. -Absorbs TalosHardeningProfile. Configuration CR only - not an operational Job -trigger. Reusable hardening ruleset referenced by NodeMaintenance at runtime and -by TalosControlPlane and TalosWorkerConfig at compile time. +Named Conductor capability: machineconfig-backup. -Key spec fields: machineConfigPatches, sysctlParams, description. +Key spec fields: clusterRef, s3BackupSecretRef, s3Destination. ---- +Status condition types: Ready, Running, Degraded, S3DestinationAbsent. -### UpgradePolicy +--- -Scope: Namespaced - tenant-{cluster-name} -Short name: upgp +### TalosMachineConfigBackupSchedule (shortName: mcbs) -Absorbs TalosUpgrade, TalosKubeUpgrade, TalosStackUpgrade. Dual-path CRD governed -by spec.capi.enabled on the owning TalosCluster. +Schedule controller. Creates TalosMachineConfigBackup CRs on a repeating interval. The schedule field accepts Go duration strings (e.g. "24h"). -For CAPI-managed clusters (spec.capi.enabled=true): updates TalosControlPlane -version field and MachineDeployment rolling upgrade settings natively through CAPI -machinery - no conductor(mode: execute) Job submitted. +No Conductor Job submitted directly. 
All actual work is delegated to the TalosMachineConfigBackup CRs this controller creates. -For management cluster (spec.capi.enabled=false): submits talos-upgrade, kube-upgrade, -or stack-upgrade conductor(mode: execute) Job via OperationalJobReconciler routing. +Key spec fields: clusterRef, schedule, s3Destination, s3BackupSecretRef. -Key spec fields: clusterRef, upgradeType (talos, kubernetes, stack), -targetTalosVersion, targetKubernetesVersion, rollingStrategy, healthGateConditions. +Status fields: nextRunAt, lastRunAt, lastBackupName. --- -### NodeOperation +### TalosMachineConfigRestore (shortName: mcr) -Scope: Namespaced - tenant-{cluster-name} -Short name: nop +Triggers a machine config restore for target nodes of a cluster. The Conductor executor downloads each node's config from S3 at `{cluster}/machineconfigs/{backupTimestamp}/{hostname}.yaml` and applies it via ApplyConfiguration. -Absorbs TalosNodeScaleUp, TalosNodeDecommission, TalosReboot. Dual-path CRD -governed by spec.capi.enabled on the owning TalosCluster. +Named Conductor capability: machineconfig-restore. -For CAPI-managed clusters (spec.capi.enabled=true): modifies MachineDeployment -replicas for scale-up, deletes specific Machine objects for decommission, or sets -Machine reboot annotation - all handled natively by CAPI. +Key spec fields: -For management cluster (spec.capi.enabled=false): submits node-scale-up, -node-decommission, or node-reboot conductor(mode: execute) Job. +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| clusterRef | LocalObjectRef | yes | TalosCluster whose nodes will be restored. | +| backupTimestamp | string | yes | Timestamp of the backup to restore from. Format: 20060102T150405Z (UTC). Must match the timestamp component in the S3 path written by a prior machineconfig-backup. | +| targetNodes | []string | no | Node hostnames to restore. All nodes when empty. 
| +| s3SourceBucket | string | yes | S3 bucket containing the backup objects. | +| s3BackupSecretRef | *corev1.SecretReference | no | S3 credentials Secret. Falls back to seam-etcd-backup-config in seam-system. | -Key spec fields: clusterRef, operation (scale-up, decommission, reboot), -targetNodes, replicaCount (for scale-up). +Status fields: phase (Pending, Running, Succeeded, Failed, PartiallyFailed), restoredNodes. +Status condition types: Ready, Running, Degraded, S3SourceAbsent. --- -### ClusterMaintenance +## 6. CAPI Integration Model -Scope: Namespaced - tenant-{cluster-name} -Short name: cmaint +### Seam Infrastructure Provider -Absorbs TalosNoMaintenance. Maintenance window gate. +The Seam Infrastructure Provider is a purpose-built platform component that implements the CAPI InfrastructureCluster and InfrastructureMachine contracts. It does not call any cloud API. It watches SeamInfrastructureCluster and SeamInfrastructureMachine objects and delivers machineconfigs to pre-provisioned Talos nodes on port 50000 using the talos goclient. -For CAPI-managed clusters (spec.capi.enabled=true): sets cluster.x-k8s.io/paused=true -on the CAPI Cluster object when no active window exists and blockOutsideWindows=true. -Pause halts all CAPI reconciliation until the window opens and the annotation is lifted. +The talos goclient is restricted to SeamInfrastructureClusterReconciler and SeamInfrastructureMachineReconciler only (CP-INV-001). All other reconcilers observe cluster state through CAPI Machine status conditions and Kubernetes node labels only (CP-INV-002). -For management cluster (spec.capi.enabled=false): blocks conductor(mode: execute) Job admission gate for -the cluster during restricted periods. +### CAPI object ownership -Key spec fields: clusterRef, windows, blockOutsideWindows. 
+Platform's TalosCluster reconciler creates and owns: ---- +- SeamInfrastructureCluster (infrastructure reference for the CAPI Cluster) +- cluster.x-k8s.io/Cluster (owns TalosControlPlane and MachineDeployments) +- TalosControlPlane (CABPT control plane management) +- MachineDeployment per node role +- TalosConfigTemplate (CABPT machineconfig generation template) +- SeamInfrastructureMachineTemplate (template for SeamInfrastructureMachine per node) + +All created in seam-tenant-{cluster-name} and owned by TalosCluster via ownerReference (CP-INV-008). + +### Cilium CNI integration + +Every TalosConfigTemplate created by platform includes `cluster.network.cni.name: none` and Cilium BPF kernel parameters (CP-INV-009). After CAPI bootstraps the cluster, platform triggers a PackExecution for the Cilium PackDelivery referenced by spec.capi.ciliumPackRef. Nodes transition to Ready only after Cilium is up. + +CiliumPending on TalosCluster is not a degraded state (CP-INV-013). It is the expected state between CAPI cluster Running and Cilium PackInstance Ready. -## 6. Tenant Coordination CRDs +### SeamInfrastructureCluster fields -### PlatformTenant, QueueProfile +Cluster-level CAPI infrastructure reference. One per cluster. -PlatformTenant and QueueProfile semantics, namespace placement, and gate conditions -are unchanged. ClusterAssignment has been removed -- it was a pre-seam binding record -with no role in the current seam operator family. Cilium bootstrap is now triggered -directly by Platform via spec.capi.ciliumPackRef when the CAPI Cluster reaches -Running state. +Key spec fields: controlPlaneEndpoint.host (VIP or first control plane IP), controlPlaneEndpoint.port (default 6443). -QueueProfile is scoped to Wrapper's quota profile only. The ClusterQueue and -ResourceFlavor resources provisioned by Guardian from QueueProfile govern -pack-deploy Job admission - cluster lifecycle operations no longer go through Kueue. 
+Status: ready=true after all control plane SeamInfrastructureMachine objects have status.ready=true. -**LicenseKey has been removed.** Seam has no licensing tier, no JWT enforcement, -and no cluster count limits. +### SeamInfrastructureMachine fields + +Per-node CAPI infrastructure reference. One per node. + +Key spec fields: address (pre-provisioned node IP reachable on port 50000), port (default 50000), talosConfigSecretRef, nodeRole (controlplane or worker). + +Status fields: ready (true after machineconfig applied and node exits maintenance mode), machineConfigApplied, providerID (format: talos://{cluster-name}/{node-ip}). --- -## 7. Kueue Scope +## 7. Tenant Namespace Model + +Platform is the sole namespace creation authority for seam-tenant-{cluster-name} namespaces (CP-INV-004). No other operator or component creates these namespaces. + +Namespace provisioning by mode: -Kueue remains a management cluster prerequisite exclusively because Wrapper's -pack-deploy Jobs require it. The ClusterQueue and ResourceFlavor resources -provisioned by Guardian from QueueProfile govern pack-deploy Job admission. +- mode=bootstrap and capi.enabled=true: Platform creates the namespace in the reconcile path. No bootstrap bundle assist needed. +- mode=import: Platform creates the namespace as part of the two-site onboarding sequence. The namespace creation is idempotent. The Compiler bootstrap bundle for import clusters includes a seam-tenant-namespace.yaml manifest so the admin can apply Secrets and the TalosCluster CR in a single kubectl apply run. Platform's ensureTenantNamespace call in the import reconcile path is an idempotent safety net. -Cluster lifecycle operations (bootstrap, upgrade, scale, decommission) do not use -Kueue. They are reconciled by CAPI controllers directly. The observability -previously provided by Kueue Jobs is now provided by CAPI Cluster and Machine -status conditions and events. 
+When the ClusterLog CR is created, it is placed in seam-tenant-{cluster-name} and its name is the cluster name. -Operational Jobs (etcd-backup, etcd-maintenance, pki-rotate, etcd-restore, -hardening-apply, node-patch, credential-rotate, cluster-reset) submit directly to -the default JobQueue without Kueue admission control. They are targeted, infrequent, -and operator-gated operations that do not require Kueue's quota and scheduling machinery. +MachineConfig Secrets for native and imported clusters follow the naming convention `seam-mc-{cluster-name}-{node-name}` in seam-tenant-{cluster-name}. Platform is the sole owner of these Secrets. No other operator or Conductor capability handler may modify them. --- -## 8. CAPI RBAC and Guardian +## 8. Etcd Backup S3 Resolution -CAPI installs substantial RBAC: ClusterRoles and ClusterRoleBindings for each -provider controller, ServiceAccounts, and webhook configurations. All of this -must pass through Guardian's third-party RBAC intake protocol before CAPI -controllers start. +Platform resolves the S3 backup destination at RunnerConfig creation time. Conductor and the etcd-backup Job receive the resolved Secret reference via RunnerConfig and perform no S3 resolution themselves. -The enable phase order is: -1. Guardian (CRD-only phase, webhook operational) -2. cert-manager (RBAC via Guardian intake) -3. Kueue (RBAC via Guardian intake) -4. CNPG (RBAC via Guardian intake, Guardian transitions to phase 2) -5. CAPI core (RBAC via Guardian intake) -6. CABPT (RBAC via Guardian intake) -7. metallb (RBAC via Guardian intake) -8. local-path-provisioner (RBAC via Guardian intake) -9. Platform (RBACProfile provisioned by Guardian, then controller starts) -10. Wrapper (RBACProfile provisioned, then controller starts) +Resolution order: -No CAPI component starts until Guardian has processed its RBACProfile and -set provisioned=true. +1. Explicit reference on the EtcdMaintenance CR (spec.etcdBackupS3SecretRef): if present, use this Secret.
+2. Platform-wide default Secret (seam-etcd-backup-config in seam-system): if no explicit reference is present and this Secret exists, use it. +3. Absent condition: if neither exists, platform sets EtcdBackupDestinationAbsent on the EtcdMaintenance CR with status=True and does not emit a RunnerConfig. Silent failure is never permitted. + +Local PVC fallback: permitted only as a visible degraded mode when spec.pvcFallbackEnabled=true. Platform sets EtcdBackupLocalFallback condition with status=True and the CR status explicitly states the backup is non-durable. + +S3 path structure within the bucket: `etcd-backup/{cluster-uid}/` where cluster-uid is the TalosCluster UID. UIDs are immutable and globally unique across clusters. + +S3 Secret key contract: Both MinIO/Scality camelCase keys (accessKeyID, secretAccessKey, region, endpoint) and AWS SDK env var names (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, S3_REGION, S3_ENDPOINT) are accepted. The reconciler normalizes to AWS SDK env var form and writes a projected Secret named `{em.Name}-s3-env` in em.Namespace owned by the EtcdMaintenance CR. The executor Job consumes this projected Secret as environment variables via envFrom. Cross-namespace secret projection: source Secret may reside in seam-system while the executor Job runs in seam-tenant-{cluster-name}. --- -## 9. MachineConfig Storage Contract - -**LOCKED INVARIANT - Platform Governor directive 2026-04-05.** - -For native and imported Seam clusters (`spec.capi.enabled=false`), Platform operator -is the sole owner of machineconfig generation and storage. This applies to the -management cluster and to any cluster onboarded via the import path. - -**Secret naming convention:** -Machineconfigs are stored as Kubernetes Secrets in the management cluster, one Secret -per node, using the naming convention: - - seam-mc-{cluster-name}-{node-name} - -in the `seam-tenant-{cluster-name}` namespace.
- -**Provisioning by mode:** - -`spec.mode=bootstrap`: Platform generates machineconfig Secrets from `InfrastructureTalosCluster` -spec at bootstrap time. The Compiler emits only the `InfrastructureTalosCluster` CR; it does -not emit machineconfig Secrets for bootstrap clusters. Platform applies `HardeningProfile` patches -on top of the generated base config when `spec.hardeningProfileRef` is set (PLATFORM-BL-HARDENINGPROFILE-MERGE, -pending schema amendment to add node topology fields to `InfrastructureTalosClusterSpec`). -Until that schema amendment lands, the Compiler bootstrap subcommand continues to emit machineconfig -Secrets for management-cluster bootstrap to preserve the existing bootstrap Job path. - -`spec.mode=import`: Platform captures machineconfig Secrets from the running cluster via the Talos -COSI API (`/system/state/config.yaml`) immediately after the kubeconfig Secret is generated. Platform -uses the talosconfig Secret (compiler-emitted, admin-applied before the TalosCluster CR) to authenticate, -lists nodes from the running cluster via kubeconfig, and reads the machineconfig from each node. -The Compiler emits only the `InfrastructureTalosCluster` CR and the talosconfig Secret for import clusters. -It does not emit machineconfig Secrets for import clusters. (PLATFORM-BL-MACHINECONFIG-IMPORT-CAPTURE tracks -the platform-side implementation.) - -**Lifecycle:** -After initial capture (import mode) or generation (bootstrap mode), Platform reconciles -these Secrets when node configuration changes -- for example when a HardeningProfile is -updated or a machineconfig patch is applied via NodeMaintenance. - -**Namespace authority:** -CP-INV-004: Platform is the sole namespace creation authority for `seam-tenant-{cluster-name}` -for bootstrap and CAPI-managed cluster modes. 
For mode=import, the Compiler bootstrap bundle -includes a `seam-tenant-namespace.yaml` manifest so the admin can apply the namespace (and -Secrets that live in it) before the TalosCluster CR in a single `kubectl apply -f` run. -Platform's `ensureTenantNamespace` call in the import reconcile path is idempotent -- it -creates the namespace if absent (handles re-reconcile or manual deletion) but does not race -with the bootstrap bundle application. For mode=bootstrap and CAPI: Platform creates the -namespace in the reconcile path with no bootstrap bundle assist needed. - -**Design rationale:** -This mirrors the CAPI bootstrap provider secret pattern intentionally. The CAPI path -stores machineconfigs as bootstrap data Secrets managed by CABPT. The native path -stores them as Seam-named Secrets managed by Platform. The operational model is -consistent regardless of provisioning path: a named Secret per node holds the node's -current authoritative machineconfig. - -No other operator or Conductor capability handler owns these Secrets. -A machineconfig Secret owned by Platform must never be modified by any other component. -This invariant has no exceptions and requires a Platform Governor constitutional -amendment to change. +## 9. Conductor Capability Dispatch + +Platform generates a RunnerConfig using the shared runner library (CP-INV-003) and submits a Conductor executor Job. The RunnerConfig targets a named capability. 
The mapping from CRD to capability is: + +| CRD | Operation | Conductor capability | +|-----|-----------|---------------------| +| EtcdMaintenance | backup | etcd-backup | +| EtcdMaintenance | restore | etcd-restore | +| EtcdMaintenance | defrag | etcd-defrag | +| NodeMaintenance | patch | node-patch | +| NodeMaintenance | hardening-apply | hardening-apply | +| NodeMaintenance | credential-rotate | credential-rotate | +| NodeOperation | scale-up | node-scale-up (non-CAPI path) | +| NodeOperation | decommission | node-decommission (non-CAPI path) | +| NodeOperation | reboot | node-reboot (non-CAPI path) | +| PKIRotation | (any) | pki-rotate | +| ClusterReset | (any) | cluster-reset | +| UpgradePolicy | talos | talos-upgrade (non-CAPI path) | +| UpgradePolicy | kubernetes | kube-upgrade (non-CAPI path) | +| UpgradePolicy | stack | stack-upgrade (non-CAPI path) | +| TalosMachineConfigBackup | (any) | machineconfig-backup | +| TalosMachineConfigRestore | (any) | machineconfig-restore | +| MaintenanceBundle | drain | drain | +| MaintenanceBundle | upgrade | upgrade | +| MaintenanceBundle | etcd-backup | etcd-backup | +| MaintenanceBundle | machineconfig-rotation | machineconfig-rotation | + +Dual-path CRDs (UpgradePolicy, NodeOperation, ClusterMaintenance) do NOT submit a Conductor Job on the CAPI path. Platform must check capi.enabled on the owning TalosCluster before deciding which path to take. + +Kueue is not used for any platform Job submission (CP-INV-010). --- -## 10. Etcd Backup Destination Contract - -**LOCKED INVARIANT - Platform Governor directive 2026-04-05.** - -Platform operator resolves the S3 backup destination at RunnerConfig creation time - -never deferred to Conductor or the Job. Resolution is deterministic, ordered, and -fails fast with a structured condition rather than silently proceeding without a -destination. - -**Resolution order (evaluated at RunnerConfig creation time):** - -1. 
**Explicit reference on TalosCluster CR**: if the TalosCluster spec carries an - explicit S3 config Secret reference (`spec.etcdBackupS3SecretRef`), Platform uses - that Secret. No further lookup is performed. - -2. **Platform-wide default Secret**: if no explicit reference is present, Platform - looks for a Secret named `seam-etcd-backup-config` in the `seam-system` namespace. - If found, it is used as the S3 configuration source. - -3. **Absent condition**: if neither the explicit reference nor the platform-wide - default Secret exists, Platform sets the condition `EtcdBackupDestinationAbsent` - on the EtcdMaintenance CR with `status=True` and does not emit a RunnerConfig. - The EtcdMaintenance CR remains in a pending state until a valid Secret is provided. - Silent failure is never permitted - the condition must always be set and observable. - -**Local PVC fallback (non-durable degraded mode):** -A local PVC fallback is permitted as a last-resort, non-durable mode only. If the -operator configuration explicitly enables PVC fallback, Platform sets the condition -`EtcdBackupLocalFallback` on the EtcdMaintenance CR with `status=True`. The CR status -must explicitly state: "Backup is non-durable - PVC-backed storage does not survive -node failure or cluster destruction." PVC fallback is not a substitute for S3. It is -a visible degraded mode, not a transparent default. - -**S3 path structure within the bucket:** - - etcd-backup/{cluster-id}/ - -where `{cluster-id}` is the TalosCluster UID, not the name. UIDs are immutable and -globally unique across clusters. This ensures backup paths survive cluster rename and -remain unambiguous when multiple clusters write to the same bucket. - -**Invariant boundary:** -Conductor and the etcd-backup Job receive the resolved Secret reference via RunnerConfig. -They perform no S3 destination resolution themselves. A Conductor execute-mode Job that -independently resolves an S3 destination is an invariant violation. 
- -**S3 secret key contract (admin responsibility):** - -The admin creates the S3 credentials Secret before any EtcdMaintenance CR is submitted. -The Secret may use either of two key naming conventions; both are accepted and normalized -by the Platform reconciler before the executor Job is created: - -| Provider style | Key name | Normalized to | -|---|---|---| -| MinIO / Scality (camelCase) | `accessKeyID` | `AWS_ACCESS_KEY_ID` | -| MinIO / Scality (camelCase) | `secretAccessKey` | `AWS_SECRET_ACCESS_KEY` | -| MinIO / Scality (camelCase) | `region` | `S3_REGION` | -| MinIO / Scality (camelCase) | `endpoint` | `S3_ENDPOINT` (optional) | -| AWS SDK env var | `AWS_ACCESS_KEY_ID` | `AWS_ACCESS_KEY_ID` | -| AWS SDK env var | `AWS_SECRET_ACCESS_KEY` | `AWS_SECRET_ACCESS_KEY` | -| AWS SDK env var | `S3_REGION` | `S3_REGION` | -| AWS SDK env var | `S3_ENDPOINT` | `S3_ENDPOINT` (optional) | - -`accessKeyID`, `secretAccessKey`, and `region` (or their AWS SDK equivalents) are -required. `endpoint` / `S3_ENDPOINT` is optional and must be omitted for native AWS S3. -If any required key is absent, reconcile halts with `EtcdBackupDestinationAbsent`. - -**Cross-namespace secret projection:** - -The source Secret may reside in `seam-system` while the executor Job runs in -`seam-tenant-{cluster}`. Kubernetes does not permit `envFrom` across namespaces. -The reconciler reads the source Secret, normalizes its keys to the canonical AWS SDK -env var names listed above, and writes a projected copy named `{em.Name}-s3-env` -into `em.Namespace`. The projected Secret carries an ownerReference to the -EtcdMaintenance CR and is garbage-collected automatically when the CR is deleted. -The executor Job mounts the projected Secret via `envFrom` so the Conductor binary -reads `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, `S3_REGION`, and optionally -`S3_ENDPOINT` from its environment. +## 10. 
RunnerConfig Generation Protocol + +RunnerConfig is generated by platform using the shared runner library for all operational Job CRDs. It is never hand-coded (CP-INV-003). It is not generated for CAPI-managed lifecycle operations. + +The RunnerConfig carries the pre-resolved capability name, S3 configuration (for operations that require it), target cluster kubeconfig and talosconfig Secret references, target node list, operator leader node, and any operation-specific parameters. + +Platform reads the RunnerConfig.agentImage field to determine the Conductor image tag to use for the executor Job. The Conductor image tag must match the cluster's Talos version (INV-012). + +For import-mode clusters, Platform drives a two-site onboarding sequence that includes deploying Conductor agent mode (role=tenant) in ont-system on the tenant cluster. See guardian-schema.md for the full handshake protocol. --- -## 11. Cross-Domain Rules +## 11. Conductor Deployment Contract + +Platform is exclusively responsible for deploying Conductor agent mode onto every tenant cluster it forms. This happens after TalosCluster formation reaches the readiness threshold and before marking the cluster fully Ready. -Reads: security.ontai.dev/RBACProfile status (gate check). -Reads: infrastructure.ontai.dev/InfrastructureClusterPack (validate Cilium pack reference in InfrastructureTalosCluster). -Reads: infrastructure.ontai.dev/InfrastructurePackInstance (gate Cilium PackExecution on Ready). -Owns: cluster.x-k8s.io/Cluster and all CAPI child objects for target clusters. -Owns: SeamInfrastructureCluster, SeamInfrastructureMachine in tenant namespaces. -Creates: tenant namespaces - sole authority. -Never writes to security.ontai.dev or infrastructure.ontai.dev CRDs outside InfrastructureTalosCluster and InfrastructureRunnerConfig. +- Platform creates exactly one Conductor Deployment per tenant cluster, in ont-system on that cluster. +- The Deployment must carry role=tenant as a first-class field. 
An absent or incorrect role causes Conductor to exit with InvariantViolation. +- Platform does not deploy Conductor to the management cluster. `compiler enable` is the sole authority for the management cluster Conductor Deployment (role=management). +- If the Conductor Deployment is deleted from a tenant cluster's ont-system, Platform must recreate it on the next TalosClusterReconciler reconcile cycle. +- The Conductor image tag must match RunnerConfig.agentImage for this cluster. --- -## 12. Conductor Deployment Contract - -**LOCKED INVARIANT - Platform Governor directive 2026-04-05.** - -Platform operator is responsible for deploying Conductor agent mode onto every tenant -cluster it forms, as part of cluster formation reconciliation. This responsibility is -exclusive - no other component deploys Conductor to tenant clusters. - -**When Platform deploys Conductor to a tenant cluster:** -After TalosCluster formation reaches the readiness threshold and before marking the -cluster fully Ready, Platform's TalosClusterReconciler creates a Conductor agent -Deployment in ont-system on the target cluster. This Deployment is constructed using -the target cluster's kubeconfig mounted from the tenant's kubeconfig Secret. - -**Role stamp requirement:** -The Conductor Deployment created by Platform for any tenant cluster **must** carry -`role=tenant` as a first-class field on the Deployment. This is not an annotation, -not an environment variable, and not a label. It is a named field. - -Conductor reads this field at startup to determine which loops to activate. An absent -or incorrect role causes Conductor to exit with InvariantViolation. Platform is -solely responsible for correct role stamping. See conductor-schema.md §15. - -**Invariants:** -- Platform creates exactly one Conductor Deployment per tenant cluster, in ont-system - on that cluster, using the cluster's kubeconfig Secret. -- The Deployment is created with role=tenant. Any other value is a programming error. 
-- Platform does not deploy Conductor to the management cluster. `compiler enable` - is the sole authority for the management cluster Conductor Deployment (role=management). -- If the Conductor Deployment is deleted from a tenant cluster's ont-system, Platform - must recreate it on the next TalosClusterReconciler reconcile cycle. -- The Conductor image tag used must match the RunnerConfig.agentImage for this cluster. - Platform reads agentImage from RunnerConfig before creating the Deployment. - -**Import-mode cluster specifics:** -For clusters with `spec.mode: import`, Platform drives an additional two-site onboarding -sequence beyond the Conductor Deployment. The complete sequence is specified in -guardian-schema.md §20 and includes: - -1. Create seam-tenant-{clusterName} namespace on the management cluster (CP-INV-004). -2. Store tenant kubeconfig Secret in seam-tenant-{clusterName}. -3. Create ont-system namespace on the tenant cluster. -4. Create conductor ServiceAccount in ont-system on the tenant cluster. -5. Create conductor Deployment (role=tenant) in ont-system on the tenant cluster. -6. Create conductor RBACProfile in ont-system on the tenant cluster (Seam operator - profile, rbacPolicyRef: management-policy, permissionSetRef: management-maximum). -7. Observe PermissionSnapshotReceipt acknowledgement from the management conductor - (written to InfrastructureTalosCluster.status.conductorHandshake). -8. Advance InfrastructureTalosCluster.status.phase to Operational on acknowledgement. - -Platform sets InfrastructureTalosCluster.status.phase: ConductorPending when the -Deployment is created. Phase does not advance until the gRPC handshake completes. -See guardian-schema.md §20 for the full handshake protocol and PermissionSnapshot -delivery sequence. +## 12. MachineConfig Storage Contract + +For native and imported clusters (capi.enabled=false), Platform is the sole owner of machineconfig generation and storage. 
+ +Naming convention: `seam-mc-{cluster-name}-{node-name}` in seam-tenant-{cluster-name}. + +mode=bootstrap: Platform generates machineconfig Secrets from TalosClusterSpec at bootstrap time. Platform applies HardeningProfile patches on top of the base config when spec.hardeningProfileRef is set. + +mode=import: Platform captures machineconfig Secrets from the running cluster via the Talos COSI API (/system/state/config.yaml) immediately after kubeconfig Secret generation. Platform uses the talosconfig Secret to authenticate, lists nodes via kubeconfig, and reads the machineconfig from each node. + +No other operator or Conductor capability handler owns these Secrets. A machineconfig Secret owned by Platform must never be modified by any other component. --- ## 13. PKI Rotation Contract -**PKI rotation automation -- session/17, 2026-05-02.** +Imported Talos clusters carry two sets of short-lived certificates stored in Secrets: admin kubeconfig (Kubernetes client cert) and talosconfig client cert. + +Spec fields on TalosCluster: pkiRotationThresholdDays (int32, default 30, minimum 1). + +Status fields on TalosCluster: pkiExpiryDate (*metav1.Time) -- earliest certificate expiry across both Secrets. + +Auto-rotation: when pkiExpiryDate is within pkiRotationThresholdDays days, the reconciler creates a PKIRotation CR with label pki-trigger=auto. Idempotent: skips if a PKIRotation CR already exists for this cluster and is not yet complete or failed. + +On-demand rotation: annotation `platform.ontai.dev/rotate-pki=true` on TalosCluster. Reconciler creates a PKIRotation CR with label pki-trigger=manual, then clears the annotation via Patch. -Imported Talos clusters carry two sets of short-lived certificates stored in Secrets: -- Admin kubeconfig (Kubernetes client cert, ~1 year TTL): `seam-mc-{cluster}-kubeconfig` in `seam-tenant-{cluster}`, key `value`. -- Talosconfig client cert: `seam-mc-{cluster}-talosconfig` in `seam-tenant-{cluster}`, key `talosconfig`. 
+PKI expiry check runs only for stable-Ready clusters. Stable-Ready clusters are requeued every 24 hours for daily expiry monitoring. -When these expire, the platform operator and Conductor executor lose the ability to connect to the cluster. +--- + +## 14. Cross-Domain Rules + +Platform reads: guardian.ontai.dev/RBACProfile status (gate check before starting). +Platform reads: dispatcher PackInstalled status (gate Cilium PackExecution on Ready). +Platform owns: cluster.x-k8s.io/Cluster and all CAPI child objects for target clusters. +Platform owns: SeamInfrastructureCluster, SeamInfrastructureMachine in tenant namespaces. +Platform creates: seam-tenant-{cluster-name} namespaces (sole authority, CP-INV-004). +Platform never writes to guardian.ontai.dev CRDs. +Platform never writes to dispatcher.ontai.dev CRDs. + +--- -**Spec fields (InfrastructureTalosCluster, seam-core):** -- `spec.pkiRotationThresholdDays` (int32, default 30, minimum 1): days before cert expiry to auto-trigger PKI rotation. +## 15. Decision Records -**Status fields (InfrastructureTalosCluster, seam-core):** -- `status.pkiExpiryDate` (*metav1.Time): earliest certificate expiry across both Secrets. Written by TalosCluster reconciler. +**Decision H -- TalosCluster is the Seam root CR.** All CAPI objects for target clusters are children of TalosCluster via ownerReference. CAPI objects do not exist without a TalosCluster parent. -**Triggers:** -1. Annotation `platform.ontai.dev/rotate-pki=true` on InfrastructureTalosCluster: on-demand rotation. The reconciler creates a PKIRotation CR with label `pki-trigger=manual` in `seam-tenant-{cluster}`, then clears the annotation via Patch. -2. Auto-rotation: when `status.pkiExpiryDate` is within `spec.pkiRotationThresholdDays` days of the current time, the reconciler creates a PKIRotation CR with label `pki-trigger=auto`. Idempotent: skips if a PKIRotation CR for this cluster already exists and is not yet complete or failed. 
+**Decision I -- Deletion of TalosCluster never destroys a cluster.** Kubernetes garbage collection cascades to owned CAPI objects, which triggers CAPI's own deletion reconciliation, but this does not factory reset nodes. ClusterReset is the only destruction path (INV-015). -**Reconcile loop integration:** -PKI expiry check runs in Step F of `Reconcile()` only for stable-Ready clusters (clusters that had `Ready=True` before the current reconcile pass). Step F does NOT run during the first-pass Ready transition to avoid overriding the clean result returned by routing functions. +**Decision J -- CiliumPending is not degraded.** The window between CAPI cluster Running and Cilium PackInstance Ready is expected. Nodes are NotReady during this window. The MachineHealthCheck tolerance window must be configured to avoid spurious remediation during Cilium installation (CP-INV-013). -Stable-Ready clusters are requeued every 24 hours for daily expiry monitoring. +**Decision K -- Kueue is not used for any platform operation.** Operational runner Jobs submit directly. Kueue governs dispatcher pack-deploy Jobs exclusively. This applies permanently; the decision is locked (CP-INV-010). -**Conductor execute-mode behavior (pkiRotateHandler):** -After the staged machine config apply succeeds, `pkiRotateHandler.Execute()` calls `TalosClient.Kubeconfig()` to generate a fresh kubeconfig and writes it to both `seam-mc-{cluster}-kubeconfig` and `target-cluster-kubeconfig` in `seam-tenant-{cluster}` via the dynamic client. Kubeconfig refresh is best-effort: if it fails, the operation result is still `Succeeded` because the staged config apply is the critical step. The failure is recorded in the step results with a note. +**Decision L -- S3 destination is resolved at RunnerConfig creation time.** Conductor never performs S3 resolution. A Conductor execute-mode Job that independently resolves an S3 destination is an invariant violation. 
-**Implementation files:** -- `platform/internal/controller/pki_cert_helpers.go`: cert expiry detection, Secret reading, PKIRotation CR creation. -- `conductor/internal/capability/platform_security.go`: `pkiRotateHandler.Execute()` with kubeconfig refresh. -- `conductor/internal/capability/clients.go`: `TalosNodeClient.Kubeconfig()` interface method. -- `conductor/internal/capability/adapters.go`: `TalosClientAdapter.Kubeconfig()` adapter. +**Decision M -- Platform is the sole Conductor deployer for tenant clusters.** No other component deploys Conductor to tenant clusters. Role must be stamped as role=tenant. Incorrect or absent role is an InvariantViolation (platform-schema.md §11). --- -*platform.ontai.dev schema - Platform* -*Amendments:* -*2026-03-30 - CAPI adopted for target cluster lifecycle. Seam Infrastructure Provider* -* introduced. SeamInfrastructureMachine and SeamInfrastructureCluster CRDs added.* -* TalosUpgrade, TalosKubeUpgrade, TalosStackUpgrade, TalosNodeScaleUp,* -* TalosNodeDecommission, TalosReboot replaced by CAPI equivalents.* -* Kueue scoped to Wrapper pack-deploy Jobs only.* -* TalosNoMaintenance integrated with CAPI pause mechanism.* -* Cilium CNI integration documented. CiliumPending condition added to TalosCluster.* -* Management cluster bootstrap unchanged - CAPI not applicable.* - -*2026-03-30 - Section 6 retitled "CRDs Delegated to CAPI for Target Clusters"* -* (Path B ruling). 
Six lifecycle CRDs retained with dual-path semantics:* -* CAPI-native for spec.capi.enabled=true (target clusters), direct conductor(mode: execute) Job via* -* OperationalJobReconciler for spec.capi.enabled=false (management cluster).* -* Named conductor capability references restored for all six entries.* - -*2026-04-03 - Operator rename: Platform (formerly platform), Guardian (formerly* -* guardian), Wrapper (formerly wrapper), Conductor [Compiler, Conductor (formerly conductor).*] -* CAPI infrastructure CRDs renamed: SeamInfrastructureCluster (formerly* -* SeamInfrastructureCluster), SeamInfrastructureMachine (formerly* -* SeamInfrastructureMachine). API group infrastructure.cluster.x-k8s.io unchanged.* -* TalosControlPlane and TalosWorkerConfig added as dual-mode CRDs with explicit* -* compile-time and runtime semantics documented. Sixteen day-two CRDs consolidated* -* into eight: EtcdMaintenance, NodeMaintenance, PKIRotation, ClusterReset,* -* HardeningProfile, UpgradePolicy, NodeOperation, ClusterMaintenance. LicenseKey* -* removed - Seam is fully open source with no licensing tier.* - -*2026-04-05 - Section 9 "MachineConfig Storage Contract" added: locked invariant.* -* Platform is sole owner of machineconfig Secrets for native/imported clusters.* -* Naming convention seam-mc-{cluster-name}-{node-name} in seam-tenant-{cluster-name}.* -* Mirrors CAPI bootstrap provider pattern. No other component may modify these Secrets.* -* Section 10 "Etcd Backup Destination Contract" added: locked invariant.* -* S3 resolution hierarchy: explicit TalosCluster ref → seam-etcd-backup-config in* -* seam-system → EtcdBackupDestinationAbsent condition (no RunnerConfig emitted).* -* Local PVC fallback permitted only as visible degraded mode (EtcdBackupLocalFallback* -* condition, non-durable status explicit). S3 path: etcd-backup/{cluster-uid}/.* -* Conductor never performs S3 destination resolution. 
Section 11 renumbered from 9.* - -*2026-04-05 - Section 12 "Conductor Deployment Contract" added: locked invariant.* -* Platform operator is exclusively responsible for deploying Conductor agent mode* -* onto every tenant cluster it forms. Deployment created in ont-system on target* -* cluster using cluster's kubeconfig Secret. role=tenant must be stamped as a* -* first-class field. Absent/incorrect role causes InvariantViolation exit in Conductor.* -* Platform does not deploy Conductor to the management cluster - compiler enable owns* -* that Deployment (role=management). Platform must recreate Deployment on deletion.* -* Conductor image tag must match RunnerConfig.agentImage for the cluster.* - -*2026-04-26 - Section 12 extended: import-mode cluster specifics added. For* -* spec.mode=import clusters, Platform drives a two-site onboarding sequence including* -* namespace creation on both clusters, conductor RBACProfile creation in ont-system* -* (Seam operator profile referencing management-policy/management-maximum), and* -* phase advancement on PermissionSnapshotReceipt acknowledgement. Full sequence* -* specified in guardian-schema.md §20.* - -*2026-04-26 - Section 9 corrected: mode-specific machineconfig provisioning contract* -* added. 
mode=import: Platform captures machineconfigs from running cluster via Talos* -* COSI API after kubeconfig generation (PLATFORM-BL-MACHINECONFIG-IMPORT-CAPTURE).* -* mode=bootstrap: Platform generates machineconfigs from InfrastructureTalosCluster* -* spec (pending schema amendment PLATFORM-BL-HARDENINGPROFILE-MERGE for node topology).* -* Namespace authority corrected: CP-INV-004 applies to bootstrap/CAPI modes.* -* For mode=import, Compiler bootstrap bundle includes seam-tenant-namespace.yaml so* -* the admin can apply Secrets and TalosCluster CR in a single kubectl apply run.* -* ensureTenantNamespace in the import reconcile path is idempotent safety net only.* - -*2026-05-02 - Section 10 extended: S3 secret key contract and cross-namespace projection* -* added. Admin creates seam-etcd-backup-config in seam-system before submitting any* -* EtcdMaintenance CR. Both provider key conventions accepted: MinIO/Scality camelCase* -* (accessKeyID, secretAccessKey, region, endpoint) and AWS SDK env var names* -* (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, S3_REGION, S3_ENDPOINT). Reconciler* -* normalizes to AWS SDK env var form and writes a projected secret {em.Name}-s3-env* -* in em.Namespace owned by the EtcdMaintenance CR. Executor Job mounts via envFrom.* -* s3_env_secret.go added to platform/internal/controller.* +*platform.ontai.dev schema -- platform operator* +*Amended 2026-05-13: Full rewrite. seam-core references corrected to seam. TalosCluster and ClusterLog placed under seam.ontai.dev. All platform.ontai.dev types documented from current Go sources. Kueue scope corrected (dispatcher, not platform). wrapper references corrected to dispatcher. 
All stale type names removed.* diff --git a/go.mod b/go.mod index ba8a2a9..34a7891 100644 --- a/go.mod +++ b/go.mod @@ -4,13 +4,16 @@ go 1.25.3 replace github.com/ontai-dev/conductor => ../conductor -replace github.com/ontai-dev/seam-core => ../seam-core +replace github.com/ontai-dev/seam => ../seam + +replace github.com/ontai-dev/seam-sdk => ../seam-sdk require ( github.com/Masterminds/semver/v3 v3.4.0 github.com/onsi/ginkgo/v2 v2.27.2 github.com/onsi/gomega v1.38.2 - github.com/ontai-dev/seam-core v0.1.0-alpha.0.20260425084313-fa4bedc389f6 + github.com/ontai-dev/seam v0.0.0-00010101000000-000000000000 + github.com/ontai-dev/seam-sdk v0.0.0-00010101000000-000000000000 github.com/siderolabs/talos/pkg/machinery v1.12.6 k8s.io/api v0.35.3 k8s.io/apimachinery v0.35.3 diff --git a/graphify-out/.graphify_python b/graphify-out/.graphify_python new file mode 100644 index 0000000..acab316 --- /dev/null +++ b/graphify-out/.graphify_python @@ -0,0 +1 @@ +/home/saigha01/.local/share/pipx/venvs/graphifyy/bin/python \ No newline at end of file diff --git a/internal/controller/driftsignal_reconciler.go b/internal/controller/driftsignal_reconciler.go index b607e67..25af7e1 100644 --- a/internal/controller/driftsignal_reconciler.go +++ b/internal/controller/driftsignal_reconciler.go @@ -16,19 +16,23 @@ import ( ctrllog "sigs.k8s.io/controller-runtime/pkg/log" platformv1alpha1 "github.com/ontai-dev/platform/api/v1alpha1" - seamcorev1alpha1 "github.com/ontai-dev/seam-core/api/v1alpha1" + seamplatformv1alpha1 "github.com/ontai-dev/platform/api/seam/v1alpha1" + seamcorev1alpha1 "github.com/ontai-dev/seam/api/v1alpha1" ) // DriftSignalReconciler handles cluster-state DriftSignals written by conductor role=tenant. // -// Two signal kinds are handled: +// Three signal kinds are handled: // // - InfrastructureRunnerConfig (T-23): conductor detected RunnerConfig persistently absent. // Response: annotate TalosCluster to trigger RunnerConfig recreation. 
// -// - InfrastructureTalosCluster: conductor detected Talos version drift (out-of-band upgrade -// on the tenant cluster). Response: patch TalosCluster.status.observedTalosVersion, -// write a synthetic out-of-band TCOR record, bump TCOR revision epoch to observed version. +// - TalosCluster (name prefix "drift-version-"): Talos OS version drift. +// Response: patch TalosCluster.status.observedTalosVersion, write out-of-band TCOR +// record, bump TCOR epoch, create corrective UpgradePolicy (type=talos). +// +// - TalosCluster (name prefix "drift-k8s-version-"): Kubernetes version drift. +// Response: create corrective UpgradePolicy (type=kubernetes) targeting spec.kubernetesVersion. // // conductor DriftSignalHandler skips InfrastructureTalosCluster kind signals; they are // owned exclusively by this reconciler. @@ -40,7 +44,7 @@ type DriftSignalReconciler struct { // // +kubebuilder:rbac:groups=infrastructure.ontai.dev,resources=driftsignals,verbs=get;list;watch;update;patch // +kubebuilder:rbac:groups=infrastructure.ontai.dev,resources=infrastructuretalosclusters,verbs=get;list;watch;update;patch -// +kubebuilder:rbac:groups=infrastructure.ontai.dev,resources=infrastructuretalosclusteroperationresults,verbs=get;list;watch;update;patch +// +kubebuilder:rbac:groups=seam.ontai.dev,resources=clusterlogs,verbs=get;list;watch;update;patch func (r *DriftSignalReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { log := ctrllog.FromContext(ctx).WithValues("driftsignal", req.NamespacedName) @@ -65,7 +69,10 @@ func (r *DriftSignalReconciler) Reconcile(ctx context.Context, req ctrl.Request) switch ds.Spec.AffectedCRRef.Kind { case "InfrastructureRunnerConfig": return r.handleRunnerConfigDrift(ctx, log, ds, clusterName) - case "InfrastructureTalosCluster": + case "TalosCluster": + if strings.HasPrefix(ds.Name, "drift-k8s-version-") { + return r.handleKubernetesVersionDrift(ctx, log, ds, clusterName) + } return 
r.handleTalosVersionDrift(ctx, log, ds, clusterName) default: // Other kinds are handled by conductor DriftSignalHandler (pack drift). @@ -189,26 +196,26 @@ func (r *DriftSignalReconciler) patchObservedTalosVersion(ctx context.Context, t // included in the archived revision. func (r *DriftSignalReconciler) appendOutOfBandTCORRecord(ctx context.Context, clusterName, specVersion, observedVersion string) error { ns := tenantNS(clusterName) - tcor := &seamcorev1alpha1.InfrastructureTalosClusterOperationResult{} + tcor := &seamplatformv1alpha1.ClusterLog{} if err := r.Client.Get(ctx, types.NamespacedName{Name: clusterName, Namespace: ns}, tcor); err != nil { if apierrors.IsNotFound(err) { - // TCOR does not exist yet -- nothing to append to; bumpTCORRevision will create it. + // ClusterLog does not exist yet -- nothing to append to; bumpTCORRevision will create it. return nil } - return fmt.Errorf("get TCOR %s/%s: %w", ns, clusterName, err) + return fmt.Errorf("get ClusterLog %s/%s: %w", ns, clusterName, err) } patch := client.MergeFrom(tcor.DeepCopy()) if tcor.Spec.Operations == nil { - tcor.Spec.Operations = map[string]seamcorev1alpha1.TalosClusterOperationRecord{} + tcor.Spec.Operations = map[string]seamplatformv1alpha1.OperationRecord{} } now := metav1.Now() recordKey := fmt.Sprintf("out-of-band-%d", now.UnixNano()) - tcor.Spec.Operations[recordKey] = seamcorev1alpha1.TalosClusterOperationRecord{ + tcor.Spec.Operations[recordKey] = seamplatformv1alpha1.OperationRecord{ Capability: "talos-version-drift", StartedAt: &now, CompletedAt: &now, - Status: seamcorev1alpha1.TalosClusterResultSucceeded, + Status: seamplatformv1alpha1.ResultSucceeded, Message: fmt.Sprintf("talos version changed outside ONT management: %s -> %s", specVersion, observedVersion), } tcor.Spec.OperationCount = int64(len(tcor.Spec.Operations)) @@ -264,6 +271,55 @@ func (r *DriftSignalReconciler) ensureCorrectiveUpgradePolicy(ctx context.Contex return nil } +// handleKubernetesVersionDrift handles 
DriftSignals emitted by KubernetesVersionDriftLoop. +// It creates a corrective UpgradePolicy (type=kubernetes) targeting the declared +// spec.kubernetesVersion so UpgradePolicyReconciler can submit a kube-upgrade executor Job. +func (r *DriftSignalReconciler) handleKubernetesVersionDrift(ctx context.Context, log logr.Logger, ds *seamcorev1alpha1.DriftSignal, clusterName string) (ctrl.Result, error) { + log.Info("handling Kubernetes version drift", + "cluster", clusterName, "driftReason", ds.Spec.DriftReason) + + tc, err := r.getTalosCluster(ctx, clusterName) + if err != nil { + return ctrl.Result{}, err + } + if tc == nil { + log.Info("TalosCluster not found -- marking queued to stop retries", "cluster", clusterName) + return ctrl.Result{}, r.advanceDriftSignalToQueued(ctx, ds) + } + + if err := r.ensureCorrectiveKubeUpgradePolicy(ctx, clusterName, tc.Spec.KubernetesVersion); err != nil { + return ctrl.Result{}, fmt.Errorf("DriftSignalReconciler: ensure corrective kube UpgradePolicy %s: %w", clusterName, err) + } + log.Info("corrective kube UpgradePolicy ensured", + "cluster", clusterName, "targetVersion", tc.Spec.KubernetesVersion) + + return ctrl.Result{}, r.advanceDriftSignalToQueued(ctx, ds) +} + +// ensureCorrectiveKubeUpgradePolicy creates an UpgradePolicy in seam-tenant-{cluster} to +// bring the cluster back to specVersion (the declared spec.kubernetesVersion). Idempotent. 
+func (r *DriftSignalReconciler) ensureCorrectiveKubeUpgradePolicy(ctx context.Context, clusterName, specVersion string) error { + ns := tenantNS(clusterName) + up := &platformv1alpha1.UpgradePolicy{ + ObjectMeta: metav1.ObjectMeta{ + Name: "drift-k8s-version-" + clusterName, + Namespace: ns, + }, + Spec: platformv1alpha1.UpgradePolicySpec{ + ClusterRef: platformv1alpha1.LocalObjectRef{ + Name: clusterName, + Namespace: rbacProfileNamespace, + }, + UpgradeType: platformv1alpha1.UpgradeTypeKubernetes, + TargetKubernetesVersion: specVersion, + }, + } + if err := r.Client.Create(ctx, up); err != nil && !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("create UpgradePolicy drift-k8s-version-%s: %w", clusterName, err) + } + return nil +} + // extractObservedVersion parses the observed talos version from a driftReason string // produced by TalosVersionDriftLoop. Format: "talos version drift: spec={x} observed={y}". func extractObservedVersion(driftReason string) string { diff --git a/internal/controller/driftsignal_reconciler_test.go b/internal/controller/driftsignal_reconciler_test.go index 5963df0..cbe6797 100644 --- a/internal/controller/driftsignal_reconciler_test.go +++ b/internal/controller/driftsignal_reconciler_test.go @@ -13,7 +13,8 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client/fake" platformv1alpha1 "github.com/ontai-dev/platform/api/v1alpha1" - seamcorev1alpha1 "github.com/ontai-dev/seam-core/api/v1alpha1" + seamplatformv1alpha1 "github.com/ontai-dev/platform/api/seam/v1alpha1" + seamcorev1alpha1 "github.com/ontai-dev/seam/api/v1alpha1" ) // buildDriftSignalTestScheme returns a scheme for DriftSignalReconciler unit tests. 
@@ -23,6 +24,9 @@ func buildDriftSignalTestScheme(t *testing.T) *runtime.Scheme { if err := clientgoscheme.AddToScheme(s); err != nil { t.Fatalf("add clientgo scheme: %v", err) } + if err := seamplatformv1alpha1.AddToScheme(s); err != nil { + t.Fatalf("add seamplatformv1alpha1 scheme: %v", err) + } if err := seamcorev1alpha1.AddToScheme(s); err != nil { t.Fatalf("add seamcorev1alpha1 scheme: %v", err) } @@ -67,15 +71,15 @@ func fakeTalosClusterForDrift(name string) *platformv1alpha1.TalosCluster { } } -// fakeTCOR builds a minimal InfrastructureTalosClusterOperationResult for DriftSignal tests. -func fakeTCOR(clusterName, talosVersion string) *seamcorev1alpha1.InfrastructureTalosClusterOperationResult { - return &seamcorev1alpha1.InfrastructureTalosClusterOperationResult{ +// fakeTCOR builds a minimal ClusterLog for DriftSignal tests. +func fakeTCOR(clusterName, talosVersion string) *seamplatformv1alpha1.ClusterLog { + return &seamplatformv1alpha1.ClusterLog{ ObjectMeta: metav1.ObjectMeta{ Name: clusterName, Namespace: tenantNS(clusterName), ResourceVersion: "1", }, - Spec: seamcorev1alpha1.InfrastructureTalosClusterOperationResultSpec{ + Spec: seamplatformv1alpha1.ClusterLogSpec{ ClusterRef: clusterName, TalosVersion: talosVersion, Revision: 1, @@ -96,8 +100,8 @@ func fakeDriftSignalWithVersion(name, ns, specVersion, observedVersion string) * CorrelationID: "test-version-correlation-id", ObservedAt: metav1.Now(), AffectedCRRef: seamcorev1alpha1.DriftAffectedCRRef{ - Group: "infrastructure.ontai.dev", - Kind: "InfrastructureTalosCluster", + Group: "seam.ontai.dev", + Kind: "TalosCluster", Name: "ccs-dev", }, DriftReason: "talos version drift: spec=" + specVersion + " observed=" + observedVersion, @@ -296,8 +300,8 @@ func TestDriftSignalReconciler_TalosVersionDrift_FullFlow(t *testing.T) { gotTC.Status.ObservedTalosVersion, observedVersion) } - // TCOR must have been bumped to the observed version and have an out-of-band record. 
- gotTCOR := &seamcorev1alpha1.InfrastructureTalosClusterOperationResult{} + // ClusterLog must have been bumped to the observed version and have an out-of-band record. + gotTCOR := &seamplatformv1alpha1.ClusterLog{} if err := c.Get(context.Background(), types.NamespacedName{Name: clusterName, Namespace: tenantNSName}, gotTCOR); err != nil { t.Fatalf("get TCOR: %v", err) } @@ -335,6 +339,85 @@ func TestDriftSignalReconciler_TalosVersionDrift_FullFlow(t *testing.T) { } } +// TestDriftSignalReconciler_K8sVersionDrift_CreatesUpgradePolicy verifies that a pending +// DriftSignal named "drift-k8s-version-{cluster}" with kind=TalosCluster causes: +// - A corrective UpgradePolicy (type=kubernetes) targeting spec.kubernetesVersion +// - The DriftSignal advanced to queued +func TestDriftSignalReconciler_K8sVersionDrift_CreatesUpgradePolicy(t *testing.T) { + scheme := buildDriftSignalTestScheme(t) + if err := platformv1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("add platform scheme: %v", err) + } + + clusterName := "ccs-dev" + tenantNSName := tenantNS(clusterName) + signalName := "drift-k8s-version-" + clusterName + + ds := &seamcorev1alpha1.DriftSignal{ + ObjectMeta: metav1.ObjectMeta{ + Name: signalName, Namespace: tenantNSName, ResourceVersion: "1", + }, + Spec: seamcorev1alpha1.DriftSignalSpec{ + State: seamcorev1alpha1.DriftSignalStatePending, + CorrelationID: "k8s-version-ccs-dev-123", + ObservedAt: metav1.Now(), + AffectedCRRef: seamcorev1alpha1.DriftAffectedCRRef{ + Group: "seam.ontai.dev", + Kind: "TalosCluster", + Name: clusterName, + }, + DriftReason: "kubernetes version drift: spec=1.32.2 observed=1.32.3", + }, + } + + tc := fakeTalosClusterForDrift(clusterName) + tc.Spec.KubernetesVersion = "1.32.2" + + tenantNamespaceObj := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{Name: tenantNSName}, + } + + c := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(ds, tc, tenantNamespaceObj). 
+ WithStatusSubresource(&seamcorev1alpha1.DriftSignal{}). + Build() + + r := &DriftSignalReconciler{Client: c} + + _, err := r.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Name: signalName, Namespace: tenantNSName}, + }) + if err != nil { + t.Fatalf("Reconcile: %v", err) + } + + // UpgradePolicy must be created with type=kubernetes targeting spec.kubernetesVersion. + gotUP := &platformv1alpha1.UpgradePolicy{} + if err := c.Get(context.Background(), types.NamespacedName{ + Name: signalName, Namespace: tenantNSName, + }, gotUP); err != nil { + t.Fatalf("get corrective kube UpgradePolicy: %v", err) + } + if gotUP.Spec.UpgradeType != platformv1alpha1.UpgradeTypeKubernetes { + t.Errorf("UpgradePolicy.Spec.UpgradeType = %q, want %q", + gotUP.Spec.UpgradeType, platformv1alpha1.UpgradeTypeKubernetes) + } + if gotUP.Spec.TargetKubernetesVersion != "1.32.2" { + t.Errorf("UpgradePolicy.Spec.TargetKubernetesVersion = %q, want 1.32.2", + gotUP.Spec.TargetKubernetesVersion) + } + + // DriftSignal must be advanced to queued. + gotDS := &seamcorev1alpha1.DriftSignal{} + if err := c.Get(context.Background(), types.NamespacedName{Name: signalName, Namespace: tenantNSName}, gotDS); err != nil { + t.Fatalf("get DriftSignal: %v", err) + } + if gotDS.Spec.State != seamcorev1alpha1.DriftSignalStateQueued { + t.Errorf("DriftSignal.Spec.State = %q, want queued", gotDS.Spec.State) + } +} + // TestDriftSignalReconciler_TalosVersionDrift_NoParsableVersion_AdvancesToQueued verifies // that a version drift signal without a parseable observed version is still advanced to queued // (does not retry indefinitely). 
@@ -353,8 +436,8 @@ func TestDriftSignalReconciler_TalosVersionDrift_NoParsableVersion_AdvancesToQue CorrelationID: "test-no-version", ObservedAt: metav1.Now(), AffectedCRRef: seamcorev1alpha1.DriftAffectedCRRef{ - Group: "infrastructure.ontai.dev", - Kind: "InfrastructureTalosCluster", + Group: "seam.ontai.dev", + Kind: "TalosCluster", Name: clusterName, }, DriftReason: "talos version drift: no version info", diff --git a/internal/controller/operational_job_base.go b/internal/controller/operational_job_base.go index 1f7670e..828614d 100644 --- a/internal/controller/operational_job_base.go +++ b/internal/controller/operational_job_base.go @@ -24,7 +24,7 @@ import ( "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" - seamcorev1alpha1 "github.com/ontai-dev/seam-core/api/v1alpha1" + seamplatformv1alpha1 "github.com/ontai-dev/platform/api/seam/v1alpha1" ) const ( @@ -154,7 +154,7 @@ func tenantNS(clusterRef string) string { // Returns (false, false, "") when the TCOR does not yet exist or the // record has not been written yet — the Job is still running. 
func readOperationRecord(ctx context.Context, c client.Client, clusterRef, jobName string) (complete, failed bool, message string) { - tcor := &seamcorev1alpha1.InfrastructureTalosClusterOperationResult{} + tcor := &seamplatformv1alpha1.ClusterLog{} if err := c.Get(ctx, types.NamespacedName{Name: clusterRef, Namespace: tenantNS(clusterRef)}, tcor); err != nil { return false, false, "" } @@ -163,9 +163,9 @@ func readOperationRecord(ctx context.Context, c client.Client, clusterRef, jobNa return false, false, "" } switch rec.Status { - case seamcorev1alpha1.TalosClusterResultSucceeded: + case seamplatformv1alpha1.ResultSucceeded: return true, false, rec.Message - case seamcorev1alpha1.TalosClusterResultFailed: + case seamplatformv1alpha1.ResultFailed: msg := rec.Message if rec.FailureReason != nil && rec.FailureReason.Reason != "" { msg = rec.FailureReason.Reason @@ -179,19 +179,19 @@ func readOperationRecord(ctx context.Context, c client.Client, clusterRef, jobNa // does not yet exist. Called by ensureTenantExecutorResources on cluster admission. 
func ensureTCOR(ctx context.Context, c client.Client, clusterRef, talosVersion string) error { ns := tenantNS(clusterRef) - tcor := &seamcorev1alpha1.InfrastructureTalosClusterOperationResult{} + tcor := &seamplatformv1alpha1.ClusterLog{} if err := c.Get(ctx, types.NamespacedName{Name: clusterRef, Namespace: ns}, tcor); err == nil { return nil } else if !apierrors.IsNotFound(err) { return fmt.Errorf("ensureTCOR: get TCOR %s/%s: %w", ns, clusterRef, err) } - tcor = &seamcorev1alpha1.InfrastructureTalosClusterOperationResult{ + tcor = &seamplatformv1alpha1.ClusterLog{ ObjectMeta: metav1.ObjectMeta{ Name: clusterRef, Namespace: ns, Labels: map[string]string{"platform.ontai.dev/cluster": clusterRef}, }, - Spec: seamcorev1alpha1.InfrastructureTalosClusterOperationResultSpec{ + Spec: seamplatformv1alpha1.ClusterLogSpec{ ClusterRef: clusterRef, TalosVersion: talosVersion, Revision: 1, @@ -208,7 +208,7 @@ func ensureTCOR(ctx context.Context, c client.Client, clusterRef, talosVersion s // Called by UpgradePolicyReconciler after a successful talosVersion upgrade. func bumpTCORRevision(ctx context.Context, c client.Client, clusterRef, newTalosVersion string) error { ns := tenantNS(clusterRef) - tcor := &seamcorev1alpha1.InfrastructureTalosClusterOperationResult{} + tcor := &seamplatformv1alpha1.ClusterLog{} if err := c.Get(ctx, types.NamespacedName{Name: clusterRef, Namespace: ns}, tcor); err != nil { if apierrors.IsNotFound(err) { return ensureTCOR(ctx, c, clusterRef, newTalosVersion) diff --git a/internal/controller/pki_cert_helpers.go b/internal/controller/pki_cert_helpers.go index 5d2e68c..53acadd 100644 --- a/internal/controller/pki_cert_helpers.go +++ b/internal/controller/pki_cert_helpers.go @@ -4,7 +4,7 @@ package controller // // The reconciler reads the kubeconfig and talosconfig Secrets for an imported // cluster, parses the embedded X.509 certificates, and writes the earliest expiry -// into InfrastructureTalosCluster.status.pkiExpiryDate. 
When the expiry is within +// into TalosCluster.status.pkiExpiryDate. When the expiry is within // spec.pkiRotationThresholdDays of the current time, a PKIRotation CR is created // automatically. platform-schema.md §13. @@ -26,7 +26,6 @@ import ( "sigs.k8s.io/yaml" platformv1alpha1 "github.com/ontai-dev/platform/api/v1alpha1" - seamcorev1alpha1 "github.com/ontai-dev/seam-core/api/v1alpha1" ) // defaultPKIThreshold is the default number of days before cert expiry to @@ -197,7 +196,7 @@ func readSecretAndParseExpiry( // syncPKIExpiry calls detectClusterPKIExpiry, writes the result to // tc.Status.PkiExpiryDate (modifying in place), and returns rotationNeeded=true // when the expiry is within the configured threshold. platform-schema.md §13. -func syncPKIExpiry(ctx context.Context, c client.Client, tc *seamcorev1alpha1.InfrastructureTalosCluster) (bool, error) { +func syncPKIExpiry(ctx context.Context, c client.Client, tc *platformv1alpha1.TalosCluster) (bool, error) { expiry, err := detectClusterPKIExpiry(ctx, c, tc.Name) if err != nil { return false, err @@ -225,7 +224,7 @@ func syncPKIExpiry(ctx context.Context, c client.Client, tc *seamcorev1alpha1.In // by an approaching cert expiry. It is idempotent: if a PKIRotation CR for this // cluster already exists and is not yet complete, no duplicate is created. // platform-schema.md §13. -func ensureAutoRotationPKI(ctx context.Context, c client.Client, _ *runtime.Scheme, tc *seamcorev1alpha1.InfrastructureTalosCluster) error { +func ensureAutoRotationPKI(ctx context.Context, c client.Client, _ *runtime.Scheme, tc *platformv1alpha1.TalosCluster) error { ns := importSecretsNamespace(tc.Name) existing := &platformv1alpha1.PKIRotationList{} @@ -272,7 +271,7 @@ func ensureAutoRotationPKI(ctx context.Context, c client.Client, _ *runtime.Sche // annotation-triggered rotation. The annotation platform.ontai.dev/rotate-pki=true // has already been detected by the caller; this function creates the CR. 
// The caller removes the annotation after this returns. platform-schema.md §13. -func ensureAnnotationRotationPKI(ctx context.Context, c client.Client, _ *runtime.Scheme, tc *seamcorev1alpha1.InfrastructureTalosCluster) error { +func ensureAnnotationRotationPKI(ctx context.Context, c client.Client, _ *runtime.Scheme, tc *platformv1alpha1.TalosCluster) error { ns := importSecretsNamespace(tc.Name) ts := time.Now().UTC().Format("20060102t150405") diff --git a/internal/controller/runnerconfig_cr.go b/internal/controller/runnerconfig_cr.go index 5e74568..9fd6774 100644 --- a/internal/controller/runnerconfig_cr.go +++ b/internal/controller/runnerconfig_cr.go @@ -1,18 +1,21 @@ package controller -// RunnerConfig and OperationResult types are owned by seam-core (infrastructure.ontai.dev/v1alpha1). +// RunnerConfig types are owned by seam (infrastructure.ontai.dev/v1alpha1). +// ClusterLog (OperationResult) is owned by platform (seam.ontai.dev/v1alpha1). // Platform reconcilers reference these aliases through the controller package. -// Replaces the previous AddKnownTypeWithName workaround for runner.ontai.dev/v1alpha1. -// T-2B-8. +// T-2B-8, MIGRATION-3.2. -import seamcorev1alpha1 "github.com/ontai-dev/seam-core/api/v1alpha1" +import ( + seamplatformv1alpha1 "github.com/ontai-dev/platform/api/seam/v1alpha1" + seamcorev1alpha1 "github.com/ontai-dev/seam/api/v1alpha1" +) -// Type aliases -- struct definitions are in seam-core. These preserve the -// controller package interface for all day-2 reconcilers without source edits. +// Type aliases -- struct definitions live in the owning packages. These preserve +// the controller package interface for all day-2 reconcilers without source edits. 
type ( - OperationalRunnerConfig = seamcorev1alpha1.InfrastructureRunnerConfig - OperationalRunnerConfigList = seamcorev1alpha1.InfrastructureRunnerConfigList - OperationalRunnerConfigSpec = seamcorev1alpha1.InfrastructureRunnerConfigSpec + OperationalRunnerConfig = seamcorev1alpha1.RunnerConfig + OperationalRunnerConfigList = seamcorev1alpha1.RunnerConfigList + OperationalRunnerConfigSpec = seamcorev1alpha1.RunnerConfigSpec // OperationalStep is an alias for RunnerConfigStep. OperationalStep = seamcorev1alpha1.RunnerConfigStep @@ -20,11 +23,11 @@ type ( // CapabilityEntry is an alias for RunnerCapabilityEntry. CapabilityEntry = seamcorev1alpha1.RunnerCapabilityEntry - // OperationalRunnerConfigStatus is an alias for InfrastructureRunnerConfigStatus. - OperationalRunnerConfigStatus = seamcorev1alpha1.InfrastructureRunnerConfigStatus + // OperationalRunnerConfigStatus is an alias for RunnerConfigStatus. + OperationalRunnerConfigStatus = seamcorev1alpha1.RunnerConfigStatus - // TalosClusterOperationResult is the day-2 operation result CR written by the - // Conductor execute-mode Job. One CR per Job, in the Job namespace (ont-system). - TalosClusterOperationResult = seamcorev1alpha1.InfrastructureTalosClusterOperationResult - TalosClusterOperationResultList = seamcorev1alpha1.InfrastructureTalosClusterOperationResultList + // TalosClusterOperationResult is the day-2 operation result CR (ClusterLog) written + // by the Conductor execute-mode Job. One CR per cluster, in seam-tenant-{clusterRef}. 
+ TalosClusterOperationResult = seamplatformv1alpha1.ClusterLog + TalosClusterOperationResultList = seamplatformv1alpha1.ClusterLogList ) diff --git a/internal/controller/taloscluster_bootstrap_hardening_test.go b/internal/controller/taloscluster_bootstrap_hardening_test.go index f9fa0b7..83ccf75 100644 --- a/internal/controller/taloscluster_bootstrap_hardening_test.go +++ b/internal/controller/taloscluster_bootstrap_hardening_test.go @@ -10,7 +10,8 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client/fake" platformv1alpha1 "github.com/ontai-dev/platform/api/v1alpha1" - seamcorev1alpha1 "github.com/ontai-dev/seam-core/api/v1alpha1" + seamplatformv1alpha1 "github.com/ontai-dev/platform/api/seam/v1alpha1" + seamcorev1alpha1 "github.com/ontai-dev/seam/api/v1alpha1" ) // buildHardeningTestScheme registers all types needed for ensureBootstrapHardening tests. @@ -20,6 +21,9 @@ func buildHardeningTestScheme(t *testing.T) *runtime.Scheme { if err := clientgoscheme.AddToScheme(s); err != nil { t.Fatalf("add clientgo: %v", err) } + if err := seamplatformv1alpha1.AddToScheme(s); err != nil { + t.Fatalf("add seamplatform: %v", err) + } if err := seamcorev1alpha1.AddToScheme(s); err != nil { t.Fatalf("add seamcore: %v", err) } diff --git a/internal/controller/taloscluster_helpers.go b/internal/controller/taloscluster_helpers.go index 2b62ca0..b2e52db 100644 --- a/internal/controller/taloscluster_helpers.go +++ b/internal/controller/taloscluster_helpers.go @@ -24,7 +24,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" platformv1alpha1 "github.com/ontai-dev/platform/api/v1alpha1" - "github.com/ontai-dev/seam-core/pkg/lineage" + "github.com/ontai-dev/seam/pkg/lineage" ) const ( @@ -913,14 +913,15 @@ func EnsureRemoteConductorRBAC(ctx context.Context, k8s kubernetes.Interface) er Verbs: []string{"get", "list", "watch", "create", "update", "patch", "delete"}, }, { - APIGroups: []string{"infrastructure.ontai.dev"}, - Resources: 
[]string{"infrastructuretalosclusters/status"}, - Verbs: []string{"update", "patch"}, + // TalosCluster (seam.ontai.dev) read plus create/update/patch access (including status) for drift detection on tenant cluster. + APIGroups: []string{"seam.ontai.dev"}, + Resources: []string{"talosclusters", "talosclusters/status"}, + Verbs: []string{"get", "list", "watch", "create", "update", "patch"}, }, { - // RBACProfilePullLoop and RBACPolicyPullLoop SSA-patch security.ontai.dev + // RBACProfilePullLoop and RBACPolicyPullLoop SSA-patch guardian.ontai.dev // resources into ont-system. Needs create/update/patch in addition to read. - APIGroups: []string{"security.ontai.dev"}, + APIGroups: []string{"guardian.ontai.dev"}, Resources: []string{"*"}, Verbs: []string{"get", "list", "watch", "create", "update", "patch"}, }, @@ -1006,9 +1007,9 @@ func EnsureRemoteConductorRBAC(ctx context.Context, k8s kubernetes.Interface) er // SC-INV-003: seam-core CRDs are installed before all operators. func EnsureRemoteTalosClusterCopy(ctx context.Context, dynClient dynamic.Interface, tc *platformv1alpha1.TalosCluster) error { gvr := schema.GroupVersionResource{ - Group: "infrastructure.ontai.dev", + Group: "seam.ontai.dev", Version: "v1alpha1", - Resource: "infrastructuretalosclusters", + Resource: "talosclusters", } // Idempotency: skip if the CR already exists. 
@@ -1022,8 +1023,8 @@ func EnsureRemoteTalosClusterCopy(ctx context.Context, dynClient dynamic.Interfa obj := &unstructured.Unstructured{ Object: map[string]interface{}{ - "apiVersion": "infrastructure.ontai.dev/v1alpha1", - "kind": "InfrastructureTalosCluster", + "apiVersion": "seam.ontai.dev/v1alpha1", + "kind": "TalosCluster", "metadata": map[string]interface{}{ "name": tc.Name, "namespace": conductorAgentNamespace, @@ -1051,7 +1052,7 @@ func EnsureRemoteTalosClusterCopy(ctx context.Context, dynClient dynamic.Interfa if apierrors.IsNotFound(err) { return nil } - return fmt.Errorf("ensureRemoteTalosClusterCopy: create InfrastructureTalosCluster on %s: %w", tc.Name, err) + return fmt.Errorf("ensureRemoteTalosClusterCopy: create TalosCluster on %s: %w", tc.Name, err) } return nil } @@ -1465,34 +1466,34 @@ func (r *TalosClusterReconciler) ensureLocalQueue( return nil } -// rbacPolicyGVK is the GVK for guardian RBACPolicy (security.ontai.dev/v1alpha1). +// rbacPolicyGVK is the GVK for guardian RBACPolicy (guardian.ontai.dev/v1alpha1). var rbacPolicyGVK = schema.GroupVersionKind{ - Group: "security.ontai.dev", + Group: "guardian.ontai.dev", Version: "v1alpha1", Kind: "RBACPolicy", } -// rbacProfileGVK is the GVK for guardian RBACProfile (security.ontai.dev/v1alpha1). +// rbacProfileGVK is the GVK for guardian RBACProfile (guardian.ontai.dev/v1alpha1). var rbacProfileGVK = schema.GroupVersionKind{ - Group: "security.ontai.dev", + Group: "guardian.ontai.dev", Version: "v1alpha1", Kind: "RBACProfile", } -// packExecutionTenantGVK is the GVK for InfrastructurePackExecution CRs in -// the tenant namespace. Owned by seam-core. Decision G. +// packExecutionTenantGVK is the GVK for PackExecution CRs in +// the tenant namespace. Owned by wrapper. MIGRATION-3.2. 
var packExecutionTenantGVK = schema.GroupVersionKind{ - Group: "infrastructure.ontai.dev", + Group: "seam.ontai.dev", Version: "v1alpha1", - Kind: "InfrastructurePackExecution", + Kind: "PackExecution", } -// packInstanceTenantGVK is the GVK for InfrastructurePackInstance CRs in -// the tenant namespace. Owned by seam-core. Decision G. +// packInstanceTenantGVK is the GVK for PackInstalled CRs in +// the tenant namespace. Owned by wrapper. MIGRATION-3.2. var packInstanceTenantGVK = schema.GroupVersionKind{ - Group: "infrastructure.ontai.dev", + Group: "seam.ontai.dev", Version: "v1alpha1", - Kind: "InfrastructurePackInstance", + Kind: "PackInstalled", } // rbacPolicyNamespace is the namespace where the platform-wide RBACPolicy lives. @@ -1777,7 +1778,7 @@ func (r *TalosClusterReconciler) ensureWrapperRunnerResources(ctx context.Contex {APIGroups: []string{"autoscaling"}, Resources: []string{"horizontalpodautoscalers"}, Verbs: []string{"get", "list", "watch", "create", "update", "patch", "delete"}}, {APIGroups: []string{"infrastructure.ontai.dev"}, Resources: []string{"infrastructurepackexecutions", "infrastructureclusterpacks", "infrastructurepackinstances"}, Verbs: []string{"get", "list", "watch"}}, {APIGroups: []string{"infrastructure.ontai.dev"}, Resources: []string{"infrastructurerunnerconfigs"}, Verbs: []string{"get", "list", "watch", "patch", "update"}}, - {APIGroups: []string{"security.ontai.dev"}, Resources: []string{"rbacprofiles"}, Verbs: []string{"get", "list", "watch"}}, + {APIGroups: []string{"guardian.ontai.dev"}, Resources: []string{"rbacprofiles"}, Verbs: []string{"get", "list", "watch"}}, {APIGroups: []string{"infrastructure.ontai.dev"}, Resources: []string{"packoperationresults"}, Verbs: []string{"get", "list", "watch", "create", "update", "patch", "delete"}}, }, } diff --git a/internal/controller/taloscluster_helpers_test.go b/internal/controller/taloscluster_helpers_test.go index 1ff2ef0..e0dd5fd 100644 --- 
a/internal/controller/taloscluster_helpers_test.go +++ b/internal/controller/taloscluster_helpers_test.go @@ -15,7 +15,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" platformv1alpha1 "github.com/ontai-dev/platform/api/v1alpha1" - seamcorev1alpha1 "github.com/ontai-dev/seam-core/api/v1alpha1" + seamplatformv1alpha1 "github.com/ontai-dev/platform/api/seam/v1alpha1" ) // buildHelperTestScheme constructs a runtime.Scheme with all types required for @@ -26,13 +26,23 @@ func buildHelperTestScheme(t *testing.T) *runtime.Scheme { if err := clientgoscheme.AddToScheme(s); err != nil { t.Fatalf("add clientgo scheme: %v", err) } - // seamcorev1alpha1 registers InfrastructurePackExecution, InfrastructurePackInstance, - // InfrastructureTalosCluster (TalosCluster alias), and DriftSignal under - // infrastructure.ontai.dev/v1alpha1. Do not re-register these as unstructured. - if err := seamcorev1alpha1.AddToScheme(s); err != nil { - t.Fatalf("add seamcorev1alpha1 scheme: %v", err) + // seamplatformv1alpha1 registers TalosCluster under seam.ontai.dev/v1alpha1. + if err := seamplatformv1alpha1.AddToScheme(s); err != nil { + t.Fatalf("add seamplatformv1alpha1 scheme: %v", err) } - // security.ontai.dev types (RBACPolicy, RBACProfile) are not in seam-core; + // PackExecution and PackInstalled are owned by wrapper (seam.ontai.dev/v1alpha1). + // Register as unstructured so the fake client can store/retrieve them. 
+ s.AddKnownTypeWithName(packExecutionTenantGVK, &unstructured.Unstructured{}) + s.AddKnownTypeWithName( + packExecutionTenantGVK.GroupVersion().WithKind(packExecutionTenantGVK.Kind+"List"), + &unstructured.UnstructuredList{}, + ) + s.AddKnownTypeWithName(packInstanceTenantGVK, &unstructured.Unstructured{}) + s.AddKnownTypeWithName( + packInstanceTenantGVK.GroupVersion().WithKind(packInstanceTenantGVK.Kind+"List"), + &unstructured.UnstructuredList{}, + ) + // guardian.ontai.dev types (RBACPolicy, RBACProfile) are not in seam-core; // register as unstructured so the fake client can list/patch them. s.AddKnownTypeWithName(rbacPolicyGVK, &unstructured.Unstructured{}) s.AddKnownTypeWithName( @@ -47,28 +57,24 @@ func buildHelperTestScheme(t *testing.T) *runtime.Scheme { return s } -// fakePackExecution builds a minimal InfrastructurePackExecution typed object. -// The fake client stores it by GVK; the reconciler can list/delete it as unstructured. -func fakePackExecution(name, ns string) *seamcorev1alpha1.InfrastructurePackExecution { - return &seamcorev1alpha1.InfrastructurePackExecution{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: ns, - ResourceVersion: "1", - }, - } +// fakePackExecution builds a minimal PackExecution unstructured object. +func fakePackExecution(name, ns string) *unstructured.Unstructured { + obj := &unstructured.Unstructured{} + obj.SetGroupVersionKind(packExecutionTenantGVK) + obj.SetName(name) + obj.SetNamespace(ns) + obj.SetResourceVersion("1") + return obj } -// fakePackInstance builds a minimal InfrastructurePackInstance typed object. -// The fake client stores it by GVK; the reconciler can list/delete it as unstructured. -func fakePackInstance(name, ns string) *seamcorev1alpha1.InfrastructurePackInstance { - return &seamcorev1alpha1.InfrastructurePackInstance{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: ns, - ResourceVersion: "1", - }, - } +// fakePackInstance builds a minimal PackInstalled unstructured object. 
+func fakePackInstance(name, ns string) *unstructured.Unstructured { + obj := &unstructured.Unstructured{} + obj.SetGroupVersionKind(packInstanceTenantGVK) + obj.SetName(name) + obj.SetNamespace(ns) + obj.SetResourceVersion("1") + return obj } // fakeRBACPolicy builds a minimal guardian RBACPolicy unstructured object with @@ -158,15 +164,17 @@ func TestHandleTalosClusterDeletion_DecisionHCascade_DeletesPackExecutions(t *te } // PackExecution must be deleted. - peGet := &seamcorev1alpha1.InfrastructurePackExecution{} + peGet := &unstructured.Unstructured{} + peGet.SetGroupVersionKind(packExecutionTenantGVK) if err := c.Get(context.Background(), types.NamespacedName{Name: "nginx-pack-exec", Namespace: tenantNS}, peGet); err == nil { t.Error("expected PackExecution to be deleted but it still exists") } - // PackInstance must be deleted. - piGet := &seamcorev1alpha1.InfrastructurePackInstance{} + // PackInstalled must be deleted. + piGet := &unstructured.Unstructured{} + piGet.SetGroupVersionKind(packInstanceTenantGVK) if err := c.Get(context.Background(), types.NamespacedName{Name: "nginx-pack-inst", Namespace: tenantNS}, piGet); err == nil { - t.Error("expected PackInstance to be deleted but it still exists") + t.Error("expected PackInstalled to be deleted but it still exists") } // finalizerDecisionHCascade must be removed. The fake client GC's the object once diff --git a/internal/controller/taloscluster_version_upgrade.go b/internal/controller/taloscluster_version_upgrade.go index 9d85904..4d7de16 100644 --- a/internal/controller/taloscluster_version_upgrade.go +++ b/internal/controller/taloscluster_version_upgrade.go @@ -5,10 +5,13 @@ package controller // // Version upgrade path: // - spec.versionUpgrade=true on a Ready cluster auto-creates an UpgradePolicy CR. 
+// - Upgrade type derives from which version fields are set: +// talosVersion only → UpgradeTypeTalos; kubernetesVersion only → UpgradeTypeKubernetes; +// both → UpgradeTypeStack (sequential Talos then k8s). // - The UpgradePolicy reconciler drives the Conductor Job. // - On completion, UpgradePolicy reconciler patches status.observedTalosVersion. -// - TalosClusterReconciler detects UpgradePolicy Ready=True and clears -// spec.versionUpgrade via spec patch, setting VersionUpgradePending=False. +// - TalosClusterReconciler detects UpgradePolicy Ready=True and sets +// VersionUpgradePending=False. // // Anti-regression: // - If spec.talosVersion < status.observedTalosVersion, the reconciler sets @@ -109,67 +112,95 @@ func (r *TalosClusterReconciler) reconcileVersionUpgrade(ctx context.Context, tc return false, ctrl.Result{}, nil } - // spec.versionUpgrade=true: validate that talosVersion is set. - if tc.Spec.TalosVersion == "" { + // Determine which version fields are set. + hasTalos := tc.Spec.TalosVersion != "" + hasKube := tc.Spec.KubernetesVersion != "" + + // At least one target version must be present. + if !hasTalos && !hasKube { platformv1alpha1.SetCondition( &tc.Status.Conditions, platformv1alpha1.ConditionTypePhaseFailed, metav1.ConditionTrue, platformv1alpha1.ReasonTalosVersionRequired, - "spec.versionUpgrade=true requires spec.talosVersion to be set to the target version.", + "spec.versionUpgrade=true requires spec.talosVersion, spec.kubernetesVersion, or both.", tc.Generation, ) return true, ctrl.Result{}, nil } - // Anti-regression: if the specified version would downgrade, block. - if checkVersionRegression(tc) { + // Anti-regression guard applies only when a Talos version change is requested. + if hasTalos && checkVersionRegression(tc) { return true, ctrl.Result{}, nil } + // Derive upgrade type from which fields are populated. 
+ var upgradeType platformv1alpha1.UpgradeType + switch { + case hasTalos && hasKube: + upgradeType = platformv1alpha1.UpgradeTypeStack + case hasTalos: + upgradeType = platformv1alpha1.UpgradeTypeTalos + default: + upgradeType = platformv1alpha1.UpgradeTypeKubernetes + } + upName := tc.Name + versionUpgradeSuffix + // UpgradePolicy lives in the tenant namespace so the Conductor executor Job + // that processes it runs in the same namespace as the platform-executor SA + // and the talosconfig Secret (both provisioned by ensureTenantExecutorResources + // and ensureExecutorTalosconfig respectively). + upNamespace := "seam-tenant-" + tc.Name // Check if the UpgradePolicy already exists. existing := &platformv1alpha1.UpgradePolicy{} - err = r.Client.Get(ctx, types.NamespacedName{Name: upName, Namespace: tc.Namespace}, existing) + err = r.Client.Get(ctx, types.NamespacedName{Name: upName, Namespace: upNamespace}, existing) if err != nil && !apierrors.IsNotFound(err) { return true, ctrl.Result{}, fmt.Errorf("reconcileVersionUpgrade: get UpgradePolicy: %w", err) } if apierrors.IsNotFound(err) { - // Create the UpgradePolicy. 
+ upSpec := platformv1alpha1.UpgradePolicySpec{ + ClusterRef: platformv1alpha1.LocalObjectRef{Name: tc.Name, Namespace: tc.Namespace}, + UpgradeType: upgradeType, + RollingStrategy: platformv1alpha1.RollingStrategySequential, + } + if hasTalos { + upSpec.TargetTalosVersion = tc.Spec.TalosVersion + } + if hasKube { + upSpec.TargetKubernetesVersion = tc.Spec.KubernetesVersion + } up := &platformv1alpha1.UpgradePolicy{ ObjectMeta: metav1.ObjectMeta{ Name: upName, - Namespace: tc.Namespace, + Namespace: upNamespace, Labels: map[string]string{ labelVersionUpgradeOwned: "true", "platform.ontai.dev/cluster": tc.Name, }, }, - Spec: platformv1alpha1.UpgradePolicySpec{ - ClusterRef: platformv1alpha1.LocalObjectRef{Name: tc.Name, Namespace: tc.Namespace}, - UpgradeType: platformv1alpha1.UpgradeTypeTalos, - TargetTalosVersion: tc.Spec.TalosVersion, - RollingStrategy: platformv1alpha1.RollingStrategySequential, - }, + Spec: upSpec, } if err := r.Client.Create(ctx, up); err != nil { return true, ctrl.Result{}, fmt.Errorf("reconcileVersionUpgrade: create UpgradePolicy: %w", err) } + msg := fmt.Sprintf("UpgradePolicy %s created for %s upgrade (talos=%s kubernetes=%s).", + upName, upgradeType, tc.Spec.TalosVersion, tc.Spec.KubernetesVersion) platformv1alpha1.SetCondition( &tc.Status.Conditions, platformv1alpha1.ConditionTypeVersionUpgradePending, metav1.ConditionTrue, platformv1alpha1.ReasonVersionUpgradeSubmitted, - fmt.Sprintf("UpgradePolicy %s created for Talos version upgrade to %s.", upName, tc.Spec.TalosVersion), + msg, tc.Generation, ) r.Recorder.Eventf(tc, nil, "Normal", "VersionUpgradeSubmitted", "VersionUpgradeSubmitted", - "Created UpgradePolicy %s to upgrade cluster %s to Talos %s", - upName, tc.Name, tc.Spec.TalosVersion) + "Created UpgradePolicy %s/%s for cluster %s (%s)", upNamespace, upName, tc.Name, upgradeType) logger.Info("created UpgradePolicy for spec.versionUpgrade", - "cluster", tc.Name, "upgradePolicyName", upName, "targetVersion", tc.Spec.TalosVersion) + 
"cluster", tc.Name, "upgradePolicyName", upName, "upgradePolicyNamespace", upNamespace, + "upgradeType", upgradeType, + "talosVersion", tc.Spec.TalosVersion, "kubernetesVersion", tc.Spec.KubernetesVersion) return true, ctrl.Result{RequeueAfter: operationalJobPollInterval}, nil } @@ -199,13 +230,12 @@ func (r *TalosClusterReconciler) reconcileVersionUpgrade(ctx context.Context, tc platformv1alpha1.ConditionTypeVersionUpgradePending, metav1.ConditionFalse, platformv1alpha1.ReasonVersionUpgradeComplete, - fmt.Sprintf("UpgradePolicy %s completed. Cluster upgraded to Talos %s.", upName, tc.Spec.TalosVersion), + fmt.Sprintf("UpgradePolicy %s completed (%s).", upName, upgradeType), tc.Generation, ) r.Recorder.Eventf(tc, nil, "Normal", "VersionUpgradeComplete", "VersionUpgradeComplete", - "Cluster %s upgraded to Talos %s via UpgradePolicy %s", - tc.Name, tc.Spec.TalosVersion, upName) + "Cluster %s completed %s upgrade via UpgradePolicy %s", tc.Name, upgradeType, upName) logger.Info("version upgrade complete via UpgradePolicy", - "cluster", tc.Name, "version", tc.Spec.TalosVersion) + "cluster", tc.Name, "upgradeType", upgradeType) return true, ctrl.Result{}, nil } diff --git a/internal/controller/tcor_graphquery_stub.go b/internal/controller/tcor_graphquery_stub.go index 7c2e2a5..eb4b939 100644 --- a/internal/controller/tcor_graphquery_stub.go +++ b/internal/controller/tcor_graphquery_stub.go @@ -14,14 +14,14 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" - seamcorev1alpha1 "github.com/ontai-dev/seam-core/api/v1alpha1" + seamplatformv1alpha1 "github.com/ontai-dev/platform/api/seam/v1alpha1" ) // stubDumpTCORRevisionToGraphQueryDB archives the completed revision of a cluster -// TCOR before its Operations list is cleared on talosVersion upgrade. +// ClusterLog before its Operations list is cleared on talosVersion upgrade. // When the GraphQuery DB service is implemented, this stub will be replaced by // a real gRPC or HTTP write to the persistence layer. 
-func stubDumpTCORRevisionToGraphQueryDB(ctx context.Context, clusterRef string, revision int64, talosVersion string, ops map[string]seamcorev1alpha1.TalosClusterOperationRecord) { +func stubDumpTCORRevisionToGraphQueryDB(ctx context.Context, clusterRef string, revision int64, talosVersion string, ops map[string]seamplatformv1alpha1.OperationRecord) { logger := log.FromContext(ctx) logger.V(1).Info("stub: would archive TCOR revision to GraphQuery DB", "cluster", clusterRef, diff --git a/internal/controller/upgradepolicy_reconciler.go b/internal/controller/upgradepolicy_reconciler.go index 08d9079..ec3055d 100644 --- a/internal/controller/upgradepolicy_reconciler.go +++ b/internal/controller/upgradepolicy_reconciler.go @@ -32,7 +32,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" platformv1alpha1 "github.com/ontai-dev/platform/api/v1alpha1" - seamcorev1alpha1 "github.com/ontai-dev/seam-core/api/v1alpha1" ) const ( @@ -420,7 +419,7 @@ func (r *UpgradePolicyReconciler) patchObservedTalosVersion(ctx context.Context, if clusterNS == "" { clusterNS = up.Namespace } - tc := &seamcorev1alpha1.InfrastructureTalosCluster{} + tc := &platformv1alpha1.TalosCluster{} if err := r.Client.Get(ctx, types.NamespacedName{ Name: up.Spec.ClusterRef.Name, Namespace: clusterNS, @@ -428,7 +427,7 @@ func (r *UpgradePolicyReconciler) patchObservedTalosVersion(ctx context.Context, if apierrors.IsNotFound(err) { return nil } - return fmt.Errorf("patchObservedTalosVersion: get InfrastructureTalosCluster: %w", err) + return fmt.Errorf("patchObservedTalosVersion: get TalosCluster: %w", err) } patch := client.MergeFrom(tc.DeepCopy()) tc.Status.ObservedTalosVersion = version @@ -442,7 +441,7 @@ func (r *UpgradePolicyReconciler) isManagementCluster(ctx context.Context, up *p if clusterNS == "" { clusterNS = up.Namespace } - tc := &seamcorev1alpha1.InfrastructureTalosCluster{} + tc := &platformv1alpha1.TalosCluster{} if err := r.Client.Get(ctx, types.NamespacedName{ Name: 
up.Spec.ClusterRef.Name, Namespace: clusterNS, @@ -450,9 +449,9 @@ func (r *UpgradePolicyReconciler) isManagementCluster(ctx context.Context, up *p if apierrors.IsNotFound(err) { return false, nil } - return false, fmt.Errorf("isManagementCluster: get InfrastructureTalosCluster: %w", err) + return false, fmt.Errorf("isManagementCluster: get TalosCluster: %w", err) } - return tc.Spec.Role == seamcorev1alpha1.InfrastructureTalosClusterRoleManagement, nil + return tc.Spec.Role == platformv1alpha1.TalosClusterRoleManagement, nil } // addLeaderNodeEnv appends LEADER_NODE to the first container's env of a Job. diff --git a/test/e2e/ac1_mgmt_import_test.go b/test/e2e/ac1_mgmt_import_test.go index 3642d26..c1ebe1b 100644 --- a/test/e2e/ac1_mgmt_import_test.go +++ b/test/e2e/ac1_mgmt_import_test.go @@ -24,7 +24,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime/schema" - e2ehelpers "github.com/ontai-dev/seam-core/pkg/e2e" + e2ehelpers "github.com/ontai-dev/seam/pkg/e2e" ) var ( diff --git a/test/e2e/day2/pki_rotation_automation_test.go b/test/e2e/day2/pki_rotation_automation_test.go index 4a8a609..2803fe0 100644 --- a/test/e2e/day2/pki_rotation_automation_test.go +++ b/test/e2e/day2/pki_rotation_automation_test.go @@ -19,10 +19,10 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" platformv1alpha1 "github.com/ontai-dev/platform/api/v1alpha1" - seamcorev1alpha1 "github.com/ontai-dev/seam-core/api/v1alpha1" + seamplatformv1alpha1 "github.com/ontai-dev/platform/api/seam/v1alpha1" ) -// pkirotationAutomationClusterName is the name of the test InfrastructureTalosCluster +// pkirotationAutomationClusterName is the name of the test TalosCluster // used for PKI rotation automation E2E tests. Configurable via env var. 
func pkirotationAutomationClusterName() string { if v := os.Getenv("TENANT_CLUSTER_NAME"); v != "" { @@ -42,12 +42,12 @@ var _ = Describe("PKIRotation automation", func() { clusterName := pkirotationAutomationClusterName() tenantNS := "seam-tenant-" + clusterName - // Annotate the InfrastructureTalosCluster with the rotate-pki trigger. - itc := &seamcorev1alpha1.InfrastructureTalosCluster{} + // Annotate the TalosCluster with the rotate-pki trigger. + itc := &seamplatformv1alpha1.TalosCluster{} Expect(mgmtClient.Get(mgmtCtx, client.ObjectKey{ Name: clusterName, Namespace: "seam-system", - }, itc)).To(Succeed(), "get InfrastructureTalosCluster") + }, itc)).To(Succeed(), "get TalosCluster") itcPatch := client.MergeFrom(itc.DeepCopy()) if itc.Annotations == nil { @@ -72,7 +72,7 @@ var _ = Describe("PKIRotation automation", func() { // Verify the annotation was cleared. Eventually(func() bool { - updated := &seamcorev1alpha1.InfrastructureTalosCluster{} + updated := &seamplatformv1alpha1.TalosCluster{} if err := mgmtClient.Get(mgmtCtx, client.ObjectKey{ Name: clusterName, Namespace: "seam-system", @@ -113,11 +113,11 @@ var _ = Describe("PKIRotation automation", func() { // well within the 30-day default threshold). syntheticExpiry := metav1.NewTime(time.Now().Add(5 * 24 * time.Hour)) - itc := &seamcorev1alpha1.InfrastructureTalosCluster{} + itc := &seamplatformv1alpha1.TalosCluster{} Expect(mgmtClient.Get(mgmtCtx, client.ObjectKey{ Name: clusterName, Namespace: "seam-system", - }, itc)).To(Succeed(), "get InfrastructureTalosCluster") + }, itc)).To(Succeed(), "get TalosCluster") itcStatusPatch := client.MergeFrom(itc.DeepCopy()) itc.Status.PkiExpiryDate = &syntheticExpiry @@ -150,7 +150,7 @@ var _ = Describe("PKIRotation automation", func() { } // Clear the synthetic pkiExpiryDate. 
- latest := &seamcorev1alpha1.InfrastructureTalosCluster{} + latest := &seamplatformv1alpha1.TalosCluster{} if err := mgmtClient.Get(ctx, client.ObjectKey{ Name: clusterName, Namespace: "seam-system", diff --git a/test/e2e/day2/pkirotation_e2e_test.go b/test/e2e/day2/pkirotation_e2e_test.go index e068676..187cc2c 100644 --- a/test/e2e/day2/pkirotation_e2e_test.go +++ b/test/e2e/day2/pkirotation_e2e_test.go @@ -7,15 +7,15 @@ package day2_e2e_test // TENANT-PKI-ROTATE -- PKIRotation CR reaches Ready=True; kubeconfig Secrets // refreshed in seam-tenant-{cluster} // TENANT-PKI-CLUSTER-REACH -- After rotation, proves ccs-dev is reachable by -// pushing a minimal single-manifest test ClusterPack and -// waiting for InfrastructurePackExecution to reach -// Succeeded=True using the refreshed kubeconfig +// pushing a minimal single-manifest test PackDelivery and +// waiting for PackExecution to reach Succeeded=True +// using the refreshed kubeconfig // // The reachability test pushes two OCI tar.gz layers (empty RBAC + single ConfigMap -// workload) to the lab registry, creates an InfrastructureClusterPack CR, and lets -// the normal wrapper/signing/conductor-execute pipeline run. Succeeded=True on the -// PackExecution proves the conductor-execute Job successfully connected to ccs-dev -// using the kubeconfig written by pkiRotateHandler. +// workload) to the lab registry, creates a PackDelivery CR, and lets the normal +// wrapper/signing/conductor-execute pipeline run. Succeeded=True on the PackExecution +// proves the conductor-execute Job successfully connected to ccs-dev using the +// kubeconfig written by pkiRotateHandler. // // Required environment variables: // MGMT_KUBECONFIG -- path to management cluster kubeconfig (all tests skip if absent) @@ -35,13 +35,18 @@ import ( . 
"github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" platformv1alpha1 "github.com/ontai-dev/platform/api/v1alpha1" - seamcorev1alpha1 "github.com/ontai-dev/seam-core/api/v1alpha1" ) +// GVKs for wrapper types (seam.ontai.dev/v1alpha1). MIGRATION-3.2. +var packDeliveryGVK = schema.GroupVersionKind{Group: "seam.ontai.dev", Version: "v1alpha1", Kind: "PackDelivery"} +var packExecutionGVK = schema.GroupVersionKind{Group: "seam.ontai.dev", Version: "v1alpha1", Kind: "PackExecution"} + // pkiRotationTimeout is the time budget for a PKI rotation Job to complete. // Rotation involves a staged machineconfig apply + Talos reboot coordination. const pkiRotationTimeout = 10 * time.Minute @@ -50,7 +55,7 @@ const pkiRotationTimeout = 10 * time.Minute // including waiting for the signing loop and Kueue scheduling. 
const packDeployTimeout = 10 * time.Minute -// ── TENANT-PKI-ROTATE: full rotation lifecycle on import-mode cluster ───────── +// -- TENANT-PKI-ROTATE: full rotation lifecycle on import-mode cluster var _ = Describe("TENANT-PKI-ROTATE: PKIRotation on import-mode cluster", func() { It("PKIRotation CR reaches Ready=True and kubeconfig Secrets are refreshed for TENANT_CLUSTER_NAME", func() { @@ -111,10 +116,10 @@ var _ = Describe("TENANT-PKI-ROTATE: PKIRotation on import-mode cluster", func() }) }) -// ── TENANT-PKI-CLUSTER-REACH: post-rotation ClusterPack probe ──────────────── +// -- TENANT-PKI-CLUSTER-REACH: post-rotation PackDelivery probe -var _ = Describe("TENANT-PKI-CLUSTER-REACH: single-manifest ClusterPack proves cluster reachable after PKI rotation", func() { - It("minimal ClusterPack deploy to TENANT_CLUSTER_NAME reaches PackExecution Succeeded=True", func() { +var _ = Describe("TENANT-PKI-CLUSTER-REACH: single-manifest PackDelivery proves cluster reachable after PKI rotation", func() { + It("minimal PackDelivery deploy to TENANT_CLUSTER_NAME reaches PackExecution Succeeded=True", func() { cluster := tenantClusterName() tenantNS := "seam-tenant-" + cluster @@ -148,30 +153,35 @@ data: workloadDigest, err := registry.PushArtifact(mgmtCtx, repo, "workload-v1", workloadBlob) Expect(err).NotTo(HaveOccurred(), "push workload layer to registry") - // Create the ClusterPack CR. Wrapper watches ClusterPacks and creates a - // PackExecution for each entry in spec.targetClusters. + // Create the PackDelivery CR via unstructured (wrapper owns this type). + // Wrapper watches PackDeliveries and creates a PackExecution for each entry + // in spec.targetClusters. MIGRATION-3.2: was InfrastructureClusterPack. 
registryURL := registryAddr + "/" + repo - cp := &seamcorev1alpha1.InfrastructureClusterPack{ - ObjectMeta: metav1.ObjectMeta{ - Name: packName, - Namespace: tenantNS, - }, - Spec: seamcorev1alpha1.InfrastructureClusterPackSpec{ - Version: "v1.0.0-pki-probe", - RegistryRef: seamcorev1alpha1.InfrastructurePackRegistryRef{ - URL: registryURL, - Digest: rbacDigest, + cp := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "seam.ontai.dev/v1alpha1", + "kind": "PackDelivery", + "metadata": map[string]interface{}{ + "name": packName, + "namespace": tenantNS, + }, + "spec": map[string]interface{}{ + "version": "v1.0.0-pki-probe", + "registryRef": map[string]interface{}{ + "url": registryURL, + "digest": rbacDigest, + }, + "basePackName": "pki-probe", + "rbacDigest": rbacDigest, + "workloadDigest": workloadDigest, + "targetClusters": []interface{}{cluster}, }, - BasePackName: "pki-probe", - RBACDigest: rbacDigest, - WorkloadDigest: workloadDigest, - TargetClusters: []string{cluster}, }, } Expect(mgmtClient.Create(mgmtCtx, cp)).To(Succeed()) DeferCleanup(func() { - // Delete ClusterPack -- wrapper GC handles PackExecution and PackInstance. - latest := &seamcorev1alpha1.InfrastructureClusterPack{} + latest := &unstructured.Unstructured{} + latest.SetGroupVersionKind(packDeliveryGVK) if err := mgmtClient.Get(mgmtCtx, types.NamespacedName{ Name: packName, Namespace: tenantNS, }, latest); err == nil { @@ -179,23 +189,25 @@ data: } }) - // Wait for the management conductor signing loop to sign the ClusterPack. - // The pack-deploy flow requires status.signed=true before conductor-execute - // runs. The signing loop runs on the management cluster conductor leader. + // Wait for the management conductor signing loop to sign the PackDelivery. + // The pack-deploy flow requires status.signed=true before conductor-execute runs. 
Eventually(func(g Gomega) { - got := &seamcorev1alpha1.InfrastructureClusterPack{} + got := &unstructured.Unstructured{} + got.SetGroupVersionKind(packDeliveryGVK) g.Expect(mgmtClient.Get(mgmtCtx, types.NamespacedName{ Name: packName, Namespace: tenantNS, }, got)).To(Succeed()) - g.Expect(got.Status.Signed).To(BeTrue(), - "ClusterPack must be signed by the management conductor signing loop") + signed, _, _ := unstructured.NestedBool(got.Object, "status", "signed") + g.Expect(signed).To(BeTrue(), + "PackDelivery must be signed by the management conductor signing loop") }, 3*time.Minute, pollInterval).Should(Succeed()) // Wait for wrapper to create the PackExecution. - // PackExecution name convention: {clusterPackName}-{clusterName}. + // PackExecution name convention: {packDeliveryName}-{clusterName}. peName := packName + "-" + cluster Eventually(func(g Gomega) { - pe := &seamcorev1alpha1.InfrastructurePackExecution{} + pe := &unstructured.Unstructured{} + pe.SetGroupVersionKind(packExecutionGVK) g.Expect(mgmtClient.Get(mgmtCtx, types.NamespacedName{ Name: peName, Namespace: tenantNS, }, pe)).To(Succeed(), "PackExecution %s not yet created by wrapper", peName) @@ -205,28 +217,32 @@ data: // This proves conductor-execute successfully connected to ccs-dev using the // kubeconfig refreshed by pkiRotateHandler and applied the test ConfigMap. 
Eventually(func(g Gomega) { - pe := &seamcorev1alpha1.InfrastructurePackExecution{} + pe := &unstructured.Unstructured{} + pe.SetGroupVersionKind(packExecutionGVK) g.Expect(mgmtClient.Get(mgmtCtx, types.NamespacedName{ Name: peName, Namespace: tenantNS, }, pe)).To(Succeed()) - var succeededCond *metav1.Condition - for i := range pe.Status.Conditions { - if pe.Status.Conditions[i].Type == "Succeeded" { - succeededCond = &pe.Status.Conditions[i] + conds, _, _ := unstructured.NestedSlice(pe.Object, "status", "conditions") + var succeededStatus string + for _, c := range conds { + cm, ok := c.(map[string]interface{}) + if !ok { + continue + } + if cm["type"] == "Succeeded" { + succeededStatus, _, _ = unstructured.NestedString(cm, "status") break } } - g.Expect(succeededCond).NotTo(BeNil(), - "Succeeded condition not yet set on PackExecution %s", peName) - g.Expect(succeededCond.Status).To(Equal(metav1.ConditionTrue), + g.Expect(succeededStatus).To(Equal("True"), "PackExecution Succeeded must be True -- ccs-dev is reachable with refreshed kubeconfig") }, packDeployTimeout, pollInterval).Should(Succeed()) }) }) // buildTarGzManifest creates a tar.gz archive containing a single YAML file. -// Used to construct minimal OCI layer blobs for test ClusterPacks. +// Used to construct minimal OCI layer blobs for test PackDeliveries. 
func buildTarGzManifest(filename, content string) []byte { var buf bytes.Buffer gz := gzip.NewWriter(&buf) diff --git a/test/e2e/day2/suite_test.go b/test/e2e/day2/suite_test.go index 9685122..ec2364d 100644 --- a/test/e2e/day2/suite_test.go +++ b/test/e2e/day2/suite_test.go @@ -27,8 +27,9 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" platformv1alpha1 "github.com/ontai-dev/platform/api/v1alpha1" - seamcorev1alpha1 "github.com/ontai-dev/seam-core/api/v1alpha1" - e2ehelpers "github.com/ontai-dev/seam-core/pkg/e2e" + seamplatformv1alpha1 "github.com/ontai-dev/platform/api/seam/v1alpha1" + seamcorev1alpha1 "github.com/ontai-dev/seam/api/v1alpha1" + e2ehelpers "github.com/ontai-dev/seam/pkg/e2e" ) var ( @@ -63,6 +64,7 @@ var _ = BeforeSuite(func() { scheme := runtime.NewScheme() Expect(clientgoscheme.AddToScheme(scheme)).To(Succeed()) Expect(platformv1alpha1.AddToScheme(scheme)).To(Succeed()) + Expect(seamplatformv1alpha1.AddToScheme(scheme)).To(Succeed()) Expect(seamcorev1alpha1.AddToScheme(scheme)).To(Succeed()) mgmtClient, err = client.New(cfg, client.Options{Scheme: scheme}) diff --git a/test/e2e/suite_test.go b/test/e2e/suite_test.go index bfb0a8e..db1dc66 100644 --- a/test/e2e/suite_test.go +++ b/test/e2e/suite_test.go @@ -21,7 +21,7 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" - e2ehelpers "github.com/ontai-dev/seam-core/pkg/e2e" + e2ehelpers "github.com/ontai-dev/seam/pkg/e2e" ) // Suite-level cluster clients, initialized in BeforeSuite. 
diff --git a/test/integration/capi/capi_lifecycle_test.go b/test/integration/capi/capi_lifecycle_test.go index 883d414..a59d521 100644 --- a/test/integration/capi/capi_lifecycle_test.go +++ b/test/integration/capi/capi_lifecycle_test.go @@ -30,10 +30,11 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client/fake" - seamcorev1alpha1 "github.com/ontai-dev/seam-core/api/v1alpha1" infrav1alpha1 "github.com/ontai-dev/platform/api/infrastructure/v1alpha1" platformv1alpha1 "github.com/ontai-dev/platform/api/v1alpha1" + seamplatformv1alpha1 "github.com/ontai-dev/platform/api/seam/v1alpha1" "github.com/ontai-dev/platform/internal/controller" + seamcorev1alpha1 "github.com/ontai-dev/seam/api/v1alpha1" ) // ── helpers ────────────────────────────────────────────────────────────────── @@ -53,6 +54,9 @@ func buildCAPIScheme(t *testing.T) *runtime.Scheme { if err := infrav1alpha1.AddToScheme(s); err != nil { t.Fatalf("add infrav1alpha1 scheme: %v", err) } + if err := seamplatformv1alpha1.AddToScheme(s); err != nil { + t.Fatalf("add seamplatformv1alpha1 scheme: %v", err) + } if err := seamcorev1alpha1.AddToScheme(s); err != nil { t.Fatalf("add seamcorev1alpha1 scheme: %v", err) } diff --git a/test/integration/day2/mgmt_day2_test.go b/test/integration/day2/mgmt_day2_test.go index 0edb6fe..f77ffc5 100644 --- a/test/integration/day2/mgmt_day2_test.go +++ b/test/integration/day2/mgmt_day2_test.go @@ -24,9 +24,10 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" - seamcorev1alpha1 "github.com/ontai-dev/seam-core/api/v1alpha1" platformv1alpha1 "github.com/ontai-dev/platform/api/v1alpha1" + seamplatformv1alpha1 "github.com/ontai-dev/platform/api/seam/v1alpha1" "github.com/ontai-dev/platform/internal/controller" + seamcorev1alpha1 "github.com/ontai-dev/seam/api/v1alpha1" ) // ── helpers ────────────────────────────────────────────────────────────────── @@ -42,6 +43,9 @@ func 
buildDay2IntegrationScheme(t *testing.T) *runtime.Scheme { if err := platformv1alpha1.AddToScheme(s); err != nil { t.Fatalf("add platformv1alpha1 scheme: %v", err) } + if err := seamplatformv1alpha1.AddToScheme(s); err != nil { + t.Fatalf("add seamplatformv1alpha1 scheme: %v", err) + } if err := seamcorev1alpha1.AddToScheme(s); err != nil { t.Fatalf("add seamcorev1alpha1 scheme: %v", err) } diff --git a/test/integration/day2/suite_test.go b/test/integration/day2/suite_test.go index bd402cc..23acfae 100644 --- a/test/integration/day2/suite_test.go +++ b/test/integration/day2/suite_test.go @@ -30,7 +30,8 @@ import ( infrav1alpha1 "github.com/ontai-dev/platform/api/infrastructure/v1alpha1" platformv1alpha1 "github.com/ontai-dev/platform/api/v1alpha1" - seamcorev1alpha1 "github.com/ontai-dev/seam-core/api/v1alpha1" + seamplatformv1alpha1 "github.com/ontai-dev/platform/api/seam/v1alpha1" + seamcorev1alpha1 "github.com/ontai-dev/seam/api/v1alpha1" ) var ( @@ -52,6 +53,7 @@ func TestMain(m *testing.M) { _ = platformv1alpha1.AddToScheme(testScheme) _ = infrav1alpha1.AddToScheme(testScheme) _ = coordinationv1.AddToScheme(testScheme) + _ = seamplatformv1alpha1.AddToScheme(testScheme) _ = seamcorev1alpha1.AddToScheme(testScheme) testEnv = &envtest.Environment{ diff --git a/test/integration/day2/testdata/crds/infrastructure.ontai.dev_infrastructurerunnerconfigs.yaml b/test/integration/day2/testdata/crds/seam.ontai.dev_runnerconfigs.yaml similarity index 95% rename from test/integration/day2/testdata/crds/infrastructure.ontai.dev_infrastructurerunnerconfigs.yaml rename to test/integration/day2/testdata/crds/seam.ontai.dev_runnerconfigs.yaml index 43ea7a5..094bf6e 100644 --- a/test/integration/day2/testdata/crds/infrastructure.ontai.dev_infrastructurerunnerconfigs.yaml +++ b/test/integration/day2/testdata/crds/seam.ontai.dev_runnerconfigs.yaml @@ -4,16 +4,16 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.16.1 - name: 
infrastructurerunnerconfigs.infrastructure.ontai.dev + name: runnerconfigs.seam.ontai.dev spec: - group: infrastructure.ontai.dev + group: seam.ontai.dev names: - kind: InfrastructureRunnerConfig - listKind: InfrastructureRunnerConfigList - plural: infrastructurerunnerconfigs + kind: RunnerConfig + listKind: RunnerConfigList + plural: runnerconfigs shortNames: - - irc - singular: infrastructurerunnerconfig + - rc + singular: runnerconfig scope: Namespaced versions: - additionalPrinterColumns: @@ -27,9 +27,9 @@ spec: schema: openAPIV3Schema: description: |- - InfrastructureRunnerConfig is the seam-core CRD for Conductor agent runtime configuration. + RunnerConfig is the seam-core CRD for Conductor agent runtime configuration. Owned by seam-core; authored exclusively by the platform operator. INV-009. - conductor-schema.md. + conductor-schema.md. MIGRATION-3.8. properties: apiVersion: description: |- @@ -50,7 +50,7 @@ spec: type: object spec: description: |- - InfrastructureRunnerConfigSpec is the operator-generated operational contract for a + RunnerConfigSpec is the operator-generated operational contract for a specific cluster. Generated at runtime by platform using the runner shared library. Never human-authored. INV-009, INV-010. conductor-schema.md. properties: @@ -170,7 +170,7 @@ spec: type: object status: description: |- - InfrastructureRunnerConfigStatus is written exclusively by the Conductor agent leader. + RunnerConfigStatus is written exclusively by the Conductor agent leader. CR-INV-006. properties: agentLeader: diff --git a/test/unit/controller/capi_lineage_test.go b/test/unit/controller/capi_lineage_test.go index 4183335..0edd8b6 100644 --- a/test/unit/controller/capi_lineage_test.go +++ b/test/unit/controller/capi_lineage_test.go @@ -1,7 +1,7 @@ // Package controller_test -- CAPI derived lineage label unit tests. // // Tests that SetDescendantLabels is called on all four CAPI objects created by -// reconcileCAPIPath. 
The DescendantReconciler in seam-core reads these labels to +// reconcileCAPIPath. The DescendantReconciler in seam reads these labels to // append DescendantEntry records to the TalosCluster InfrastructureLineageIndex. // PLATFORM-BL-CAPI-DERIVED-LINEAGE. package controller_test @@ -19,7 +19,7 @@ import ( platformv1alpha1 "github.com/ontai-dev/platform/api/v1alpha1" "github.com/ontai-dev/platform/internal/controller" - "github.com/ontai-dev/seam-core/pkg/lineage" + "github.com/ontai-dev/seam/pkg/lineage" ) // capiTCForLineage returns a minimal TalosCluster with CAPI enabled. diff --git a/test/unit/controller/day2_reconcilers_test.go b/test/unit/controller/day2_reconcilers_test.go index 13add20..7bd9629 100644 --- a/test/unit/controller/day2_reconcilers_test.go +++ b/test/unit/controller/day2_reconcilers_test.go @@ -20,10 +20,12 @@ import ( infrav1alpha1 "github.com/ontai-dev/platform/api/infrastructure/v1alpha1" platformv1alpha1 "github.com/ontai-dev/platform/api/v1alpha1" + seamplatformv1alpha1 "github.com/ontai-dev/platform/api/seam/v1alpha1" "github.com/ontai-dev/platform/internal/controller" - seamcorev1alpha1 "github.com/ontai-dev/seam-core/api/v1alpha1" + seamcorev1alpha1 "github.com/ontai-dev/seam/api/v1alpha1" ) + // fakeRecorder returns a buffered fake event recorder for use in tests. 
func fakeRecorder() clientevents.EventRecorder { return clientevents.NewFakeRecorder(32) @@ -43,6 +45,9 @@ func buildDay2Scheme(t *testing.T) *runtime.Scheme { if err := infrav1alpha1.AddToScheme(s); err != nil { t.Fatalf("add infrav1alpha1 scheme: %v", err) } + if err := seamplatformv1alpha1.AddToScheme(s); err != nil { + t.Fatalf("add seamplatformv1alpha1 scheme: %v", err) + } if err := seamcorev1alpha1.AddToScheme(s); err != nil { t.Fatalf("add seamcorev1alpha1 scheme: %v", err) } @@ -64,21 +69,21 @@ func clusterRC(clusterName string, capabilities ...string) *controller.Operation return rc } -// successResultTCOR builds an InfrastructureTalosClusterOperationResult CR indicating success. -// One TCOR per cluster: named clusterRef in seam-tenant-{clusterRef}. The jobName is the +// successResultTCOR builds a ClusterLog CR indicating success. +// One ClusterLog per cluster: named clusterRef in seam-tenant-{clusterRef}. The jobName is the // Operations map key (OPERATION_RESULT_CR env value set by the platform reconciler). 
-func successResultTCOR(clusterRef, jobName string) *seamcorev1alpha1.InfrastructureTalosClusterOperationResult { - return &seamcorev1alpha1.InfrastructureTalosClusterOperationResult{ +func successResultTCOR(clusterRef, jobName string) *seamplatformv1alpha1.ClusterLog { + return &seamplatformv1alpha1.ClusterLog{ ObjectMeta: metav1.ObjectMeta{Name: clusterRef, Namespace: "seam-tenant-" + clusterRef}, - Spec: seamcorev1alpha1.InfrastructureTalosClusterOperationResultSpec{ + Spec: seamplatformv1alpha1.ClusterLogSpec{ ClusterRef: clusterRef, TalosVersion: "v1.9.3", Revision: 1, - Operations: map[string]seamcorev1alpha1.TalosClusterOperationRecord{ + Operations: map[string]seamplatformv1alpha1.OperationRecord{ jobName: { Capability: "test-capability", JobRef: jobName, - Status: seamcorev1alpha1.TalosClusterResultSucceeded, + Status: seamplatformv1alpha1.ResultSucceeded, Message: "operation completed", }, }, @@ -86,21 +91,21 @@ func successResultTCOR(clusterRef, jobName string) *seamcorev1alpha1.Infrastruct } } -// failedResultTCOR builds an InfrastructureTalosClusterOperationResult CR indicating failure. -func failedResultTCOR(clusterRef, jobName, message string) *seamcorev1alpha1.InfrastructureTalosClusterOperationResult { - return &seamcorev1alpha1.InfrastructureTalosClusterOperationResult{ +// failedResultTCOR builds a ClusterLog CR indicating failure. 
+func failedResultTCOR(clusterRef, jobName, message string) *seamplatformv1alpha1.ClusterLog { + return &seamplatformv1alpha1.ClusterLog{ ObjectMeta: metav1.ObjectMeta{Name: clusterRef, Namespace: "seam-tenant-" + clusterRef}, - Spec: seamcorev1alpha1.InfrastructureTalosClusterOperationResultSpec{ + Spec: seamplatformv1alpha1.ClusterLogSpec{ ClusterRef: clusterRef, TalosVersion: "v1.9.3", Revision: 1, - Operations: map[string]seamcorev1alpha1.TalosClusterOperationRecord{ + Operations: map[string]seamplatformv1alpha1.OperationRecord{ jobName: { Capability: "test-capability", JobRef: jobName, - Status: seamcorev1alpha1.TalosClusterResultFailed, + Status: seamplatformv1alpha1.ResultFailed, Message: message, - FailureReason: &seamcorev1alpha1.TalosClusterOperationFailureReason{ + FailureReason: &seamplatformv1alpha1.OperationFailureReason{ Category: "ExecutionFailure", Reason: message, }, diff --git a/test/unit/controller/taloscluster_lifecycle_test.go b/test/unit/controller/taloscluster_lifecycle_test.go index 9d3d9b6..9aafe09 100644 --- a/test/unit/controller/taloscluster_lifecycle_test.go +++ b/test/unit/controller/taloscluster_lifecycle_test.go @@ -318,7 +318,7 @@ func TestTalosClusterReconcile_ManagementBootstrapJobSubmitted(t *testing.T) { } // TestTalosClusterReconcile_ManagementBootstrapComplete verifies that when the -// InfrastructureTalosClusterOperationResult CR reports status=Succeeded, the reconciler +// ClusterLog CR reports status=Succeeded, the reconciler // transitions the TalosCluster to Ready=True and clears the Bootstrapping condition. // platform-design.md §5. 
func TestTalosClusterReconcile_ManagementBootstrapComplete(t *testing.T) { diff --git a/test/unit/controller/taloscluster_tenant_onboarding_test.go b/test/unit/controller/taloscluster_tenant_onboarding_test.go index 9c0fa62..0c8225d 100644 --- a/test/unit/controller/taloscluster_tenant_onboarding_test.go +++ b/test/unit/controller/taloscluster_tenant_onboarding_test.go @@ -1,7 +1,7 @@ // Package controller_test tests the tenant-cluster onboarding helpers. // // These tests verify the bootstrap-window RBAC setup (EnsureRemoteConductorRBAC) -// and the InfrastructureTalosCluster CR copy (EnsureRemoteTalosClusterCopy) that +// and the TalosCluster CR copy (EnsureRemoteTalosClusterCopy) that // platform applies to the tenant cluster as part of the T-19 import path. // // Both functions operate against remote cluster clients (kubernetes.Interface and @@ -70,7 +70,7 @@ func TestEnsureRemoteConductorRBAC_CreatesClusterRoleAndBinding(t *testing.T) { if group == "infrastructure.ontai.dev" { hasInfra = true } - if group == "security.ontai.dev" { + if group == "guardian.ontai.dev" { hasSecurity = true } if group == "coordination.k8s.io" { @@ -85,7 +85,7 @@ func TestEnsureRemoteConductorRBAC_CreatesClusterRoleAndBinding(t *testing.T) { t.Error("ClusterRole missing infrastructure.ontai.dev API group rule") } if !hasSecurity { - t.Error("ClusterRole missing security.ontai.dev API group rule") + t.Error("ClusterRole missing guardian.ontai.dev API group rule") } if !hasCoordination { t.Error("ClusterRole missing coordination.k8s.io API group rule (required for leader election leases)") @@ -176,7 +176,7 @@ func TestEnsureRemoteConductorRBAC_LabelsManagedByPlatform(t *testing.T) { // --- EnsureRemoteTalosClusterCopy tests --- // TestEnsureRemoteTalosClusterCopy_CreatesCR verifies that EnsureRemoteTalosClusterCopy -// creates an InfrastructureTalosCluster CR in ont-system on the tenant cluster with +// creates a TalosCluster CR in ont-system on the tenant cluster with // the spec 
fields copied from the management cluster TalosCluster. Decision H. func TestEnsureRemoteTalosClusterCopy_CreatesCR(t *testing.T) { tc := buildTenantTC("ccs-dev") @@ -187,9 +187,9 @@ func TestEnsureRemoteTalosClusterCopy_CreatesCR(t *testing.T) { } gvr := schema.GroupVersionResource{ - Group: "infrastructure.ontai.dev", + Group: "seam.ontai.dev", Version: "v1alpha1", - Resource: "infrastructuretalosclusters", + Resource: "talosclusters", } obj, err := dynClient.Resource(gvr).Namespace("ont-system").Get(context.Background(), "ccs-dev", metav1.GetOptions{}) if err != nil { @@ -238,9 +238,9 @@ func TestEnsureRemoteTalosClusterCopy_Idempotent(t *testing.T) { } // TestEnsureRemoteTalosClusterCopy_CRDNotYetInstalled verifies that if the -// InfrastructureTalosCluster CRD is not yet installed on the tenant cluster +// TalosCluster CRD is not yet installed on the tenant cluster // (dynamic client returns NotFound on Create), the function returns nil rather -// than an error. SC-INV-003: seam-core enable bundle may not yet be applied. +// than an error. SC-INV-003: seam enable bundle may not yet be applied. // The next reconcile retries when the CRD is available. func TestEnsureRemoteTalosClusterCopy_CRDNotYetInstalled(t *testing.T) { tc := buildTenantTC("ccs-dev") @@ -249,10 +249,10 @@ func TestEnsureRemoteTalosClusterCopy_CRDNotYetInstalled(t *testing.T) { // Inject a reactor that returns NotFound for both GET and CREATE on // infrastructuretalosclusters, simulating a cluster where the CRD is absent. 
notFoundErr := apierrors.NewNotFound( - schema.GroupResource{Group: "infrastructure.ontai.dev", Resource: "infrastructuretalosclusters"}, + schema.GroupResource{Group: "seam.ontai.dev", Resource: "talosclusters"}, "ccs-dev", ) - dynClient.Fake.PrependReactor("*", "infrastructuretalosclusters", + dynClient.Fake.PrependReactor("*", "talosclusters", func(_ k8stesting.Action) (bool, runtime.Object, error) { return true, nil, notFoundErr }, diff --git a/test/unit/controller/taloscluster_versionupgrade_test.go b/test/unit/controller/taloscluster_versionupgrade_test.go index c816dde..b9b7bf3 100644 --- a/test/unit/controller/taloscluster_versionupgrade_test.go +++ b/test/unit/controller/taloscluster_versionupgrade_test.go @@ -23,9 +23,8 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client/fake" - seamcorev1alpha1 "github.com/ontai-dev/seam-core/api/v1alpha1" - platformv1alpha1 "github.com/ontai-dev/platform/api/v1alpha1" + seamplatformv1alpha1 "github.com/ontai-dev/platform/api/seam/v1alpha1" "github.com/ontai-dev/platform/internal/controller" ) @@ -91,11 +90,11 @@ func TestTalosCluster_VersionUpgrade_CreatesUpgradePolicy(t *testing.T) { t.Error("expected non-zero RequeueAfter while waiting for UpgradePolicy") } - // UpgradePolicy must exist. + // UpgradePolicy must exist in the tenant namespace (not the TalosCluster's own namespace). up := &platformv1alpha1.UpgradePolicy{} if err := c.Get(context.Background(), types.NamespacedName{ Name: "ccs-mgmt-version-upgrade", - Namespace: "seam-system", + Namespace: "seam-tenant-ccs-mgmt", }, up); err != nil { t.Fatalf("UpgradePolicy not created: %v", err) } @@ -246,12 +245,12 @@ func TestTalosCluster_VersionUpgrade_CompletesCondition(t *testing.T) { tc := buildReadyManagementCluster("ccs-mgmt", "seam-system", "v1.9.4", "v1.9.3") tc.Spec.VersionUpgrade = true - // Pre-create the UpgradePolicy in Ready=True state (simulates prior reconcile - // creating it and the upgrade completing). 
+ // Pre-create the UpgradePolicy in the tenant namespace in Ready=True state + // (simulates prior reconcile creating it and the upgrade completing). existingUP := &platformv1alpha1.UpgradePolicy{ ObjectMeta: metav1.ObjectMeta{ Name: "ccs-mgmt-version-upgrade", - Namespace: "seam-system", + Namespace: "seam-tenant-ccs-mgmt", }, Spec: platformv1alpha1.UpgradePolicySpec{ ClusterRef: platformv1alpha1.LocalObjectRef{Name: "ccs-mgmt", Namespace: "seam-system"}, @@ -317,13 +316,13 @@ func TestTalosCluster_VersionUpgrade_CompletesCondition(t *testing.T) { // TestUpgradePolicy_PatchesObservedTalosVersion verifies that when an UpgradePolicy // for a talos upgrade completes successfully, the reconciler patches -// InfrastructureTalosCluster.status.observedTalosVersion to the target version. +// TalosCluster.status.observedTalosVersion to the target version. func TestUpgradePolicy_PatchesObservedTalosVersion(t *testing.T) { scheme := buildDay2Scheme(t) - // buildReadyManagementCluster returns a *platformv1alpha1.TalosCluster, which is a type - // alias for *seamcorev1alpha1.InfrastructureTalosCluster. patchObservedTalosVersion - // patches status on this same object. observedVersion="v1.9.3" is the pre-upgrade value. + // buildReadyManagementCluster returns a *platformv1alpha1.TalosCluster. + // patchObservedTalosVersion patches status on this same object. + // observedVersion="v1.9.3" is the pre-upgrade value. tc := buildReadyManagementCluster("ccs-mgmt", "seam-system", "v1.9.4", "v1.9.3") up := &platformv1alpha1.UpgradePolicy{ @@ -365,12 +364,12 @@ func TestUpgradePolicy_PatchesObservedTalosVersion(t *testing.T) { t.Fatalf("unexpected error: %v", err) } - // InfrastructureTalosCluster.status.observedTalosVersion must be updated. - gotTC := &seamcorev1alpha1.InfrastructureTalosCluster{} + // TalosCluster.status.observedTalosVersion must be updated. 
+ gotTC := &platformv1alpha1.TalosCluster{} if err := c.Get(context.Background(), types.NamespacedName{ Name: "ccs-mgmt", Namespace: "seam-system", }, gotTC); err != nil { - t.Fatalf("get InfrastructureTalosCluster: %v", err) + t.Fatalf("get TalosCluster: %v", err) } if gotTC.Status.ObservedTalosVersion != "v1.9.4" { t.Errorf("ObservedTalosVersion = %q, want v1.9.4", gotTC.Status.ObservedTalosVersion) @@ -438,8 +437,8 @@ func TestTCOR_RevisionBumpedAfterUpgrade(t *testing.T) { t.Fatalf("unexpected error: %v", err) } - // TCOR must still exist at seam-tenant-ccs-mgmt/ccs-mgmt — never deleted. - tcor := &seamcorev1alpha1.InfrastructureTalosClusterOperationResult{} + // ClusterLog must still exist at seam-tenant-ccs-mgmt/ccs-mgmt — never deleted. + tcor := &seamplatformv1alpha1.ClusterLog{} if err := c.Get(context.Background(), types.NamespacedName{ Name: "ccs-mgmt", Namespace: "seam-tenant-ccs-mgmt", }, tcor); err != nil { @@ -459,3 +458,127 @@ func TestTCOR_RevisionBumpedAfterUpgrade(t *testing.T) { t.Errorf("Operations len = %d after revision bump, want 0", len(tcor.Spec.Operations)) } } + +// TestTalosCluster_VersionUpgrade_KubernetesOnly_CreatesKubePolicy verifies that when +// spec.versionUpgrade=true and only spec.kubernetesVersion is set (talosVersion empty), +// the reconciler creates an UpgradePolicy with UpgradeTypeKubernetes and the correct +// TargetKubernetesVersion. TargetTalosVersion must be empty. +func TestTalosCluster_VersionUpgrade_KubernetesOnly_CreatesKubePolicy(t *testing.T) { + scheme := buildDay2Scheme(t) + // talosVersion="" means no Talos upgrade requested — kubernetesVersion drives it. + tc := buildReadyManagementCluster("ccs-mgmt", "seam-system", "", "v1.9.4") + tc.Spec.KubernetesVersion = "1.32.4" + tc.Spec.VersionUpgrade = true + + c := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(tc). + WithStatusSubresource(tc). 
+ Build() + r := &controller.TalosClusterReconciler{ + Client: c, + Scheme: scheme, + Recorder: clientevents.NewFakeRecorder(32), + } + + result, err := r.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Name: "ccs-mgmt", Namespace: "seam-system"}, + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result.RequeueAfter == 0 { + t.Error("expected non-zero RequeueAfter while waiting for UpgradePolicy") + } + + up := &platformv1alpha1.UpgradePolicy{} + if err := c.Get(context.Background(), types.NamespacedName{ + Name: "ccs-mgmt-version-upgrade", + Namespace: "seam-tenant-ccs-mgmt", + }, up); err != nil { + t.Fatalf("UpgradePolicy not created: %v", err) + } + if up.Spec.UpgradeType != platformv1alpha1.UpgradeTypeKubernetes { + t.Errorf("UpgradeType = %q, want kubernetes", up.Spec.UpgradeType) + } + if up.Spec.TargetKubernetesVersion != "1.32.4" { + t.Errorf("TargetKubernetesVersion = %q, want 1.32.4", up.Spec.TargetKubernetesVersion) + } + if up.Spec.TargetTalosVersion != "" { + t.Errorf("TargetTalosVersion = %q, want empty for kubernetes-only upgrade", up.Spec.TargetTalosVersion) + } + + got := &platformv1alpha1.TalosCluster{} + if err := c.Get(context.Background(), types.NamespacedName{ + Name: "ccs-mgmt", Namespace: "seam-system", + }, got); err != nil { + t.Fatalf("get TalosCluster: %v", err) + } + cond := platformv1alpha1.FindCondition(got.Status.Conditions, platformv1alpha1.ConditionTypeVersionUpgradePending) + if cond == nil || cond.Status != metav1.ConditionTrue { + t.Fatal("VersionUpgradePending not set to True for kubernetes-only upgrade") + } +} + +// TestTalosCluster_VersionUpgrade_Stack_CreatesBothVersions verifies that when +// spec.versionUpgrade=true with both spec.talosVersion and spec.kubernetesVersion set, +// the reconciler creates an UpgradePolicy with UpgradeTypeStack carrying both target +// versions (sequential Talos then k8s upgrade). 
+func TestTalosCluster_VersionUpgrade_Stack_CreatesBothVersions(t *testing.T) { + scheme := buildDay2Scheme(t) + tc := buildReadyManagementCluster("ccs-mgmt", "seam-system", "v1.9.4", "v1.9.3") + tc.Spec.KubernetesVersion = "1.32.4" + tc.Spec.VersionUpgrade = true + + c := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(tc). + WithStatusSubresource(tc). + Build() + r := &controller.TalosClusterReconciler{ + Client: c, + Scheme: scheme, + Recorder: clientevents.NewFakeRecorder(32), + } + + result, err := r.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Name: "ccs-mgmt", Namespace: "seam-system"}, + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result.RequeueAfter == 0 { + t.Error("expected non-zero RequeueAfter while waiting for UpgradePolicy") + } + + up := &platformv1alpha1.UpgradePolicy{} + if err := c.Get(context.Background(), types.NamespacedName{ + Name: "ccs-mgmt-version-upgrade", + Namespace: "seam-tenant-ccs-mgmt", + }, up); err != nil { + t.Fatalf("UpgradePolicy not created: %v", err) + } + if up.Spec.UpgradeType != platformv1alpha1.UpgradeTypeStack { + t.Errorf("UpgradeType = %q, want stack", up.Spec.UpgradeType) + } + if up.Spec.TargetTalosVersion != "v1.9.4" { + t.Errorf("TargetTalosVersion = %q, want v1.9.4", up.Spec.TargetTalosVersion) + } + if up.Spec.TargetKubernetesVersion != "1.32.4" { + t.Errorf("TargetKubernetesVersion = %q, want 1.32.4", up.Spec.TargetKubernetesVersion) + } + if up.Spec.ClusterRef.Name != "ccs-mgmt" { + t.Errorf("ClusterRef.Name = %q, want ccs-mgmt", up.Spec.ClusterRef.Name) + } + + got := &platformv1alpha1.TalosCluster{} + if err := c.Get(context.Background(), types.NamespacedName{ + Name: "ccs-mgmt", Namespace: "seam-system", + }, got); err != nil { + t.Fatalf("get TalosCluster: %v", err) + } + cond := platformv1alpha1.FindCondition(got.Status.Conditions, platformv1alpha1.ConditionTypeVersionUpgradePending) + if cond == nil || cond.Status != 
metav1.ConditionTrue { + t.Fatal("VersionUpgradePending not set to True for stack upgrade") + } +}