diff --git a/charts/hub-agent/README.md b/charts/hub-agent/README.md index 9582dda49..7cc93551b 100644 --- a/charts/hub-agent/README.md +++ b/charts/hub-agent/README.md @@ -89,39 +89,43 @@ _See [helm install](https://helm.sh/docs/helm/helm_install/) for command documen ## Parameters -| Parameter | Description | Default | -|:------------------------------------------|:------------------------------------------------------------------------------------------|:-------------------------------------------------| -| `replicaCount` | Number of hub-agent replicas to deploy | `1` | -| `image.repository` | Image repository | `ghcr.io/kubefleet-dev/kubefleet/hub-agent` | -| `image.pullPolicy` | Image pull policy | `Always` | -| `image.tag` | Image release tag (empty uses chart `appVersion`) | `""` | -| `namespace` | Namespace where this chart is installed | `fleet-system` | -| `resources` | Resource requests/limits for the container | limits: 500m CPU, 1Gi; requests: 100m CPU, 128Mi | -| `affinity` | Node affinity for hub-agent pods | `{}` | -| `tolerations` | Tolerations for hub-agent pods | `[]` | -| `logVerbosity` | Log level (klog V logs) | `5` | -| `enableWebhook` | Enable webhook server | `true` | -| `webhookServiceName` | Webhook service name | `fleetwebhook` | -| `enableGuardRail` | Enable guard rail webhook configurations | `true` | -| `webhookClientConnectionType` | Connection type for webhook client (service or url) | `service` | -| `useCertManager` | Use cert-manager for webhook certificate management (requires `enableWorkload=true`) | `false` | -| `webhookCertSecretName` | Name of the Secret where cert-manager stores the certificate (required when enabled) | `unset` | -| `enableClusterInventoryAPI` | Enable cluster inventory APIs | `true` | -| `enableStagedUpdateRunAPIs` | Enable staged update run APIs | `true` | -| `enableEvictionAPIs` | Enable eviction APIs | `true` | -| `enablePprof` | Enable pprof endpoint | `true` | -| `pprofPort` | pprof server port | `6065` | -| `hubAPIQPS` | QPS for fleet-apiserver (not including events/node heartbeat) | `250` | -| `hubAPIBurst` | Burst for fleet-apiserver (not including events/node heartbeat) | `1000` | -| `MaxConcurrentClusterPlacement` | Max concurrent ClusterResourcePlacement operations | `100` | -| `ConcurrentResourceChangeSyncs` | Max concurrent resourceChange reconcilers | `20` | -| `logFileMaxSize` | Max log file size before rotation (optional) | `unset` | -| `MaxFleetSizeSupported` | Max number of member clusters supported | `100` | -| `forceDeleteWaitTime` | Grace period before force-deleting resources | `15m0s` | -| `clusterUnhealthyThreshold` | Threshold duration for marking a cluster unhealthy | `3m0s` | -| `resourceSnapshotCreationMinimumInterval` | The minimum interval at which resource snapshots could be created. | `30s` | -| `resourceChangesCollectionDuration` | The duration for collecting resource changes into one snapshot. | `15s` | -| `enableWorkload` | Enable kubernetes builtin workload to run in hub cluster. | `false` | +| Parameter | Description | Default | +|:----------|:------------|:--------| +| `replicaCount` | Number of hub-agent replicas to deploy | `1` | +| `image.repository` | Image repository | `ghcr.io/kubefleet-dev/kubefleet/hub-agent` | +| `image.pullPolicy` | Image pull policy | `Always` | +| `image.tag` | Image release tag (empty uses chart `appVersion`) | `""` | +| `namespace` | Namespace where this chart is installed | `fleet-system` | +| `resources` | Resource requests/limits for the container | limits: 500m CPU, 1Gi; requests: 100m CPU, 128Mi | +| `affinity` | Node affinity for hub-agent pods | `{}` | +| `tolerations` | Tolerations for hub-agent pods | `[]` | +| `logVerbosity` | Log level (klog V logs) | `5` | +| `enableWebhook` | Enable webhook server | `true` | +| `webhookServiceName` | Webhook service name | `fleetwebhook` | +| `enableGuardRail` | Enable guard rail webhook configurations | `true` | +| `webhookClientConnectionType` | Connection type for webhook client (service or url) | `service` | +| `useCertManager` | Use cert-manager for webhook certificate management (requires `enableWorkload=true`) | `false` | +| `webhookCertSecretName` | Name of the Secret where cert-manager stores the certificate (required when enabled) | `unset` | +| `enableClusterInventoryAPI` | Enable cluster inventory APIs | `true` | +| `enableStagedUpdateRunAPIs` | Enable staged update run APIs | `true` | +| `enableEvictionAPIs` | Enable eviction APIs | `true` | +| `enablePprof` | Enable pprof endpoint | `true` | +| `pprofPort` | pprof server port | `6065` | +| `hubAPIQPS` | QPS for fleet-apiserver (not including events/node heartbeat) | `250` | +| `hubAPIBurst` | Burst for fleet-apiserver (not including events/node heartbeat) | `1000` | +| `MaxConcurrentClusterPlacement` | Max concurrent ClusterResourcePlacement operations | `100` | +| `ConcurrentResourceChangeSyncs` | Max concurrent resourceChange reconcilers | `20` | +| `logFileMaxSize` | Max log file size before rotation (optional) | `unset` | +| `MaxFleetSizeSupported` | Max number of member clusters supported | `100` | +| `forceDeleteWaitTime` | Grace period before force-deleting resources | `15m0s` | +| `clusterUnhealthyThreshold` | Threshold duration for marking a cluster unhealthy | `3m0s` | +| `resourceSnapshotCreationMinimumInterval` | The minimum interval at which resource snapshots could be created | `30s` | +| `resourceChangesCollectionDuration` | The duration for collecting resource changes into one snapshot | `15s` | +| `enableWorkload` | Enable kubernetes builtin workload to run in hub cluster | `false` | +| `additionalConfigData` | Additional key-value data to include in the hub agent config map | `{}` | +| `additionalConfigDataMountPath` | Mount path for the additional config data volume | `/etc/kubefleet/additional-config` | +| `enableAdmissionPolicyManager` | Enable the admission policy manager to enforce VAP-based policies on the hub cluster | `false` | +| `admissionPolicyManagerConfigName` | Name of the key that contains the admission policy manager configuration in the hub agent config map | `""` | ## Certificate Management diff --git a/charts/hub-agent/templates/config.yaml b/charts/hub-agent/templates/config.yaml new file mode 100644 index 000000000..27c2f37e2 --- /dev/null +++ b/charts/hub-agent/templates/config.yaml @@ -0,0 +1,11 @@ +{{- if .Values.additionalConfigData }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "hub-agent.fullname" . }}-config + namespace: {{ .Values.namespace }} + labels: + {{- include "hub-agent.labels" . | nindent 4 }} +data: + {{- .Values.additionalConfigData | toYaml | nindent 2 }} +{{- end }} diff --git a/charts/hub-agent/templates/deployment.yaml b/charts/hub-agent/templates/deployment.yaml index e3d115344..d6c39fbc1 100644 --- a/charts/hub-agent/templates/deployment.yaml +++ b/charts/hub-agent/templates/deployment.yaml @@ -71,6 +71,13 @@ spec: - --cluster-unhealthy-threshold={{ .Values.clusterUnhealthyThreshold }} - --resource-snapshot-creation-minimum-interval={{ .Values.resourceSnapshotCreationMinimumInterval }} - --resource-changes-collection-duration={{ .Values.resourceChangesCollectionDuration }} + - --enable-admission-policy-manager={{ .Values.enableAdmissionPolicyManager }} + {{- if and .Values.admissionPolicyManagerConfigName (not .Values.additionalConfigData) }} + {{- fail "ERROR: admissionPolicyManagerConfigName is set but additionalConfigData is empty; must provide admission policy manager configuration data" }} + {{- end }} + {{- if and .Values.additionalConfigData .Values.admissionPolicyManagerConfigName }} + - --admission-policy-manager-config={{ .Values.additionalConfigDataMountPath }}/{{ .Values.admissionPolicyManagerConfigName }} + {{- end }} ports: - name: metrics containerPort: 8080 @@ -105,6 +112,11 @@ spec: # This path must match FleetWebhookCertDir in pkg/webhook/webhook.go mountPath: /tmp/k8s-webhook-server/serving-certs readOnly: true + {{- if .Values.additionalConfigData }} + - name: additional-config + mountPath: {{ .Values.additionalConfigDataMountPath }} + readOnly: true + {{- end }} {{- else }} volumeMounts: - name: webhook-cert @@ -113,6 +125,11 @@ spec: # the read only root filesystem setup would block the agent from attempting to # clear the directory. mountPath: /tmp/k8s-webhook-server/ + {{- if .Values.additionalConfigData }} + - name: additional-config + mountPath: {{ .Values.additionalConfigDataMountPath }} + readOnly: true + {{- end }} {{- end }} volumes: - name: webhook-cert @@ -125,6 +142,11 @@ spec: {{- else }} emptyDir: {} {{- end }} + {{- if .Values.additionalConfigData }} + - name: additional-config + configMap: + name: {{ include "hub-agent.fullname" . }}-config + {{- end }} {{- with .Values.affinity }} affinity: {{- toYaml . | nindent 8 }} diff --git a/charts/hub-agent/values.yaml b/charts/hub-agent/values.yaml index fd24c9182..04bca34a7 100644 --- a/charts/hub-agent/values.yaml +++ b/charts/hub-agent/values.yaml @@ -66,3 +66,16 @@ hubAPIBurst: 1000 MaxConcurrentClusterPlacement: 100 ConcurrentResourceChangeSyncs: 20 MaxFleetSizeSupported: 100 + +additionalConfigData: {} +additionalConfigDataMountPath: /etc/kubefleet/additional-config + +# Local chart specific changes: +# Admission policy manager is enabled by default in the managed solution; the +# chart default value is overridden here to verify this behavior. +enableAdmissionPolicyManager: true + +# Local chart specific changes: +# Due to release complications, the admission policy manager in the managed solution +# always use the default configuration as defined in the code. +admissionPolicyManagerConfigName: "" diff --git a/cmd/hubagent/main.go b/cmd/hubagent/main.go index c41407496..97fc44dfb 100644 --- a/cmd/hubagent/main.go +++ b/cmd/hubagent/main.go @@ -27,12 +27,14 @@ import ( apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/apimachinery/pkg/util/yaml" clientgoscheme "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" "k8s.io/klog/v2" clusterinventory "sigs.k8s.io/cluster-inventory-api/apis/v1alpha1" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/cache" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/log/zap" "sigs.k8s.io/controller-runtime/pkg/manager" @@ -40,17 +42,18 @@ import ( ctrlwebhook "sigs.k8s.io/controller-runtime/pkg/webhook" fleetnetworkingv1alpha1 "go.goms.io/fleet-networking/api/v1alpha1" + "go.goms.io/fleet/pkg/webhook/managedresource" clusterv1beta1 "go.goms.io/fleet/apis/cluster/v1beta1" placementv1alpha1 "go.goms.io/fleet/apis/placement/v1alpha1" placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1" "go.goms.io/fleet/cmd/hubagent/options" "go.goms.io/fleet/cmd/hubagent/workload" + "go.goms.io/fleet/pkg/admissionpolicymanager" mcv1beta1 "go.goms.io/fleet/pkg/controllers/membercluster/v1beta1" readiness "go.goms.io/fleet/pkg/utils/informer/readiness" "go.goms.io/fleet/pkg/utils/validator" "go.goms.io/fleet/pkg/webhook" - "go.goms.io/fleet/pkg/webhook/managedresource" // +kubebuilder:scaffold:imports ) @@ -161,7 +164,7 @@ func main() { exitWithErrorFunc() } - klog.V(2).InfoS("starting hubagent") + klog.V(2).InfoS("starting hub agent") if opts.FeatureFlags.EnableV1Beta1APIs { klog.Info("Setting up memberCluster v1beta1 controller") if err = (&mcv1beta1.Reconciler{ @@ -184,7 +187,7 @@ func main() { exitWithErrorFunc() } - if opts.WebhookOpts.EnableWebhooks { + if opts.WebhookAndAdmissionPolicyOpts.EnableWebhooks { // Generate webhook configuration with certificates webhookConfig, err := webhook.NewWebhookConfigFromOptions(mgr, opts, FleetWebhookPort) if err != nil { @@ -209,7 +212,7 @@ func main() { // When using cert-manager, add a readiness check to ensure CA bundles are injected before marking ready. // This prevents the pod from accepting traffic before cert-manager has populated the webhook CA bundles, // which would cause webhook calls to fail. - if opts.WebhookOpts.UseCertManager { + if opts.WebhookAndAdmissionPolicyOpts.UseCertManager { if err := mgr.AddReadyzCheck("cert-manager-ca-injection", func(req *http.Request) error { return webhookConfig.CheckCAInjection(req.Context()) }); err != nil { @@ -221,6 +224,45 @@ func main() { } ctx := ctrl.SetupSignalHandler() + + if opts.WebhookAndAdmissionPolicyOpts.EnableAdmissionPolicyManager { + policyManagerCfgs := admissionpolicymanager.DefaultPolicyGeneratorConfigs + if len(opts.WebhookAndAdmissionPolicyOpts.AdmissionPolicyManagerConfig) != 0 { + // Read user-provided admission policy manager config from given path. + cfgData, err := os.ReadFile(opts.WebhookAndAdmissionPolicyOpts.AdmissionPolicyManagerConfig) + if err != nil { + klog.ErrorS(err, "failed to read the admission policy manager config file") + exitWithErrorFunc() + } + + policyManagerCfgs = &admissionpolicymanager.PolicyGeneratorConfigs{} + if err := yaml.Unmarshal(cfgData, policyManagerCfgs); err != nil { + klog.ErrorS(err, "failed to unmarshal the admission policy manager config file") + exitWithErrorFunc() + } + + // Note that validation has been performed when the flags are parsed. + } + + // Create a separate client for the admission policy manager to use, as the cached client from + // the controller manager side is not initialized yet at this point. + hubUncachedClient, err := client.New(defaultCfg, client.Options{Scheme: scheme}) + if err != nil { + klog.ErrorS(err, "failed to create uncached client for the admission policy manager") + exitWithErrorFunc() + } + policyMgr, err := admissionpolicymanager.New(hubUncachedClient, policyManagerCfgs) + if err != nil { + klog.ErrorS(err, "failed to create the admission policy manager") + exitWithErrorFunc() + } + + if err := policyMgr.Start(ctx); err != nil { + klog.ErrorS(err, "failed to start the admission policy manager") + exitWithErrorFunc() + } + } + if err := workload.SetupControllers(ctx, &wg, mgr, defaultCfg, opts); err != nil { klog.ErrorS(err, "unable to set up controllers") exitWithErrorFunc() diff --git a/cmd/hubagent/options/options.go b/cmd/hubagent/options/options.go index ae688a55b..26950095c 100644 --- a/cmd/hubagent/options/options.go +++ b/cmd/hubagent/options/options.go @@ -28,8 +28,8 @@ type Options struct { // Options that concern the setup of the controller manager instance in use by the KubeFleet hub agent. CtrlMgrOpts ControllerManagerOptions - // KubeFleet webhook related options. - WebhookOpts WebhookOptions + // KubeFleet webhook and admission policy related options. + WebhookAndAdmissionPolicyOpts WebhookAndAdmissionPolicyOptions // Feature flags that control the enabling of certain features in the hub agent. FeatureFlags FeatureFlags @@ -51,7 +51,7 @@ func NewOptions() *Options { func (o *Options) AddFlags(flags *flag.FlagSet) { o.LeaderElectionOpts.AddFlags(flags) o.CtrlMgrOpts.AddFlags(flags) - o.WebhookOpts.AddFlags(flags) + o.WebhookAndAdmissionPolicyOpts.AddFlags(flags) o.FeatureFlags.AddFlags(flags) o.ClusterMgmtOpts.AddFlags(flags) o.PlacementMgmtOpts.AddFlags(flags) diff --git a/cmd/hubagent/options/options_test.go b/cmd/hubagent/options/options_test.go index ef3082dcd..c51570339 100644 --- a/cmd/hubagent/options/options_test.go +++ b/cmd/hubagent/options/options_test.go @@ -707,13 +707,14 @@ func TestPlacementManagementOptions(t *testing.T) { } } -// TestWebhookOptions tests the parsing and validation logic of the webhook options defined in WebhookOptions. +// TestWebhookAndAdmissionPolicyOptions tests the parsing and validation logic of the webhook +// options defined in WebhookAndAdmissionPolicyOptions. func TestWebhookOptions(t *testing.T) { testCases := []struct { name string flagSetName string args []string - wantWebhookOpts WebhookOptions + wantWebhookOpts WebhookAndAdmissionPolicyOptions wantErred bool wantErrMsgSubStr string }{ @@ -721,7 +722,7 @@ func TestWebhookOptions(t *testing.T) { name: "all default", flagSetName: "allDefault", args: []string{}, - wantWebhookOpts: WebhookOptions{ + wantWebhookOpts: WebhookAndAdmissionPolicyOptions{ EnableWebhooks: true, ClientConnectionType: "url", ServiceName: "fleetwebhook", @@ -731,6 +732,8 @@ func TestWebhookOptions(t *testing.T) { EnableWorkload: false, EnablePDBs: true, UseCertManager: false, + EnableAdmissionPolicyManager: false, + AdmissionPolicyManagerConfig: "", }, }, { @@ -745,8 +748,10 @@ func TestWebhookOptions(t *testing.T) { "--deny-modify-member-cluster-labels=true", "--enable-workload=true", "--use-cert-manager=true", + "--enable-admission-policy-manager=true", + "--admission-policy-manager-config=/etc/fleet/policy-config.json", }, - wantWebhookOpts: WebhookOptions{ + wantWebhookOpts: WebhookAndAdmissionPolicyOptions{ EnableWebhooks: false, ClientConnectionType: "service", ServiceName: "customwebhook", @@ -756,13 +761,15 @@ func TestWebhookOptions(t *testing.T) { EnableWorkload: true, EnablePDBs: true, UseCertManager: true, + EnableAdmissionPolicyManager: true, + AdmissionPolicyManagerConfig: "/etc/fleet/policy-config.json", }, }, { name: "webhook client connection type URL (case-insensitive)", flagSetName: "webhookClientConnTypeURL", args: []string{"--webhook-client-connection-type=URL"}, - wantWebhookOpts: WebhookOptions{ + wantWebhookOpts: WebhookAndAdmissionPolicyOptions{ EnableWebhooks: true, ClientConnectionType: "url", ServiceName: "fleetwebhook", @@ -772,13 +779,15 @@ func TestWebhookOptions(t *testing.T) { EnableWorkload: false, EnablePDBs: true, UseCertManager: false, + EnableAdmissionPolicyManager: false, + AdmissionPolicyManagerConfig: "", }, }, { name: "webhook client connection type service (case-insensitive)", flagSetName: "webhookClientConnTypeService", args: []string{"--webhook-client-connection-type=Service"}, - wantWebhookOpts: WebhookOptions{ + wantWebhookOpts: WebhookAndAdmissionPolicyOptions{ EnableWebhooks: true, ClientConnectionType: "service", ServiceName: "fleetwebhook", @@ -788,6 +797,8 @@ func TestWebhookOptions(t *testing.T) { EnableWorkload: false, EnablePDBs: true, UseCertManager: false, + EnableAdmissionPolicyManager: false, + AdmissionPolicyManagerConfig: "", }, }, { @@ -802,8 +813,8 @@ func TestWebhookOptions(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { flags := flag.NewFlagSet(tc.flagSetName, flag.ContinueOnError) - webhookOpts := WebhookOptions{} - webhookOpts.AddFlags(flags) + webhookAndAdmissionPolicyOpts := WebhookAndAdmissionPolicyOptions{} + webhookAndAdmissionPolicyOpts.AddFlags(flags) err := flags.Parse(tc.args) if tc.wantErred { @@ -821,7 +832,7 @@ func TestWebhookOptions(t *testing.T) { t.Fatalf("flag Parse() = %v, want nil", err) } - if diff := cmp.Diff(webhookOpts, tc.wantWebhookOpts); diff != "" { + if diff := cmp.Diff(webhookAndAdmissionPolicyOpts, tc.wantWebhookOpts); diff != "" { t.Errorf("webhook options diff (-got, +want):\n%s", diff) } }) diff --git a/cmd/hubagent/options/validation.go b/cmd/hubagent/options/validation.go index 5770be49d..251df15c2 100644 --- a/cmd/hubagent/options/validation.go +++ b/cmd/hubagent/options/validation.go @@ -17,7 +17,12 @@ limitations under the License. package options import ( + "os" + "k8s.io/apimachinery/pkg/util/validation/field" + "k8s.io/apimachinery/pkg/util/yaml" + + "go.goms.io/fleet/pkg/admissionpolicymanager" ) // Validate checks Options and return a slice of found errs. @@ -45,12 +50,12 @@ func (o *Options) Validate() field.ErrorList { // but here the logic enforces that a service name must be provided. The way we handle // URLs is also problematic as the code will always format a service-targeted URL using // the input. We keep this logic for now for compatibility reasons. - if o.WebhookOpts.EnableWebhooks && o.WebhookOpts.ServiceName == "" { - errs = append(errs, field.Invalid(newPath.Child("WebhookServiceName"), o.WebhookOpts.ServiceName, "A webhook service name is required when webhooks are enabled")) + if o.WebhookAndAdmissionPolicyOpts.EnableWebhooks && o.WebhookAndAdmissionPolicyOpts.ServiceName == "" { + errs = append(errs, field.Invalid(newPath.Child("WebhookServiceName"), o.WebhookAndAdmissionPolicyOpts.ServiceName, "A webhook service name is required when webhooks are enabled")) } - if o.WebhookOpts.UseCertManager && !o.WebhookOpts.EnableWorkload { - errs = append(errs, field.Invalid(newPath.Child("UseCertManager"), o.WebhookOpts.UseCertManager, "If cert manager is used for securing webhook connections, the EnableWorkload option must be set to true, so that cert manager pods can run in the hub cluster.")) + if o.WebhookAndAdmissionPolicyOpts.UseCertManager && !o.WebhookAndAdmissionPolicyOpts.EnableWorkload { + errs = append(errs, field.Invalid(newPath.Child("UseCertManager"), o.WebhookAndAdmissionPolicyOpts.UseCertManager, "If cert manager is used for securing webhook connections, the EnableWorkload option must be set to true, so that cert manager pods can run in the hub cluster.")) } if o.PlacementMgmtOpts.AllowedPropagatingAPIs != "" && o.PlacementMgmtOpts.SkippedPropagatingAPIs != "" { @@ -66,5 +71,33 @@ func (o *Options) Validate() field.ErrorList { errs = append(errs, field.Invalid(newPath.Child("PlacementControllerWorkQueueRateLimiterOpts").Child("RateLimiterQPS"), o.PlacementMgmtOpts.PlacementControllerWorkQueueRateLimiterOpts.RateLimiterQPS, "the QPS for the placement controller set rate limiter must be less than its bucket size")) } + // Validate admission policy manager setup (if enabled). + if err := o.validateAdmissionPolicyManagerConfig(newPath); err != nil { + errs = append(errs, err) + } + return errs } + +func (o *Options) validateAdmissionPolicyManagerConfig(newPath *field.Path) *field.Error { + if o.WebhookAndAdmissionPolicyOpts.EnableAdmissionPolicyManager { + managerConfigPath := o.WebhookAndAdmissionPolicyOpts.AdmissionPolicyManagerConfig + if len(managerConfigPath) != 0 { + // Read the file from the path. + data, err := os.ReadFile(managerConfigPath) + if err != nil { + return field.Invalid(newPath.Child("AdmissionPolicyManagerConfig"), managerConfigPath, "failed to read the admission policy manager config file: "+err.Error()) + } + + policyGeneratorConfigs := &admissionpolicymanager.PolicyGeneratorConfigs{} + if err := yaml.Unmarshal(data, policyGeneratorConfigs); err != nil { + return field.Invalid(newPath.Child("AdmissionPolicyManagerConfig"), managerConfigPath, "failed to unmarshal the admission policy manager config file: "+err.Error()) + } + + if err := policyGeneratorConfigs.Validate(); err != nil { + return field.Invalid(newPath.Child("AdmissionPolicyManagerConfig"), managerConfigPath, "invalid admission policy manager config: "+err.Error()) + } + } + } + return nil +} diff --git a/cmd/hubagent/options/validation_test.go b/cmd/hubagent/options/validation_test.go index b67d84571..edd964900 100644 --- a/cmd/hubagent/options/validation_test.go +++ b/cmd/hubagent/options/validation_test.go @@ -18,12 +18,17 @@ package options import ( "flag" + "os" + "path/filepath" "testing" "time" "github.com/google/go-cmp/cmp" "github.com/onsi/gomega" "k8s.io/apimachinery/pkg/util/validation/field" + "sigs.k8s.io/yaml" + + "go.goms.io/fleet/pkg/admissionpolicymanager" ) const ( @@ -44,7 +49,7 @@ func newTestOptions(modifyOptions ModifyOptions) Options { LeaderElectionQPS: 250.0, LeaderElectionBurst: 1000, }, - WebhookOpts: WebhookOptions{ + WebhookAndAdmissionPolicyOpts: WebhookAndAdmissionPolicyOptions{ ClientConnectionType: "url", ServiceName: testWebhookServiceName, }, @@ -65,7 +70,7 @@ func newTestOptions(modifyOptions ModifyOptions) Options { return option } -func TestValidateControllerManagerConfiguration(t *testing.T) { +func TestValidation(t *testing.T) { newPath := field.NewPath("Options") testCases := map[string]struct { opt Options @@ -91,26 +96,26 @@ func TestValidateControllerManagerConfiguration(t *testing.T) { }, "WebhookServiceName is empty": { opt: newTestOptions(func(option *Options) { - option.WebhookOpts.EnableWebhooks = true - option.WebhookOpts.ServiceName = "" + option.WebhookAndAdmissionPolicyOpts.EnableWebhooks = true + option.WebhookAndAdmissionPolicyOpts.ServiceName = "" }), want: field.ErrorList{field.Invalid(newPath.Child("WebhookServiceName"), "", "A webhook service name is required when webhooks are enabled")}, }, "UseCertManager without EnableWorkload": { opt: newTestOptions(func(option *Options) { - option.WebhookOpts.EnableWebhooks = true - option.WebhookOpts.ServiceName = testWebhookServiceName - option.WebhookOpts.UseCertManager = true - option.WebhookOpts.EnableWorkload = false + option.WebhookAndAdmissionPolicyOpts.EnableWebhooks = true + option.WebhookAndAdmissionPolicyOpts.ServiceName = testWebhookServiceName + option.WebhookAndAdmissionPolicyOpts.UseCertManager = true + option.WebhookAndAdmissionPolicyOpts.EnableWorkload = false }), want: field.ErrorList{field.Invalid(newPath.Child("UseCertManager"), true, "If cert manager is used for securing webhook connections, the EnableWorkload option must be set to true, so that cert manager pods can run in the hub cluster.")}, }, "UseCertManager with EnableWebhook and EnableWorkload": { opt: newTestOptions(func(option *Options) { - option.WebhookOpts.EnableWebhooks = true - option.WebhookOpts.ServiceName = testWebhookServiceName - option.WebhookOpts.UseCertManager = true - option.WebhookOpts.EnableWorkload = true + option.WebhookAndAdmissionPolicyOpts.EnableWebhooks = true + option.WebhookAndAdmissionPolicyOpts.ServiceName = testWebhookServiceName + option.WebhookAndAdmissionPolicyOpts.UseCertManager = true + option.WebhookAndAdmissionPolicyOpts.EnableWorkload = true }), want: field.ErrorList{}, }, @@ -146,6 +151,103 @@ func TestValidateControllerManagerConfiguration(t *testing.T) { } } +func TestValidateAdmissionPolicyManagerConfig(t *testing.T) { + newPath := field.NewPath("Options") + tmpDir := t.TempDir() + + // Non-existent file: capture the OS error to mirror the exact detail string. + nonExistentPath := filepath.Join(tmpDir, "nonexistent.yaml") + _, readErr := os.ReadFile(nonExistentPath) + + // Invalid YAML file. + invalidYAMLPath := filepath.Join(tmpDir, "invalid.yaml") + if err := os.WriteFile(invalidYAMLPath, []byte("[unclosed"), 0600); err != nil { + t.Fatalf("TestValidateAdmissionPolicyManagerConfig: failed to write invalid YAML file: %v", err) + } + var dummy admissionpolicymanager.PolicyGeneratorConfigs + yamlErr := yaml.Unmarshal([]byte("[unclosed"), &dummy) + + // Config file that passes YAML parsing but fails Validate() (empty ReservedNamespacePrefixes). + invalidConfig := admissionpolicymanager.PolicyGeneratorConfigs{ + PodsAndReplicaSetsVAPGeneratorConfig: &admissionpolicymanager.PodsAndReplicaSetsValidatingAdmissionPolicyGenerator{ + ReservedNamespacePrefixes: []string{}, + }, + } + invalidConfigData, _ := yaml.Marshal(invalidConfig) + invalidConfigPath := filepath.Join(tmpDir, "invalid-config.yaml") + if err := os.WriteFile(invalidConfigPath, invalidConfigData, 0600); err != nil { + t.Fatalf("TestValidateAdmissionPolicyManagerConfig: failed to write invalid config file: %v", err) + } + configValidateErr := invalidConfig.Validate() + + // Valid config file. + validConfig := admissionpolicymanager.PolicyGeneratorConfigs{ + PodsAndReplicaSetsVAPGeneratorConfig: &admissionpolicymanager.PodsAndReplicaSetsValidatingAdmissionPolicyGenerator{ + ReservedNamespacePrefixes: []string{"fleet-", "kube-"}, + }, + } + validConfigData, _ := yaml.Marshal(validConfig) + validConfigPath := filepath.Join(tmpDir, "valid-config.yaml") + if err := os.WriteFile(validConfigPath, validConfigData, 0600); err != nil { + t.Fatalf("TestValidateAdmissionPolicyManagerConfig: failed to write valid config file: %v", err) + } + + testCases := map[string]struct { + opt Options + want field.ErrorList + }{ + "admission policy manager enabled, no config path specified": { + opt: newTestOptions(func(option *Options) { + option.WebhookAndAdmissionPolicyOpts.EnableAdmissionPolicyManager = true + }), + want: field.ErrorList{}, + }, + "admission policy manager enabled, config file does not exist": { + opt: newTestOptions(func(option *Options) { + option.WebhookAndAdmissionPolicyOpts.EnableAdmissionPolicyManager = true + option.WebhookAndAdmissionPolicyOpts.AdmissionPolicyManagerConfig = nonExistentPath + }), + want: field.ErrorList{ + field.Invalid(newPath.Child("AdmissionPolicyManagerConfig"), nonExistentPath, "failed to read the admission policy manager config file: "+readErr.Error()), + }, + }, + "admission policy manager enabled, config file contains invalid YAML": { + opt: newTestOptions(func(option *Options) { + option.WebhookAndAdmissionPolicyOpts.EnableAdmissionPolicyManager = true + option.WebhookAndAdmissionPolicyOpts.AdmissionPolicyManagerConfig = invalidYAMLPath + }), + want: field.ErrorList{ + field.Invalid(newPath.Child("AdmissionPolicyManagerConfig"), invalidYAMLPath, "failed to unmarshal the admission policy manager config file: "+yamlErr.Error()), + }, + }, + "admission policy manager enabled, config file fails validation": { + opt: newTestOptions(func(option *Options) { + option.WebhookAndAdmissionPolicyOpts.EnableAdmissionPolicyManager = true + option.WebhookAndAdmissionPolicyOpts.AdmissionPolicyManagerConfig = invalidConfigPath + }), + want: field.ErrorList{ + field.Invalid(newPath.Child("AdmissionPolicyManagerConfig"), invalidConfigPath, "invalid admission policy manager config: "+configValidateErr.Error()), + }, + }, + "admission policy manager enabled, valid config file": { + opt: newTestOptions(func(option *Options) { + option.WebhookAndAdmissionPolicyOpts.EnableAdmissionPolicyManager = true + option.WebhookAndAdmissionPolicyOpts.AdmissionPolicyManagerConfig = validConfigPath + }), + want: field.ErrorList{}, + }, + } + + for name, tc := range testCases { + t.Run(name, func(t *testing.T) { + got := tc.opt.Validate() + if diff := cmp.Diff(tc.want, got); diff != "" { + t.Errorf("Validate() errs mismatch (-want, +got):\n%s", diff) + } + }) + } +} + func TestAddFlags(t *testing.T) { g := gomega.NewWithT(t) opts := NewOptions() @@ -153,5 +255,5 @@ func TestAddFlags(t *testing.T) { flags := flag.NewFlagSet("deny-modify-member-cluster-labels", flag.ExitOnError) opts.AddFlags(flags) - g.Expect(opts.WebhookOpts.GuardRailDenyModifyMemberClusterLabels).To(gomega.BeFalse(), "deny-modify-member-cluster-labels should be false by default") + g.Expect(opts.WebhookAndAdmissionPolicyOpts.GuardRailDenyModifyMemberClusterLabels).To(gomega.BeFalse(), "deny-modify-member-cluster-labels should be false by default") } diff --git a/cmd/hubagent/options/webhooks.go b/cmd/hubagent/options/webhooks.go index 5882b5210..510e0ae7a 100644 --- a/cmd/hubagent/options/webhooks.go +++ b/cmd/hubagent/options/webhooks.go @@ -21,9 +21,9 @@ import ( "fmt" ) -// WebhookOptions is a set of options the KubeFleet hub agent exposes for -// controlling webhook behavior. -type WebhookOptions struct { +// WebhookAndAdmissionPolicyOptions is a set of options the KubeFleet hub agent exposes for +// controlling webhook and admission policy behavior. +type WebhookAndAdmissionPolicyOptions struct { // Enable the KubeFleet webhooks or not. EnableWebhooks bool @@ -65,10 +65,29 @@ type WebhookOptions struct { // If set to false, the system will use self-signed certificates. // This option only applies if webhooks are enabled. UseCertManager bool + + // Enable the KubeFleet admission policy manager or not. + // + // KubeFleet admission policy manager manages admission policies that help enforce and validate + // certain behaviors, which serves as an in-process, more performant and more available alternative + // to some of the applicable KubeFleet validating webhooks. The manager installs and uninstalls + // admission policies when the hub agent starts. + // + // TO-DO (chenyu1): for upstream deployments, allow users to use the hub agent Helm chart to + // manage the lifecycle of the admission policies. The manager is reserved for environments + // where Helm based setup is not possible or not desired. + EnableAdmissionPolicyManager bool + + // A file path to the configuration file for the KubeFleet admission policy manager. The file + // is a YAML file that specifies configuration for each policy generator available + // in the admission policy manager. See the KubeFleet source code for more information. + // If not specified, the default configuration will be used, which enables all available + // policy generators. This option only applies if the admission policy manager is enabled. + AdmissionPolicyManagerConfig string } -// AddFlags adds flags for WebhookOptions to the specified FlagSet. -func (o *WebhookOptions) AddFlags(flags *flag.FlagSet) { +// AddFlags adds flags for WebhookAndAdmissionPolicyOptions to the specified FlagSet. +func (o *WebhookAndAdmissionPolicyOptions) AddFlags(flags *flag.FlagSet) { flags.BoolVar( &o.EnableWebhooks, "enable-webhook", @@ -131,6 +150,22 @@ func (o *WebhookOptions) AddFlags(flags *flag.FlagSet) { false, "Use the cert-manager project for managing KubeFleet webhook server certificates or not. If set to false, the system will use self-signed certificates. If set to true, the EnableWorkload option must be set to true as well. This option only applies if webhooks are enabled.", ) + + flags.BoolVar( + &o.EnableAdmissionPolicyManager, + "enable-admission-policy-manager", + // Enable the admission policy manage by default in the managed solution; this is set here + // to avoid release complications. + true, + "Enable the KubeFleet admission policy manager or not. KubeFleet admission policy manager manages admission policies that help enforce and validate certain behaviors, which serves as an in-process, more performant and more available alternative to some of the applicable KubeFleet validating webhooks.", + ) + + flags.StringVar( + &o.AdmissionPolicyManagerConfig, + "admission-policy-manager-config", + "", + "A file path to the configuration file for the KubeFleet admission policy manager. The file is a JSON or YAML file that specifies configuration for each policy generator available in the admission policy manager. See the KubeFleet source code for more information. If not specified, the default configuration will be used, which enables all available policy generators. This option only applies if the admission policy manager is enabled.", + ) } type WebhookClientConnTypeValueWithValidation string diff --git a/cmd/hubagent/workload/setup.go b/cmd/hubagent/workload/setup.go index b00d913e7..536ccfad0 100644 --- a/cmd/hubagent/workload/setup.go +++ b/cmd/hubagent/workload/setup.go @@ -166,7 +166,7 @@ func SetupControllers(ctx context.Context, wg *sync.WaitGroup, mgr ctrl.Manager, InformerManager: dynamicInformerManager, ResourceConfig: resourceConfig, SkippedNamespaces: skippedNamespaces, - EnableWorkload: opts.WebhookOpts.EnableWorkload, + EnableWorkload: opts.WebhookAndAdmissionPolicyOpts.EnableWorkload, } resourceSnapshotResolver := controller.NewResourceSnapshotResolver(mgr.GetClient(), mgr.GetScheme()) resourceSnapshotResolver.Config = controller.NewResourceSnapshotConfig(opts.PlacementMgmtOpts.ResourceSnapshotCreationMinimumInterval, opts.PlacementMgmtOpts.ResourceChangesCollectionDuration) @@ -550,7 +550,7 @@ func SetupControllers(ctx context.Context, wg *sync.WaitGroup, mgr ctrl.Manager, SkippedNamespaces: skippedNamespaces, ConcurrentPlacementWorker: int(math.Ceil(float64(opts.PlacementMgmtOpts.MaxConcurrentClusterPlacement) / 10)), ConcurrentResourceChangeWorker: opts.PlacementMgmtOpts.ConcurrentResourceChangeSyncs, - EnableWorkload: opts.WebhookOpts.EnableWorkload, + EnableWorkload: opts.WebhookAndAdmissionPolicyOpts.EnableWorkload, } if err := mgr.Add(resourceChangeDetector); err != nil { diff --git a/pkg/admissionpolicymanager/commons.go b/pkg/admissionpolicymanager/commons.go index da120f940..ac2f05226 100644 --- a/pkg/admissionpolicymanager/commons.go +++ b/pkg/admissionpolicymanager/commons.go @@ -32,6 +32,10 @@ const ( illegalCELStringChars = `'"\` ) +const ( + aksSupportUser = "aks-support" +) + var ( // reservedNamespacePrefixRegexp matches valid namespace prefix characters (DNS label subset). reservedNamespacePrefixRegexp = regexp.MustCompile(`^[a-z0-9-]+$`) diff --git a/pkg/admissionpolicymanager/configs.go b/pkg/admissionpolicymanager/configs.go index f753dc335..f81995387 100644 --- a/pkg/admissionpolicymanager/configs.go +++ b/pkg/admissionpolicymanager/configs.go @@ -17,7 +17,12 @@ limitations under the License. package admissionpolicymanager import ( + "reflect" + + "k8s.io/apimachinery/pkg/util/sets" + "go.goms.io/fleet/pkg/utils" + "go.goms.io/fleet/pkg/utils/errors" ) // PolicyGeneratorConfigs holds the configurations for all available admission policy @@ -37,5 +42,51 @@ var DefaultPolicyGeneratorConfigs = &PolicyGeneratorConfigs{ }, SvcAccountsAndTokenRequestsVAPGeneratorConfig: &ServiceAccountsAndTokenRequestsValidatingAdmissionPolicyGenerator{ ReservedNamespacePrefixes: []string{utils.FleetNSNamePrefix, utils.KubeNSNamePrefix}, + WhitelistedUsernames: []string{aksSupportUser}, }, } + +// Validate validates each generator configuration in the given PolicyGeneratorConfigs. +func (config *PolicyGeneratorConfigs) Validate() error { + if config == nil { + return nil + } + + v := reflect.ValueOf(config).Elem() + for i := 0; i < v.NumField(); i++ { + field := v.Field(i) + if field.IsNil() { + continue + } + gen, ok := field.Interface().(ValidatingAdmissionPolicyGenerator) + if !ok { + continue + } + if err := gen.Validate(); err != nil { + return errors.Wraps(err, "one of the admission policy generators is invalid", "generator", gen.Name()) + } + } + return nil +} + +// EnabledGenerators returns the set of names of generators that are enabled in the configuration. +func (config *PolicyGeneratorConfigs) EnabledGenerators() sets.Set[string] { + enabled := sets.New[string]() + if config == nil { + return enabled + } + + v := reflect.ValueOf(config).Elem() + for i := 0; i < v.NumField(); i++ { + field := v.Field(i) + if field.IsNil() { + continue + } + gen, ok := field.Interface().(ValidatingAdmissionPolicyGenerator) + if !ok { + continue + } + enabled.Insert(gen.Name()) + } + return enabled +} diff --git a/pkg/admissionpolicymanager/manager.go b/pkg/admissionpolicymanager/manager.go index 1699654fb..ce0ad3f41 100644 --- a/pkg/admissionpolicymanager/manager.go +++ b/pkg/admissionpolicymanager/manager.go @@ -97,13 +97,13 @@ type PolicyManager struct { enabledPolicyGenerators map[string]ValidatingAdmissionPolicyGenerator } -func New(client client.Client, policyGeneratorConfigs *PolicyGeneratorConfigs, enabledPolicyNames []string) (*PolicyManager, error) { +func New(client client.Client, policyGeneratorConfigs *PolicyGeneratorConfigs) (*PolicyManager, error) { if policyGeneratorConfigs == nil { klog.V(2).Info("No admission policy generator configuration provided, falling back to the default configuration") policyGeneratorConfigs = DefaultPolicyGeneratorConfigs } - // Prepare a set of generators based on the list of enabled policies. - enabledPolicyGenerators, err := preparePolicyGenerators(policyGeneratorConfigs, enabledPolicyNames) + // Prepare a set of generators based on the given configuration. + enabledPolicyGenerators, err := preparePolicyGenerators(policyGeneratorConfigs) if err != nil { return nil, errors.Wraps(err, "failed to create policy manager") } @@ -114,11 +114,7 @@ func New(client client.Client, policyGeneratorConfigs *PolicyGeneratorConfigs, e }, nil } -func preparePolicyGenerators( - policyGeneratorConfigs *PolicyGeneratorConfigs, - enabledPolicyNames []string, -) (map[string]ValidatingAdmissionPolicyGenerator, error) { - enabledPolicyNameSet := sets.New(enabledPolicyNames...) +func preparePolicyGenerators(policyGeneratorConfigs *PolicyGeneratorConfigs) (map[string]ValidatingAdmissionPolicyGenerator, error) { enabledPolicyGenerators := make(map[string]ValidatingAdmissionPolicyGenerator) v := reflect.ValueOf(policyGeneratorConfigs).Elem() @@ -131,18 +127,11 @@ func preparePolicyGenerators( if !ok { continue } - if enabledPolicyNameSet.Has(gen.Name()) { + if gen != nil { enabledPolicyGenerators[gen.Name()] = gen } } - if len(enabledPolicyNameSet) != len(enabledPolicyGenerators) { - configuredPolicyNames := make([]string, 0, len(enabledPolicyGenerators)) - for name := range enabledPolicyGenerators { - configuredPolicyNames = append(configuredPolicyNames, name) - } - return nil, errors.NewUserError(nil, "some enabled policy generators are not configured properly", "enabledPolicies", enabledPolicyNames, "configuredPolicies", configuredPolicyNames) - } return enabledPolicyGenerators, nil } diff --git a/pkg/admissionpolicymanager/manager_integration_test.go b/pkg/admissionpolicymanager/manager_integration_test.go index d2c0e28b8..4eaa2d423 100644 --- a/pkg/admissionpolicymanager/manager_integration_test.go +++ b/pkg/admissionpolicymanager/manager_integration_test.go @@ -111,7 +111,7 @@ var _ = Describe("Policies, Policy Bindings and their Effects", Ordered, func() }, Validations: []admissionregistrationv1.Validation{ { - Expression: `request.namespace.startsWith("fleet-") || request.namespace.startsWith("kube-")`, + Expression: `request.namespace.startsWith("fleet-") || request.namespace.startsWith("kube-") || (object.metadata.labels["fleet.azure.com/reconcile"] == "managed" && (request.userInfo.username == "system:serviceaccount:kube-system:deployment-controller" || request.userInfo.username == "system:serviceaccount:kube-system:replicaset-controller" || request.userInfo.username == "system:kube-controller-manager"))`, Message: "creating pods and replicas is disallowed in the fleet hub cluster", Reason: ptr.To(metav1.StatusReasonForbidden), }, @@ -166,7 +166,7 @@ var _ = Describe("Policies, Policy Bindings and their Effects", Ordered, func() }, Validations: []admissionregistrationv1.Validation{ { - Expression: `!(request.namespace.startsWith("fleet-") || request.namespace.startsWith("kube-")) || (request.userInfo.username == "system:kube-scheduler" || request.userInfo.username == "system:kube-controller-manager" || "system:nodes" in request.userInfo.groups || "system:masters" in request.userInfo.groups)`, + Expression: `!(request.namespace.startsWith("fleet-") || request.namespace.startsWith("kube-")) || (request.userInfo.username == "aks-support" || request.userInfo.username == "system:kube-scheduler" || request.userInfo.username == "system:kube-controller-manager" || "system:nodes" in request.userInfo.groups || "system:masters" in request.userInfo.groups || "kubeadm:cluster-admins" in request.userInfo.groups || "system:serviceaccounts" in request.userInfo.groups)`, Message: "writing service accounts in reserved namespaces or requesting tokens from such service accounts is disallowed", Reason: ptr.To(metav1.StatusReasonForbidden), }, diff --git a/pkg/admissionpolicymanager/podsnreplicasets.go b/pkg/admissionpolicymanager/podsnreplicasets.go index 346fd9ad2..d1b1f9e63 100644 --- a/pkg/admissionpolicymanager/podsnreplicasets.go +++ b/pkg/admissionpolicymanager/podsnreplicasets.go @@ -39,6 +39,14 @@ const ( podsAndReplicaSetsVAPPolicyBindingName = "deny-pods-and-replicasets-outside-reserved-namespaces-binding" ) +const ( + reconcileIfManagedLabelKey = "fleet.azure.com/reconcile" + reconcileIfManagedLabelValue = "managed" + + deploymentControllerUserName = "system:serviceaccount:kube-system:deployment-controller" + replicaSetControllerUserName = "system:serviceaccount:kube-system:replicaset-controller" +) + // Verify that PodsAndReplicaSetsValidatingAdmissionPolicyGenerator implements // the ValidatingAdmissionPolicyGenerator interface. var _ ValidatingAdmissionPolicyGenerator = &PodsAndReplicaSetsValidatingAdmissionPolicyGenerator{} @@ -78,6 +86,14 @@ func (g *PodsAndReplicaSetsValidatingAdmissionPolicyGenerator) PoliciesWithBindi for _, prefix := range g.ReservedNamespacePrefixes { celExprSegs = append(celExprSegs, fmt.Sprintf(`request.namespace.startsWith("%s")`, prefix)) } + + // Custom (Azure-specific) logic: allow pods and replica sets to be created if they + // have the fleet.azure.com/reconcile=managed label and they are created by the deployment or replica set + // controllers (or the controller manager, just in case per controller service account is not enabled). + hasReconcileIfManagedLabel := fmt.Sprintf(`object.metadata.labels["%s"] == "%s"`, reconcileIfManagedLabelKey, reconcileIfManagedLabelValue) + createdByControllerManager := fmt.Sprintf(`request.userInfo.username == "%s" || request.userInfo.username == "%s" || request.userInfo.username == "%s"`, deploymentControllerUserName, replicaSetControllerUserName, kubeControllerManagerUserName) + celExprSegs = append(celExprSegs, fmt.Sprintf("(%s && (%s))", hasReconcileIfManagedLabel, createdByControllerManager)) + celExpr := strings.Join(celExprSegs, " || ") policy := &admissionregistrationv1.ValidatingAdmissionPolicy{ diff --git a/pkg/admissionpolicymanager/suite_test.go b/pkg/admissionpolicymanager/suite_test.go index 45df92091..4371ceb30 100644 --- a/pkg/admissionpolicymanager/suite_test.go +++ b/pkg/admissionpolicymanager/suite_test.go @@ -94,11 +94,7 @@ var _ = BeforeSuite(func() { Expect(hubUncachedClient).ToNot(BeNil()) By("Setting up the policy manager") - enabledGeneratorNames := []string{} - for name := range allGenerators { - enabledGeneratorNames = append(enabledGeneratorNames, name) - } - policyManager, err := New(hubUncachedClient, DefaultPolicyGeneratorConfigs, enabledGeneratorNames) + policyManager, err := New(hubUncachedClient, DefaultPolicyGeneratorConfigs) Expect(err).ToNot(HaveOccurred()) Expect(policyManager).ToNot(BeNil()) Expect(policyManager.Start(ctx)).To(Succeed()) diff --git a/pkg/admissionpolicymanager/svcaccountsntokenreqs.go b/pkg/admissionpolicymanager/svcaccountsntokenreqs.go index 97874f2f9..19a1df577 100644 --- a/pkg/admissionpolicymanager/svcaccountsntokenreqs.go +++ b/pkg/admissionpolicymanager/svcaccountsntokenreqs.go @@ -44,8 +44,10 @@ const ( kubeSchedulerUserName = "system:kube-scheduler" kubeControllerManagerUserName = "system:kube-controller-manager" - kubeNodeUserGroup = "system:nodes" - adminUserGroup = "system:masters" + kubeNodeUserGroup = "system:nodes" + adminUserGroup = "system:masters" + kubeadmAdminUserGroup = "kubeadm:cluster-admins" + svcAccountUserGroup = "system:serviceaccounts" ) // Verify that ServiceAccountsAndTokenRequestsValidatingAdmissionPolicyGenerator implements @@ -119,8 +121,16 @@ func (g *ServiceAccountsAndTokenRequestsValidatingAdmissionPolicyGenerator) Poli celExprAccSegs = append(celExprAccSegs, fmt.Sprintf(`request.userInfo.username == "%s"`, kubeSchedulerUserName)) celExprAccSegs = append(celExprAccSegs, fmt.Sprintf(`request.userInfo.username == "%s"`, kubeControllerManagerUserName)) celExprAccSegs = append(celExprAccSegs, fmt.Sprintf(`"%s" in request.userInfo.groups`, kubeNodeUserGroup)) - // Exempt requests from admin users from this admission policy. + // Exempt requests from cluster admin users from this admission policy. celExprAccSegs = append(celExprAccSegs, fmt.Sprintf(`"%s" in request.userInfo.groups`, adminUserGroup)) + // Exempt kubeadm cluster admins from this policy as well, so that bootstrapping a hub cluster with + // kubeadm credentials can proceed without being blocked. + celExprAccSegs = append(celExprAccSegs, fmt.Sprintf(`"%s" in request.userInfo.groups`, kubeadmAdminUserGroup)) + // Exempt service accounts from this admission policy. Note that VAP check happens after authentication and + // authorization have been performed. This is added to keep things consistent with the original webhook behavior, + // and also for the reason that some controller manager components (e.g., the service account controller) + // need to create service accounts as part of their normal operations. + celExprAccSegs = append(celExprAccSegs, fmt.Sprintf(`"%s" in request.userInfo.groups`, svcAccountUserGroup)) celExprAcc := strings.Join(celExprAccSegs, " || ") diff --git a/pkg/controllers/updaterun/controller.go b/pkg/controllers/updaterun/controller.go index fa0c70925..1f1bf0c17 100644 --- a/pkg/controllers/updaterun/controller.go +++ b/pkg/controllers/updaterun/controller.go @@ -109,17 +109,14 @@ func (r *Reconciler) Reconcile(ctx context.Context, req runtime.Request) (runtim return runtime.Result{}, err } - // Track errors for metrics emission. The error is used to determine the failure type - // (user_error vs internal_error) in the emitted metrics. - var reconcileErr error // Emit the update run status metric based on status conditions in the updateRun. - // Use a closure to capture reconcileErr by reference, so it reflects any updates made during reconciliation. - defer func() { emitUpdateRunStatusMetric(updateRun, reconcileErr) }() + defer emitUpdateRunStatusMetric(updateRun) state := updateRun.GetUpdateRunSpec().State var updatingStageIndex int var toBeUpdatedBindings, toBeDeletedBindings []placementv1beta1.BindingObj + var reconcileErr error updateRunStatus := updateRun.GetUpdateRunStatus() initCond := meta.FindStatusCondition(updateRunStatus.Conditions, string(placementv1beta1.StagedUpdateRunConditionInitialized)) if !condition.IsConditionStatusTrue(initCond, updateRun.GetGeneration()) { diff --git a/pkg/controllers/updaterun/initialization.go b/pkg/controllers/updaterun/initialization.go index 2cf0a95d4..96c89a95c 100644 --- a/pkg/controllers/updaterun/initialization.go +++ b/pkg/controllers/updaterun/initialization.go @@ -91,7 +91,7 @@ func (r *Reconciler) validatePlacement(ctx context.Context, updateRun placementv if apierrors.IsNotFound(err) { placementNotFoundErr := controller.NewUserError(fmt.Errorf("parent placement not found")) klog.ErrorS(err, "Failed to get placement", "placement", placementKey, "updateRun", updateRunRef) - return nil, types.NamespacedName{}, fmt.Errorf("%w: %w", errValidationFailed, placementNotFoundErr) + return nil, types.NamespacedName{}, fmt.Errorf("%w: %s", errValidationFailed, placementNotFoundErr.Error()) } klog.ErrorS(err, "Failed to get placement", "placement", placementKey, "updateRun", updateRunRef) return nil, types.NamespacedName{}, controller.NewAPIServerError(true, err) @@ -106,7 +106,7 @@ func (r *Reconciler) validatePlacement(ctx context.Context, updateRun placementv if placementSpec.Strategy.Type != placementv1beta1.ExternalRolloutStrategyType { klog.V(2).InfoS("The placement does not have an external rollout strategy", "placement", placementKey, "updateRun", updateRunRef) wrongRolloutTypeErr := controller.NewUserError(errors.New("parent placement does not have an external rollout strategy, current strategy: " + string(placementSpec.Strategy.Type))) - return nil, types.NamespacedName{}, fmt.Errorf("%w: %w", errValidationFailed, wrongRolloutTypeErr) + return nil, types.NamespacedName{}, fmt.Errorf("%w: %s", errValidationFailed, wrongRolloutTypeErr.Error()) } updateRunStatus := updateRun.GetUpdateRunStatus() @@ -271,7 +271,7 @@ func (r *Reconciler) generateStagesByStrategy( if apierrors.IsNotFound(err) { // we won't continue or retry the initialization if the UpdateStrategy is not found. strategyNotFoundErr := controller.NewUserError(fmt.Errorf("referenced updateStrategy not found: `%s`", strategyKey)) - return fmt.Errorf("%w: %w", errValidationFailed, strategyNotFoundErr) + return fmt.Errorf("%w: %s", errValidationFailed, strategyNotFoundErr.Error()) } // other err can be retried. return controller.NewAPIServerError(true, err) @@ -338,13 +338,13 @@ func (r *Reconciler) computeRunStageStatus( klog.ErrorS(err, "Failed to validate the before stage tasks", "updateStrategy", strategyKey, "stageName", stage.Name, "updateRun", updateRunRef) // no more retries here. invalidBeforeStageErr := controller.NewUserError(fmt.Errorf("the before stage tasks are invalid, updateStrategy: `%s`, stage: %s, err: %s", strategyKey, stage.Name, err.Error())) - return fmt.Errorf("%w: %w", errValidationFailed, invalidBeforeStageErr) + return fmt.Errorf("%w: %s", errValidationFailed, invalidBeforeStageErr.Error()) } if err := validateAfterStageTask(stage.AfterStageTasks); err != nil { klog.ErrorS(err, "Failed to validate the after stage tasks", "updateStrategy", strategyKey, "stageName", stage.Name, "updateRun", updateRunRef) // no more retries here. invalidAfterStageErr := controller.NewUserError(fmt.Errorf("the after stage tasks are invalid, updateStrategy: `%s`, stage: %s, err: %s", strategyKey, stage.Name, err.Error())) - return fmt.Errorf("%w: %w", errValidationFailed, invalidAfterStageErr) + return fmt.Errorf("%w: %s", errValidationFailed, invalidAfterStageErr.Error()) } curStageUpdatingStatus := placementv1beta1.StageUpdatingStatus{StageName: stage.Name} @@ -354,7 +354,7 @@ func (r *Reconciler) computeRunStageStatus( klog.ErrorS(err, "Failed to convert label selector", "updateStrategy", strategyKey, "stageName", stage.Name, "labelSelector", stage.LabelSelector, "updateRun", updateRunRef) // no more retries here. invalidLabelErr := controller.NewUserError(fmt.Errorf("the stage label selector is invalid, updateStrategy: `%s`, stage: %s, err: %s", strategyKey, stage.Name, err.Error())) - return fmt.Errorf("%w: %w", errValidationFailed, invalidLabelErr) + return fmt.Errorf("%w: %s", errValidationFailed, invalidLabelErr.Error()) } // List all the clusters that match the label selector. var clusterList clusterv1beta1.MemberClusterList @@ -372,7 +372,7 @@ func (r *Reconciler) computeRunStageStatus( dupErr := controller.NewUserError(fmt.Errorf("cluster `%s` appears in more than one stages", cluster.Name)) klog.ErrorS(dupErr, "Failed to compute the stage", "updateStrategy", strategyKey, "stageName", stage.Name, "updateRun", updateRunRef) // no more retries here. - return fmt.Errorf("%w: %w", errValidationFailed, dupErr) + return fmt.Errorf("%w: %s", errValidationFailed, dupErr.Error()) } if stage.SortingLabelKey != nil { // interpret the label values as integers. @@ -380,7 +380,7 @@ func (r *Reconciler) computeRunStageStatus( keyErr := controller.NewUserError(fmt.Errorf("the sorting label `%s:%s` on cluster `%s` is not valid: %s", *stage.SortingLabelKey, cluster.Labels[*stage.SortingLabelKey], cluster.Name, err.Error())) klog.ErrorS(keyErr, "Failed to sort clusters in the stage", "updateStrategy", strategyKey, "stageName", stage.Name, "updateRun", updateRunRef) // no more retries here. - return fmt.Errorf("%w: %w", errValidationFailed, keyErr) + return fmt.Errorf("%w: %s", errValidationFailed, keyErr.Error()) } } curStageClusters = append(curStageClusters, cluster) @@ -447,7 +447,7 @@ func (r *Reconciler) computeRunStageStatus( sort.Strings(missingClusters) klog.ErrorS(missingErr, "Clusters are missing in any stage", "clusters", strings.Join(missingClusters, ", "), "updateStrategy", strategyKey, "updateRun", updateRunRef) // no more retries here, only show the first 10 missing clusters in the placement status. - return fmt.Errorf("%w: %w, total %d, showing up to 10: %s", errValidationFailed, missingErr, len(missingClusters), strings.Join(missingClusters[:min(10, len(missingClusters))], ", ")) + return fmt.Errorf("%w: %s, total %d, showing up to 10: %s", errValidationFailed, missingErr.Error(), len(missingClusters), strings.Join(missingClusters[:min(10, len(missingClusters))], ", ")) } return nil } @@ -564,7 +564,7 @@ func (r *Reconciler) getResourceSnapshotObjs(ctx context.Context, placement plac userErr := controller.NewUserError(fmt.Errorf("invalid resource snapshot index `%s` provided, expected an integer >= 0", updateRunSpec.ResourceSnapshotIndex)) klog.ErrorS(userErr, "Failed to parse the resource snapshot index", "updateRun", updateRunRef) // no more retries here. - return nil, fmt.Errorf("%w: %w", errValidationFailed, userErr) + return nil, fmt.Errorf("%w: %s", errValidationFailed, userErr.Error()) } resourceSnapshotList, err := controller.ListAllResourceSnapshotWithAnIndex(ctx, r.Client, updateRunSpec.ResourceSnapshotIndex, placementKey.Name, placementKey.Namespace) @@ -580,7 +580,7 @@ func (r *Reconciler) getResourceSnapshotObjs(ctx context.Context, placement plac userErr := controller.NewUserError(fmt.Errorf("no resourceSnapshots with index `%d` found for placement `%s`", snapshotIndex, placementKey)) klog.ErrorS(userErr, "No specified resourceSnapshots found", "updateRun", updateRunRef) // no more retries here. - return resourceSnapshotObjs, fmt.Errorf("%w: %w", errValidationFailed, userErr) + return resourceSnapshotObjs, fmt.Errorf("%w: %s", errValidationFailed, userErr.Error()) } return resourceSnapshotObjs, nil } @@ -592,7 +592,7 @@ func (r *Reconciler) getResourceSnapshotObjs(ctx context.Context, placement plac if err != nil { klog.ErrorS(err, "Failed to select resources for placement", "placement", placementKey, "updateRun", updateRunRef) if errors.Is(err, controller.ErrUserError) { - return nil, fmt.Errorf("%w: %w", errValidationFailed, err) + return nil, fmt.Errorf("%w: %s", errValidationFailed, err.Error()) } return nil, err } diff --git a/pkg/controllers/updaterun/metrics.go b/pkg/controllers/updaterun/metrics.go index 24cad3312..384925529 100644 --- a/pkg/controllers/updaterun/metrics.go +++ b/pkg/controllers/updaterun/metrics.go @@ -17,11 +17,12 @@ limitations under the License. package updaterun import ( - "errors" + "strings" "time" "github.com/prometheus/client_golang/prometheus" "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/klog/v2" placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1" @@ -37,35 +38,58 @@ func deleteUpdateRunMetrics(updateRun placementv1beta1.UpdateRunObj) { hubmetrics.FleetUpdateRunApprovalRequestLatencySeconds.DeletePartialMatch(prometheus.Labels{"namespace": updateRun.GetNamespace(), "name": updateRun.GetName()}) } -// determineFailureType determines the type of failure based on the condition status and error. +// determineFailureType determines the type of failure based on the condition. // It returns: -// - "none" for successful, in-progress, waiting, or stopping conditions -// - "user_error" for known customer configuration errors (when err wraps controller.ErrUserError) -// - "internal_error" for errors that require investigation -func determineFailureType(err error) hubmetrics.UpdateRunFailureType { - if err != nil { - // Check if error for the failed condition is a user error. - if errors.Is(err, controller.ErrUserError) { +// - "none" for successful, in-progress, waiting, stopping (unknown status), or stopped conditions +// - "user_error" for known customer configuration errors (when the condition message +// contains the user error marker) +// - "internal_error" for terminal failure conditions or stuck conditions that require investigation +// +// The failure type is derived from the condition message to ensure consistency across +// reconciliations. This handles subsequent reconciliations where the original error +// is not available but the failure type needs to be preserved in the condition message. +// +// Note: A "stuck" update run is classified as "internal_error" because it may never resolve +// and effectively represents a failure state that needs investigation. +func determineFailureType(cond *metav1.Condition) hubmetrics.UpdateRunFailureType { + if cond == nil || cond.Status != metav1.ConditionFalse { + return hubmetrics.UpdateRunFailureTypeNone + } + + // Stuck is always classified as internal error + if cond.Reason == condition.UpdateRunStuckReason { + return hubmetrics.UpdateRunFailureTypeInternalError + } + + // Check if it's a terminal failure condition + if isFailureReason(cond.Reason) { + if strings.Contains(cond.Message, controller.ErrUserError.Error()) { return hubmetrics.UpdateRunFailureTypeUserError } - // Failed condition that is not a user error is an internal error. return hubmetrics.UpdateRunFailureTypeInternalError } - // If there's no error, there's no failure to categorize. return hubmetrics.UpdateRunFailureTypeNone } +// isFailureReason returns true if the condition reason indicates a terminal failure state +// for an UpdateRun. Non-failure reasons like stuck, waiting, stopping, or stopped are +// not considered terminal failures. +func isFailureReason(reason string) bool { + return reason == condition.UpdateRunFailedReason || + reason == condition.UpdateRunInitializeFailedReason +} + // emitUpdateRunStatusMetric emits the update run status metric based on status conditions in the updateRun. -// The err parameter is used to determine the failure type for failed conditions. -func emitUpdateRunStatusMetric(updateRun placementv1beta1.UpdateRunObj, err error) { +// The failure type is derived from the condition message, not from the reconcile error. +func emitUpdateRunStatusMetric(updateRun placementv1beta1.UpdateRunObj) { generation := updateRun.GetGeneration() state := updateRun.GetUpdateRunSpec().State updateRunStatus := updateRun.GetUpdateRunStatus() - failureType := determineFailureType(err) succeedCond := meta.FindStatusCondition(updateRunStatus.Conditions, string(placementv1beta1.StagedUpdateRunConditionSucceeded)) if succeedCond != nil && succeedCond.ObservedGeneration == generation { + failureType := determineFailureType(succeedCond) hubmetrics.FleetUpdateRunStatusLastTimestampSeconds.WithLabelValues(updateRun.GetNamespace(), updateRun.GetName(), string(state), string(placementv1beta1.StagedUpdateRunConditionSucceeded), string(succeedCond.Status), succeedCond.Reason, string(failureType)).SetToCurrentTime() return @@ -73,6 +97,7 @@ func emitUpdateRunStatusMetric(updateRun placementv1beta1.UpdateRunObj, err erro progressingCond := meta.FindStatusCondition(updateRunStatus.Conditions, string(placementv1beta1.StagedUpdateRunConditionProgressing)) if progressingCond != nil && progressingCond.ObservedGeneration == generation { + failureType := determineFailureType(progressingCond) hubmetrics.FleetUpdateRunStatusLastTimestampSeconds.WithLabelValues(updateRun.GetNamespace(), updateRun.GetName(), string(state), string(placementv1beta1.StagedUpdateRunConditionProgressing), string(progressingCond.Status), progressingCond.Reason, string(failureType)).SetToCurrentTime() return @@ -80,6 +105,7 @@ func emitUpdateRunStatusMetric(updateRun placementv1beta1.UpdateRunObj, err erro initializedCond := meta.FindStatusCondition(updateRunStatus.Conditions, string(placementv1beta1.StagedUpdateRunConditionInitialized)) if initializedCond != nil && initializedCond.ObservedGeneration == generation { + failureType := determineFailureType(initializedCond) hubmetrics.FleetUpdateRunStatusLastTimestampSeconds.WithLabelValues(updateRun.GetNamespace(), updateRun.GetName(), string(state), string(placementv1beta1.StagedUpdateRunConditionInitialized), string(initializedCond.Status), initializedCond.Reason, string(failureType)).SetToCurrentTime() return diff --git a/pkg/controllers/updaterun/metrics_test.go b/pkg/controllers/updaterun/metrics_test.go index 7415d002c..0e1ac23ed 100644 --- a/pkg/controllers/updaterun/metrics_test.go +++ b/pkg/controllers/updaterun/metrics_test.go @@ -17,50 +17,185 @@ limitations under the License. package updaterun import ( - "errors" "fmt" "testing" "github.com/google/go-cmp/cmp" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "go.goms.io/fleet/apis/placement/v1beta1" hubmetrics "go.goms.io/fleet/pkg/metrics/hub" + "go.goms.io/fleet/pkg/utils/condition" "go.goms.io/fleet/pkg/utils/controller" ) func TestDetermineFailureType(t *testing.T) { tests := []struct { name string - err error + cond *metav1.Condition want hubmetrics.UpdateRunFailureType }{ { - name: "update run no failure", - err: nil, + name: "nil condition", + cond: nil, want: hubmetrics.UpdateRunFailureTypeNone, }, { - name: "update run failed with user error", - err: fmt.Errorf("cannot continue the updateRun: failed to validate the updateRun: %w", controller.ErrUserError), + name: "succeeded condition status is true", + cond: &metav1.Condition{ + Type: string(v1beta1.StagedUpdateRunConditionSucceeded), + Status: metav1.ConditionTrue, + Reason: condition.UpdateRunSucceededReason, + Message: "update run succeeded", + }, + want: hubmetrics.UpdateRunFailureTypeNone, + }, + { + name: "progressing condition is false with reason stuck - internal error", + cond: &metav1.Condition{ + Type: string(v1beta1.StagedUpdateRunConditionProgressing), + Status: metav1.ConditionFalse, + Reason: condition.UpdateRunStuckReason, + Message: "updateRun is stuck waiting for 1 cluster(s) in stage stage1 to finish updating", + }, + want: hubmetrics.UpdateRunFailureTypeInternalError, + }, + { + name: "progressing condition is false but reason is waiting - not a failure", + cond: &metav1.Condition{ + Type: string(v1beta1.StagedUpdateRunConditionProgressing), + Status: metav1.ConditionFalse, + Reason: condition.UpdateRunWaitingReason, + Message: "waiting for approval", + }, + want: hubmetrics.UpdateRunFailureTypeNone, + }, + { + name: "progressing condition is unknown but reason is stopping - not a failure", + cond: &metav1.Condition{ + Type: string(v1beta1.StagedUpdateRunConditionProgressing), + Status: metav1.ConditionUnknown, + Reason: condition.UpdateRunStoppingReason, + Message: "stopping the update run", + }, + want: hubmetrics.UpdateRunFailureTypeNone, + }, + { + name: "progressing condition is false but reason is stopped - not a failure", + cond: &metav1.Condition{ + Type: string(v1beta1.StagedUpdateRunConditionProgressing), + Status: metav1.ConditionFalse, + Reason: condition.UpdateRunStoppedReason, + Message: "update run has been stopped", + }, + want: hubmetrics.UpdateRunFailureTypeNone, + }, + { + name: "succeeded condition is false with UpdateRunFailed reason and user error in message", + cond: &metav1.Condition{ + Type: string(v1beta1.StagedUpdateRunConditionSucceeded), + Status: metav1.ConditionFalse, + Reason: condition.UpdateRunFailedReason, + Message: controller.NewUserError(fmt.Errorf("invalid CRP selector")).Error(), + }, want: hubmetrics.UpdateRunFailureTypeUserError, }, { - name: "update run failed with internal error", - err: errors.New("cannot continue the updateRun"), + name: "succeeded condition is false with UpdateRunFailed reason but no user error in message", + cond: &metav1.Condition{ + Type: string(v1beta1.StagedUpdateRunConditionSucceeded), + Status: metav1.ConditionFalse, + Reason: condition.UpdateRunFailedReason, + Message: "cannot continue the updateRun: some internal error", + }, + want: hubmetrics.UpdateRunFailureTypeInternalError, + }, + { + name: "progressing condition is false with unexpected error in message", + cond: &metav1.Condition{ + Type: string(v1beta1.StagedUpdateRunConditionProgressing), + Status: metav1.ConditionFalse, + Reason: condition.UpdateRunFailedReason, + Message: controller.NewUnexpectedBehaviorError(fmt.Errorf("found unsupported task type in before stage tasks: %s", v1beta1.StageTaskTypeTimedWait)).Error(), + }, want: hubmetrics.UpdateRunFailureTypeInternalError, }, { - name: "update run is stuck - internal error", - err: errors.New("updateRun is stuck waiting for 1 cluster(s) in stage stage1 to finish updating, please check placement status for potential errors"), + name: "initialized condition is false with UpdateRunInitializeFailed reason but no user error in message", + cond: &metav1.Condition{ + Type: string(v1beta1.StagedUpdateRunConditionInitialized), + Status: metav1.ConditionFalse, + Reason: condition.UpdateRunInitializeFailedReason, + Message: "failed to initialize: internal error", + }, want: hubmetrics.UpdateRunFailureTypeInternalError, }, } for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { - got := determineFailureType(tc.err) + got := determineFailureType(tc.cond) if diff := cmp.Diff(tc.want, got); diff != "" { t.Errorf("determineFailureType() = %v, want %v, diff (-want +got):\n%s", got, tc.want, diff) } }) } } + +func TestIsFailureReason(t *testing.T) { + tests := []struct { + name string + reason string + want bool + }{ + { + name: "UpdateRunFailedReason is a failure", + reason: condition.UpdateRunFailedReason, + want: true, + }, + { + name: "UpdateRunInitializeFailedReason is a failure", + reason: condition.UpdateRunInitializeFailedReason, + want: true, + }, + { + name: "UpdateRunStuckReason is not a terminal failure", + reason: condition.UpdateRunStuckReason, + want: false, + }, + { + name: "UpdateRunWaitingReason is not a failure", + reason: condition.UpdateRunWaitingReason, + want: false, + }, + { + name: "UpdateRunStoppingReason is not a failure", + reason: condition.UpdateRunStoppingReason, + want: false, + }, + { + name: "UpdateRunStoppedReason is not a failure", + reason: condition.UpdateRunStoppedReason, + want: false, + }, + { + name: "UpdateRunSucceededReason is not a failure", + reason: condition.UpdateRunSucceededReason, + want: false, + }, + { + name: "UpdateRunProgressingReason is not a failure", + reason: condition.UpdateRunProgressingReason, + want: false, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got := isFailureReason(tc.reason) + if got != tc.want { + t.Errorf("isFailureReason(%q) = %v, want %v", tc.reason, got, tc.want) + } + }) + } +} diff --git a/pkg/utils/errors/errors_test.go b/pkg/utils/errors/errors_test.go index d026b56c8..d715640fa 100644 --- a/pkg/utils/errors/errors_test.go +++ b/pkg/utils/errors/errors_test.go @@ -87,7 +87,7 @@ func TestCommonUsePatterns(t *testing.T) { outputStr = readFromBuffer(t, &bytesBuf) // The full error output looks like the follows: - // errors_test.go:86: time=2026-04-29T02:49:04.560+10:00 level=ERROR msg="additional top/controller-level error description" err="additional high-level error description: additional low-level error description: cannot calculate resource hash" errCategory=unexpected k1=v1 callers="[{Function:github.com/kubefleet-dev/kubefleet/pkg/utils/errors.TestCommonUsePatterns File:SomeFilePath Line:74} {Function:testing.tRunner File:SomeFilePath Line:1934} {Function:runtime.goexit File:SomeFilePath Line:1268}]" k2=v2 + // errors_test.go:86: time=2026-04-29T02:49:04.560+10:00 level=ERROR msg="additional top/controller-level error description" err="additional high-level error description: additional low-level error description: cannot calculate resource hash" errCategory=unexpected k1=v1 callers="[{Function:go.goms.io/fleet/pkg/utils/errors.TestCommonUsePatterns File:SomeFilePath Line:74} {Function:testing.tRunner File:SomeFilePath Line:1934} {Function:runtime.goexit File:SomeFilePath Line:1268}]" k2=v2 wantSubStrings = []string{ "msg=\"additional top/controller-level error description\"", "err=\"additional high-level error description: additional low-level error description: cannot calculate resource hash\"", diff --git a/pkg/webhook/webhook.go b/pkg/webhook/webhook.go index b7febe5af..ed84fd4d1 100644 --- a/pkg/webhook/webhook.go +++ b/pkg/webhook/webhook.go @@ -245,20 +245,20 @@ func NewWebhookConfig( // String-to-enum conversions (e.g., WebhookClientConnectionType) are performed without // additional validation, as validation happens at the Options level. func NewWebhookConfigFromOptions(mgr manager.Manager, opts *options.Options, webhookPort int32) (*Config, error) { - webhookClientConnectionType := options.WebhookClientConnectionType(opts.WebhookOpts.ClientConnectionType) - whiteListedUsers := strings.Split(opts.WebhookOpts.GuardRailWhitelistedUsers, ",") + webhookClientConnectionType := options.WebhookClientConnectionType(opts.WebhookAndAdmissionPolicyOpts.ClientConnectionType) + whiteListedUsers := strings.Split(opts.WebhookAndAdmissionPolicyOpts.GuardRailWhitelistedUsers, ",") return NewWebhookConfig( mgr, - opts.WebhookOpts.ServiceName, + opts.WebhookAndAdmissionPolicyOpts.ServiceName, webhookPort, &webhookClientConnectionType, FleetWebhookCertDir, - opts.WebhookOpts.EnableGuardRail, - opts.WebhookOpts.GuardRailDenyModifyMemberClusterLabels, - opts.WebhookOpts.EnableWorkload, - opts.WebhookOpts.EnablePDBs, - opts.WebhookOpts.UseCertManager, + opts.WebhookAndAdmissionPolicyOpts.EnableGuardRail, + opts.WebhookAndAdmissionPolicyOpts.GuardRailDenyModifyMemberClusterLabels, + opts.WebhookAndAdmissionPolicyOpts.EnableWorkload, + opts.WebhookAndAdmissionPolicyOpts.EnablePDBs, + opts.WebhookAndAdmissionPolicyOpts.UseCertManager, FleetWebhookCertName, whiteListedUsers, opts.ClusterMgmtOpts.NetworkingAgentsEnabled) diff --git a/pkg/webhook/webhook_test.go b/pkg/webhook/webhook_test.go index 4b397771d..04dc86111 100644 --- a/pkg/webhook/webhook_test.go +++ b/pkg/webhook/webhook_test.go @@ -254,7 +254,7 @@ func TestNewWebhookConfigFromOptions(t *testing.T) { }{ "valid options with cert-manager": { opts: &options.Options{ - WebhookOpts: options.WebhookOptions{ + WebhookAndAdmissionPolicyOpts: options.WebhookAndAdmissionPolicyOptions{ ServiceName: "test-webhook", ClientConnectionType: "service", EnableGuardRail: true, @@ -285,7 +285,7 @@ func TestNewWebhookConfigFromOptions(t *testing.T) { }, "valid options without cert-manager": { opts: &options.Options{ - WebhookOpts: options.WebhookOptions{ + WebhookAndAdmissionPolicyOpts: options.WebhookAndAdmissionPolicyOptions{ ServiceName: "test-webhook", ClientConnectionType: "url", EnableGuardRail: false, diff --git a/test/e2e/admission_policies_test.go b/test/e2e/admission_policies_test.go new file mode 100644 index 000000000..1f68034e0 --- /dev/null +++ b/test/e2e/admission_policies_test.go @@ -0,0 +1,217 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e + +import ( + "fmt" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + appsv1 "k8s.io/api/apps/v1" + authenticationv1 "k8s.io/api/authentication/v1" + corev1 "k8s.io/api/core/v1" + k8sErrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/utils/ptr" + + "go.goms.io/fleet/pkg/admissionpolicymanager" +) + +var _ = Describe("deny service account writes and token requests in restricted namespaces via VAP", Ordered, func() { + svcAccountName := fmt.Sprintf(svcAccountNameTemplate, GinkgoParallelProcess()) + svcAccountToAddName := "added-sa" + kubeSystemNamespaceName := "kube-system" + + var svcAccount *corev1.ServiceAccount + BeforeAll(func() { + if !EnabledVAPGenerators.Has(admissionpolicymanager.SvcAccountsAndTokenRequestsVAPGeneratorName) { + Skip("VAP required for this test is not enabled; skip the test") + } + + // Create a service account in the kube-system namespace. + svcAccount = &corev1.ServiceAccount{ + ObjectMeta: metav1.ObjectMeta{ + Name: svcAccountName, + Namespace: kubeSystemNamespaceName, + }, + } + Expect(hubClient.Create(ctx, svcAccount)).Should(Succeed(), "Failed to create service account") + }) + + AfterAll(func() { + // Ensure the removal of the created service account. + Eventually(func() error { + svcAccount := corev1.ServiceAccount{ + ObjectMeta: metav1.ObjectMeta{ + Name: svcAccountName, + Namespace: kubeSystemNamespaceName, + }, + } + if err := hubClient.Delete(ctx, &svcAccount); err != nil { + if k8sErrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to delete service account: %w", err) + } + + if err := hubClient.Get(ctx, types.NamespacedName{Name: svcAccountName, Namespace: kubeSystemNamespaceName}, &corev1.ServiceAccount{}); err != nil { + if k8sErrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to retrieve service account: %w", err) + } + return fmt.Errorf("service account still exists") + }, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to delete service account") + }) + + It("should deny creation of service accounts in kube-system namespace for non-whitelisted users", func() { + newSvcAccount := &corev1.ServiceAccount{ + ObjectMeta: metav1.ObjectMeta{ + Name: svcAccountToAddName, + Namespace: kubeSystemNamespaceName, + }, + } + wantErrMsg := "writing service accounts in reserved namespaces or requesting tokens from such service accounts is disallowed" + Expect(checkIfStatusErrorWithMessage(impersonateHubClient.Create(ctx, newSvcAccount), wantErrMsg)).Should(Succeed(), "Failed to deny creation of service account in kube-system namespace") + }) + + It("should deny access to service account token subresource in restricted namespace for non-whitelisted users", func() { + tokenRequest := &authenticationv1.TokenRequest{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: kubeSystemNamespaceName, + Name: svcAccountName, + }, + Spec: authenticationv1.TokenRequestSpec{ + Audiences: []string{"experimental"}, + ExpirationSeconds: ptr.To(int64(3600)), + }, + } + wantErrMsg := "writing service accounts in reserved namespaces or requesting tokens from such service accounts is disallowed" + Expect(checkIfStatusErrorWithMessage(impersonateHubClient.SubResource("token").Create(ctx, svcAccount, tokenRequest), wantErrMsg)).Should(Succeed(), "Failed to deny access to service account token subresource in restricted namespace") + }) +}) + +var _ = Describe("deny pod and replica set creation in non-reserved namespaces via VAP", Ordered, func() { + podName := "dummy-pod" + replicaSetName := "dummy-replica-set" + + BeforeAll(func() { + if !EnabledVAPGenerators.Has(admissionpolicymanager.PodsAndReplicaSetsVAPGeneratorName) { + Skip("VAP required for this test is not enabled; skip the test") + } + }) + + AfterAll(func() { + // Ensure the removal of the pod in case the test fails and the object was inadvertently created. + Eventually(func() error { + pod := corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: podName, + Namespace: "default", + }, + } + if err := hubClient.Delete(ctx, &pod); err != nil { + if k8sErrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to delete pod: %w", err) + } + + if err := hubClient.Get(ctx, types.NamespacedName{Name: podName, Namespace: "default"}, &corev1.Pod{}); err != nil { + if k8sErrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to retrieve pod: %w", err) + } + return fmt.Errorf("pod still exists") + }, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to delete pod") + + // Ensure the removal of the replica set in case the test fails and the object was inadvertently created. + Eventually(func() error { + replicaSet := appsv1.ReplicaSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: replicaSetName, + Namespace: "default", + }, + } + if err := hubClient.Delete(ctx, &replicaSet); err != nil { + if k8sErrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to delete replica set: %w", err) + } + + if err := hubClient.Get(ctx, types.NamespacedName{Name: replicaSetName, Namespace: "default"}, &appsv1.ReplicaSet{}); err != nil { + if k8sErrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to retrieve replica set: %w", err) + } + return fmt.Errorf("replica set still exists") + }, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to delete replica set") + }) + + It("should deny creation of pods in the default namespace for non-whitelisted users", func() { + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: podName, + Namespace: "default", + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "nginx", + Image: "nginx", + }, + }, + }, + } + wantErrMsg := "creating pods and replicas is disallowed in the fleet hub cluster" + Expect(checkIfStatusErrorWithMessage(impersonateHubClient.Create(ctx, pod), wantErrMsg)).Should(Succeed(), "Failed to deny creation of pod in default namespace") + }) + + It("should deny creation of replica sets in the default namespace for non-whitelisted users", func() { + replicaSet := &appsv1.ReplicaSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: replicaSetName, + Namespace: "default", + }, + Spec: appsv1.ReplicaSetSpec{ + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"app": "nginx"}, + }, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "nginx"}, + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "nginx", + Image: "nginx", + }, + }, + }, + }, + }, + } + wantErrMsg := "creating pods and replicas is disallowed in the fleet hub cluster" + Expect(checkIfStatusErrorWithMessage(impersonateHubClient.Create(ctx, replicaSet), wantErrMsg)).Should(Succeed(), "Failed to deny creation of replica set in default namespace") + }) +}) diff --git a/test/e2e/admission_policy_manager_cfg.yaml b/test/e2e/admission_policy_manager_cfg.yaml new file mode 100644 index 000000000..b1800759f --- /dev/null +++ b/test/e2e/admission_policy_manager_cfg.yaml @@ -0,0 +1,11 @@ +# Note: for the internal fork repo, this file is only used to signal the test +# setup which VAPs are enabled or not. The specific configurations for each +# VAP are not in use. +denyPodsAndReplicaSetsOutsideReservedNamespaces: + ReservedNamespacePrefixes: + - fleet- + - kube- +denyServiceAccountsAndTokenRequestsInReservedNamespaces: + ReservedNamespacePrefixes: + - fleet- + - kube- diff --git a/test/e2e/fleet_guard_rail_test.go b/test/e2e/fleet_guard_rail_test.go index b073ccd11..67210736e 100644 --- a/test/e2e/fleet_guard_rail_test.go +++ b/test/e2e/fleet_guard_rail_test.go @@ -40,6 +40,7 @@ import ( clusterv1beta1 "go.goms.io/fleet/apis/cluster/v1beta1" placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1" + "go.goms.io/fleet/pkg/admissionpolicymanager" "go.goms.io/fleet/pkg/utils" "go.goms.io/fleet/pkg/webhook/validation" @@ -1619,6 +1620,10 @@ var _ = Describe("fleet guard rail webhook tests for service accounts in restric var svcAccount *corev1.ServiceAccount BeforeAll(func() { + if EnabledVAPGenerators.Has(admissionpolicymanager.SvcAccountsAndTokenRequestsVAPGeneratorName) { + Skip("A VAP that serves the same purpose is enabled; skip this test as the request will be rejected by the VAP instead") + } + // Create a service account in the kube-system namespace. svcAccount = &corev1.ServiceAccount{ ObjectMeta: metav1.ObjectMeta{ diff --git a/test/e2e/setup_test.go b/test/e2e/setup_test.go index 56e7c4ef4..4d3b1307c 100644 --- a/test/e2e/setup_test.go +++ b/test/e2e/setup_test.go @@ -35,12 +35,14 @@ import ( apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/util/sets" k8sscheme "k8s.io/client-go/kubernetes/scheme" "k8s.io/klog/v2" clusterinventory "sigs.k8s.io/cluster-inventory-api/apis/v1alpha1" "sigs.k8s.io/controller-runtime/pkg/client" ctrllog "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/log/zap" + "sigs.k8s.io/yaml" fleetnetworkingv1alpha1 "go.goms.io/fleet-networking/api/v1alpha1" @@ -48,6 +50,7 @@ import ( placementv1 "go.goms.io/fleet/apis/placement/v1" placementv1alpha1 "go.goms.io/fleet/apis/placement/v1alpha1" placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1" + "go.goms.io/fleet/pkg/admissionpolicymanager" "go.goms.io/fleet/pkg/propertyprovider/azure/trackers" "go.goms.io/fleet/pkg/utils" testv1alpha1 "go.goms.io/fleet/test/apis/v1alpha1" @@ -188,6 +191,15 @@ var ( memberCluster3AKSRegion = "eastasia" ) +const ( + VAPConfigFileName = "admission_policy_manager_cfg.yaml" +) + +var ( + // The set will be populated when the test suite is initialized. + EnabledVAPGenerators = sets.Set[string]{} +) + var ( lessFuncCondition = func(a, b metav1.Condition) bool { return a.Type < b.Type @@ -304,6 +316,20 @@ func TestMain(m *testing.M) { log.Fatalf("failed to add cluster inventory APIs to the runtime scheme: %v", err) } + // Read the VAP generator configuration. + // + // The configuration file is required for the hub agent to start up in the current test environment + // setup; see setup.sh for more information. + data, err := os.ReadFile(VAPConfigFileName) + if err != nil { + log.Fatalf("failed to read VAP generator configuration from %s: %v", VAPConfigFileName, err) + } + configs := &admissionpolicymanager.PolicyGeneratorConfigs{} + if err := yaml.Unmarshal(data, configs); err != nil { + log.Fatalf("failed to unmarshal VAP generator configuration from %s: %v", VAPConfigFileName, err) + } + EnabledVAPGenerators = configs.EnabledGenerators() + os.Exit(m.Run()) }