diff --git a/perfkitbenchmarker/providers/gcp/flags.py b/perfkitbenchmarker/providers/gcp/flags.py index a56fe72b9..f14cc6b55 100644 --- a/perfkitbenchmarker/providers/gcp/flags.py +++ b/perfkitbenchmarker/providers/gcp/flags.py @@ -561,6 +561,62 @@ False, 'Whether to enable shielded nodes.', ) +GKE_ENABLE_PRIVATE_NODES = flags.DEFINE_boolean( + 'gke_enable_private_nodes', + False, + 'Whether to create the cluster with private nodes (nodes have only internal IPs).', +) +GKE_ENABLE_DNS_ACCESS = flags.DEFINE_boolean( + 'gke_enable_dns_access', + False, + 'Whether to enable DNS-based control plane access (replaces the public/private IP endpoint model).', +) +GKE_ENABLE_IP_ACCESS = flags.DEFINE_boolean( + 'gke_enable_ip_access', + True, + 'Whether to enable IP-based control plane access. Disabling requires DNS access and is mutually exclusive with public clusters (nodes with public IPs).', +) +GKE_MASTER_IPV4_CIDR = flags.DEFINE_string( + 'gke_master_ipv4_cidr', + None, + 'CIDR range to use for the hosted master network. Required when private nodes are enabled without DNS access.', +) +GKE_ENABLE_DATAPLANE_V2 = flags.DEFINE_boolean( + 'gke_enable_dataplane_v2', + False, + 'Whether to enable GKE Dataplane V2 (eBPF-based datapath, Cilium under the hood). ' + 'Requires cluster recreation; cannot be toggled on an existing cluster.', +) +GKE_ENABLE_AGENT_SANDBOX = flags.DEFINE_boolean( + 'gke_enable_agent_sandbox', + False, + 'Whether to enable the GKE Agent Sandbox controller on the cluster. ' + 'Installs the managed agent-sandbox controller and CRDs, enabling ' + 'SandboxClaim/Sandbox/SandboxWarmPool reconciliation by GKE. This is ' + 'separate from the gvisor sandbox runtime (--sandbox=type=gvisor on a ' + 'node pool). Requires GKE 1.35.2-gke.1269000 or later. 
def _ValidateGkePrivateNodeFlags(flags_dict):
  """Validates the combination of GKE control-plane access flags.

  Written to serve as the callback of a multi-flag validator covering the
  four flags read below.

  Args:
    flags_dict: Mapping from flag name to its current value. Reads
      'gke_enable_ip_access', 'gke_enable_dns_access',
      'gke_enable_private_nodes' and 'gke_master_ipv4_cidr'.

  Returns:
    True when neither of the checks below fails.

  Raises:
    flags.ValidationError: If IP access and DNS access are both disabled, or
      if private nodes are enabled while DNS access is off and no master
      CIDR was supplied.
  """
  # At least one control-plane access mode (IP or DNS) has to stay enabled.
  if not (
      flags_dict['gke_enable_ip_access']
      or flags_dict['gke_enable_dns_access']
  ):
    raise flags.ValidationError(
        '--no-gke_enable_ip_access requires --gke_enable_dns_access.'
    )

  # A master CIDR is only demanded for private nodes without DNS access;
  # every other combination is accepted as-is.
  if not flags_dict['gke_enable_private_nodes']:
    return True
  if flags_dict['gke_enable_dns_access']:
    return True
  if flags_dict['gke_master_ipv4_cidr']:
    return True
  raise flags.ValidationError(
      '--gke_enable_private_nodes without --gke_enable_dns_access requires'
      ' --gke_master_ipv4_cidr.'
  )
def _DeleteCloudNat(self):
  """Removes the Cloud NAT and then its Cloud Router, tolerating failures.

  Resource names are derived from self.primary_subnet_name
  ('<subnet>-nat' and '<subnet>-router') and the region from self._zone.
  The NAT delete is issued before the router delete. Both commands are
  issued with raise_on_failure=False, so a failed delete does not abort the
  rest of the teardown.
  """
  target_region = util.GetRegionFromZone(self._zone)
  router = '{}-router'.format(self.primary_subnet_name)
  nat = '{}-nat'.format(self.primary_subnet_name)

  # (gcloud positional args, flags to set before the shared region handling).
  # Order matters: the NAT is deleted first, then the router it hangs off.
  teardown_steps = (
      (('compute', 'routers', 'nats', 'delete', nat), {'router': router}),
      (('compute', 'routers', 'delete', router), {}),
  )
  for gcloud_args, extra_flags in teardown_steps:
    delete_cmd = util.GcloudCommand(self, *gcloud_args)
    for flag_name, flag_value in extra_flags.items():
      delete_cmd.flags[flag_name] = flag_value
    # These commands are addressed by region: set it and drop any 'zone'
    # flag the command may carry.
    delete_cmd.flags['region'] = target_region
    delete_cmd.flags.pop('zone', None)
    delete_cmd.Issue(raise_on_failure=False)
gcp_flags.GKE_ENABLE_PRIVATE_NODES.value: + self._DeleteCloudNat() if getattr(self, 'vpn_gateway', False): background_tasks.RunThreaded( lambda gateway: self.vpn_gateway[gateway].Delete(), diff --git a/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py b/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py index 6b0076aa6..9c2bfc6b4 100644 --- a/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py +++ b/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py @@ -168,6 +168,29 @@ def _RunClusterCreateCommand(self, cmd: util.GcloudCommand): ) cmd.flags['release-channel'] = self.release_channel + if gcp_flags.GKE_ENABLE_PRIVATE_NODES.value: + cmd.args.append('--enable-private-nodes') + # GKE requires VPC-native (alias IPs) when private nodes are enabled. + # Without this gcloud rejects the create with: + # Cannot specify --enable-private-nodes without --enable-ip-alias. + cmd.args.append('--enable-ip-alias') + else: + cmd.args.append('--no-enable-private-nodes') + if gcp_flags.GKE_ENABLE_DNS_ACCESS.value: + cmd.args.append('--enable-dns-access') + else: + cmd.args.append('--no-enable-dns-access') + if gcp_flags.GKE_ENABLE_IP_ACCESS.value: + cmd.args.append('--enable-ip-access') + else: + cmd.args.append('--no-enable-ip-access') + if gcp_flags.GKE_ENABLE_DATAPLANE_V2.value: + cmd.args.append('--enable-dataplane-v2') + if gcp_flags.GKE_ENABLE_AGENT_SANDBOX.value: + cmd.args.append('--enable-agent-sandbox') + if gcp_flags.GKE_MASTER_IPV4_CIDR.value: + cmd.flags['master-ipv4-cidr'] = gcp_flags.GKE_MASTER_IPV4_CIDR.value + if FLAGS.gke_enable_alpha: cmd.args.append('--enable-kubernetes-alpha') cmd.args.append('--no-enable-autorepair')