From f62ec8a164f29ddf35ee7bde0fa4c6f6cef23738 Mon Sep 17 00:00:00 2001 From: Eric Hole Date: Fri, 22 May 2026 16:42:33 -0700 Subject: [PATCH 1/3] GCP: GKE private nodes, DNS access, and Cloud NAT Enables private nodes paired with --enable-ip-alias, adds DNS-based control plane access (--enable-dns-access, --enable-ip-access, and --master-ipv4-cidr), and provisions Cloud NAT at the network level (not at cluster create). --- perfkitbenchmarker/providers/gcp/flags.py | 40 +++++++++++++++ .../providers/gcp/gce_network.py | 51 +++++++++++++++++++ .../providers/gcp/google_kubernetes_engine.py | 19 +++++++ 3 files changed, 110 insertions(+) diff --git a/perfkitbenchmarker/providers/gcp/flags.py b/perfkitbenchmarker/providers/gcp/flags.py index a56fe72b99..92c1926202 100644 --- a/perfkitbenchmarker/providers/gcp/flags.py +++ b/perfkitbenchmarker/providers/gcp/flags.py @@ -561,6 +561,46 @@ False, 'Whether to enable shielded nodes.', ) +GKE_ENABLE_PRIVATE_NODES = flags.DEFINE_boolean( + 'gke_enable_private_nodes', + False, + 'Whether to create the cluster with private nodes (nodes have only internal IPs).', +) +GKE_ENABLE_DNS_ACCESS = flags.DEFINE_boolean( + 'gke_enable_dns_access', + False, + 'Whether to enable DNS-based control plane access (replaces the public/private IP endpoint model).', +) +GKE_ENABLE_IP_ACCESS = flags.DEFINE_boolean( + 'gke_enable_ip_access', + True, + 'Whether to enable IP-based control plane access. Disabling requires DNS access and is mutually exclusive with public clusters (nodes with public IPs).', +) +GKE_MASTER_IPV4_CIDR = flags.DEFINE_string( + 'gke_master_ipv4_cidr', + None, + 'CIDR range to use for the hosted master network. Required when private nodes are enabled without DNS access.', +) + + +def _ValidateGkePrivateNodeFlags(flags_dict): + if not flags_dict['gke_enable_ip_access'] and not flags_dict['gke_enable_dns_access']: + raise flags.ValidationError( + '--no-gke_enable_ip_access requires --gke_enable_dns_access.' 
+ ) + if (flags_dict['gke_enable_private_nodes'] and + not flags_dict['gke_enable_dns_access'] and + not flags_dict['gke_master_ipv4_cidr']): + raise flags.ValidationError( + '--gke_enable_private_nodes without --gke_enable_dns_access requires --gke_master_ipv4_cidr.' + ) + return True + + +flags.register_multi_flags_validator( + ['gke_enable_ip_access', 'gke_enable_dns_access', 'gke_enable_private_nodes', 'gke_master_ipv4_cidr'], + _ValidateGkePrivateNodeFlags, +) GKE_ADDONS = flags.DEFINE_string( 'gke_addons', '', diff --git a/perfkitbenchmarker/providers/gcp/gce_network.py b/perfkitbenchmarker/providers/gcp/gce_network.py index c4c666d478..2f32189f6e 100644 --- a/perfkitbenchmarker/providers/gcp/gce_network.py +++ b/perfkitbenchmarker/providers/gcp/gce_network.py @@ -914,6 +914,7 @@ class GceNetwork(network.BaseNetwork): def __init__(self, network_spec: GceNetworkSpec): super().__init__(network_spec) self.project: str | None = network_spec.project + self._zone: str = network_spec.zone self.vpn_gateway: Dict[str, GceVpnGateway] = {} # Figuring out the type of network here. @@ -1231,6 +1232,52 @@ def _GetNumberVms(self) -> int: for group_spec in benchmark_spec.config.vm_groups.values() ) + def _CreateCloudNat(self): + """Provision a Cloud Router + NAT so private resources can egress. + + Called during network provisioning so NAT has time to fully propagate + before any cluster lifecycle code starts. Shared across all resources + in the network. 
+ """ + region = util.GetRegionFromZone(self._zone) + router_name = f'{self.primary_subnet_name}-router' + nat_name = f'{self.primary_subnet_name}-nat' + + router_cmd = util.GcloudCommand( + self, 'compute', 'routers', 'create', router_name) + router_cmd.flags['network'] = self.primary_subnet_name + router_cmd.flags['region'] = region + router_cmd.flags.pop('zone', None) + router_cmd.Issue() + + nat_cmd = util.GcloudCommand( + self, 'compute', 'routers', 'nats', 'create', nat_name) + nat_cmd.flags['router'] = router_name + nat_cmd.flags['region'] = region + nat_cmd.flags.pop('zone', None) + nat_cmd.args.append('--auto-allocate-nat-external-ips') + nat_cmd.args.append('--nat-all-subnet-ip-ranges') + nat_cmd.Issue() + + def _DeleteCloudNat(self): + """Best-effort teardown of the NAT and router this network created.""" + region = util.GetRegionFromZone(self._zone) + router_name = f'{self.primary_subnet_name}-router' + nat_name = f'{self.primary_subnet_name}-nat' + + nat_cmd = util.GcloudCommand( + self, 'compute', 'routers', 'nats', 'delete', nat_name) + nat_cmd.flags['router'] = router_name + nat_cmd.flags['region'] = region + nat_cmd.flags.pop('zone', None) + nat_cmd.Issue(raise_on_failure=False) + + router_cmd = util.GcloudCommand( + self, 'compute', 'routers', 'delete', router_name) + router_cmd.flags['region'] = region + router_cmd.flags.pop('zone', None) + router_cmd.Issue(raise_on_failure=False) + def Create(self): """Creates the actual network.""" if not self.is_existing_network: @@ -1244,6 +1291,8 @@ def Create(self): lambda rule: self.external_nets_rules[rule].Create(), list(self.external_nets_rules.keys()), ) + if gcp_flags.GKE_ENABLE_PRIVATE_NODES.value: + self._CreateCloudNat() if getattr(self, 'vpn_gateway', False): background_tasks.RunThreaded( lambda gateway: self.vpn_gateway[gateway].Create(), @@ -1257,6 +1306,8 @@ def Delete(self): if self.placement_group: self.placement_group.Delete() if not self.is_existing_network: + if 
gcp_flags.GKE_ENABLE_PRIVATE_NODES.value: + self._DeleteCloudNat() if getattr(self, 'vpn_gateway', False): background_tasks.RunThreaded( lambda gateway: self.vpn_gateway[gateway].Delete(), diff --git a/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py b/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py index 6b0076aa69..9f5946a961 100644 --- a/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py +++ b/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py @@ -168,6 +168,25 @@ def _RunClusterCreateCommand(self, cmd: util.GcloudCommand): ) cmd.flags['release-channel'] = self.release_channel + if gcp_flags.GKE_ENABLE_PRIVATE_NODES.value: + cmd.args.append('--enable-private-nodes') + # GKE requires VPC-native (alias IPs) when private nodes are enabled. + # Without this gcloud rejects the create with: + # Cannot specify --enable-private-nodes without --enable-ip-alias. + cmd.args.append('--enable-ip-alias') + else: + cmd.args.append('--no-enable-private-nodes') + if gcp_flags.GKE_ENABLE_DNS_ACCESS.value: + cmd.args.append('--enable-dns-access') + else: + cmd.args.append('--no-enable-dns-access') + if gcp_flags.GKE_ENABLE_IP_ACCESS.value: + cmd.args.append('--enable-ip-access') + else: + cmd.args.append('--no-enable-ip-access') + if gcp_flags.GKE_MASTER_IPV4_CIDR.value: + cmd.flags['master-ipv4-cidr'] = gcp_flags.GKE_MASTER_IPV4_CIDR.value + if FLAGS.gke_enable_alpha: cmd.args.append('--enable-kubernetes-alpha') cmd.args.append('--no-enable-autorepair') From 08cd9436c0d4f725578cdfab80780869371e34ed Mon Sep 17 00:00:00 2001 From: Eric Hole Date: Fri, 22 May 2026 16:41:14 -0700 Subject: [PATCH 2/3] GCP: add --gke_enable_dataplane_v2 for cluster creation Adds an opt-in flag to enable GKE Dataplane V2 (eBPF datapath, Cilium under the hood) when provisioning a cluster. Off by default. Requires cluster recreation; the flag has no effect on an existing cluster. 
--- perfkitbenchmarker/providers/gcp/flags.py | 6 ++++++ .../providers/gcp/google_kubernetes_engine.py | 2 ++ 2 files changed, 8 insertions(+) diff --git a/perfkitbenchmarker/providers/gcp/flags.py b/perfkitbenchmarker/providers/gcp/flags.py index 92c1926202..6e68bbb4b8 100644 --- a/perfkitbenchmarker/providers/gcp/flags.py +++ b/perfkitbenchmarker/providers/gcp/flags.py @@ -581,6 +581,12 @@ None, 'CIDR range to use for the hosted master network. Required when private nodes are enabled without DNS access.', ) +GKE_ENABLE_DATAPLANE_V2 = flags.DEFINE_boolean( + 'gke_enable_dataplane_v2', + False, + 'Whether to enable GKE Dataplane V2 (eBPF-based datapath, Cilium under the hood). ' + 'Requires cluster recreation; cannot be toggled on an existing cluster.', +) def _ValidateGkePrivateNodeFlags(flags_dict): diff --git a/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py b/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py index 9f5946a961..fd77313884 100644 --- a/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py +++ b/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py @@ -184,6 +184,8 @@ def _RunClusterCreateCommand(self, cmd: util.GcloudCommand): cmd.args.append('--enable-ip-access') else: cmd.args.append('--no-enable-ip-access') + if gcp_flags.GKE_ENABLE_DATAPLANE_V2.value: + cmd.args.append('--enable-dataplane-v2') if gcp_flags.GKE_MASTER_IPV4_CIDR.value: cmd.flags['master-ipv4-cidr'] = gcp_flags.GKE_MASTER_IPV4_CIDR.value From d30d17df57182fb0bcfb7733f2641a4c5b803ce2 Mon Sep 17 00:00:00 2001 From: Eric Hole Date: Tue, 26 May 2026 15:49:50 -0700 Subject: [PATCH 3/3] gcp/gke: add --gke_enable_agent_sandbox to wire --enable-agent-sandbox GKE Agent Sandbox is a managed feature that installs the agent-sandbox controller and CRDs onto the cluster, enabling SandboxClaim / Sandbox / SandboxWarmPool reconciliation by GKE. 
This is separate from the gvisor sandbox runtime (already exposed via --sandbox=type=gvisor on the node pool spec): the runtime makes RuntimeClass gvisor available; this flag makes the controller available to manage CRs that use it. PKB previously had no way to enable it at cluster create time. With this flag, scenarios that want the managed controller can set --gke_enable_agent_sandbox=true and PKB appends --enable-agent-sandbox to the gcloud container clusters create command. Requires GKE 1.35.2-gke.1269000 or later. See https://docs.cloud.google.com/kubernetes-engine/docs/how-to/agent-sandbox. --- perfkitbenchmarker/providers/gcp/flags.py | 10 ++++++++++ .../providers/gcp/google_kubernetes_engine.py | 2 ++ 2 files changed, 12 insertions(+) diff --git a/perfkitbenchmarker/providers/gcp/flags.py b/perfkitbenchmarker/providers/gcp/flags.py index 6e68bbb4b8..f14cc6b55f 100644 --- a/perfkitbenchmarker/providers/gcp/flags.py +++ b/perfkitbenchmarker/providers/gcp/flags.py @@ -587,6 +587,16 @@ 'Whether to enable GKE Dataplane V2 (eBPF-based datapath, Cilium under the hood). ' 'Requires cluster recreation; cannot be toggled on an existing cluster.', ) +GKE_ENABLE_AGENT_SANDBOX = flags.DEFINE_boolean( + 'gke_enable_agent_sandbox', + False, + 'Whether to enable the GKE Agent Sandbox controller on the cluster. ' + 'Installs the managed agent-sandbox controller and CRDs, enabling ' + 'SandboxClaim/Sandbox/SandboxWarmPool reconciliation by GKE. This is ' + 'separate from the gvisor sandbox runtime (--sandbox=type=gvisor on a ' + 'node pool). Requires GKE 1.35.2-gke.1269000 or later. 
See ' + 'https://docs.cloud.google.com/kubernetes-engine/docs/how-to/agent-sandbox.', +) def _ValidateGkePrivateNodeFlags(flags_dict): diff --git a/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py b/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py index fd77313884..9c2bfc6b4b 100644 --- a/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py +++ b/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py @@ -186,6 +186,8 @@ def _RunClusterCreateCommand(self, cmd: util.GcloudCommand): cmd.args.append('--no-enable-ip-access') if gcp_flags.GKE_ENABLE_DATAPLANE_V2.value: cmd.args.append('--enable-dataplane-v2') + if gcp_flags.GKE_ENABLE_AGENT_SANDBOX.value: + cmd.args.append('--enable-agent-sandbox') if gcp_flags.GKE_MASTER_IPV4_CIDR.value: cmd.flags['master-ipv4-cidr'] = gcp_flags.GKE_MASTER_IPV4_CIDR.value