diff --git a/.github/buildomat/common.sh b/.github/buildomat/common.sh index f8df2081..2d2df618 100644 --- a/.github/buildomat/common.sh +++ b/.github/buildomat/common.sh @@ -1,14 +1,17 @@ #!/bin/bash -# The tofino2 has 20 stages, and the current sidecar.p4 needs all 20 of them. -# Specifying the number of stages isn't strictly necessary, but it allows us to -# track when we exceed the current ceiling. The underlying intention is to grow -# deliberately and thoughtfully, given the limited space on the ASIC. +# The tofino2 has 20 stages. The base sidecar.p4 builds at 16 stages, and, +# with multicast enabled, at 19. Specifying the number of stages isn't +# strictly necessary, but it allows us to track when we exceed the current +# ceiling. The underlying intention is to grow deliberately and thoughtfully, +# given the limited space on the ASIC. # -# Note: this now seems silly since we have maxed out the number of stages, but -# we want to leave this check and note in place should we ever find a way to -# reduce our footprint below 20 stages. -TOFINO_STAGES=20 +# Note: p4c does multiple placement rounds. table_summary.log reports each +# round. The first (unconstrained) is informational, and the final (at the +# bottom of the log) is what the binary actually uses. If +# --num-stages-override cannot be satisfied, the assembler errors out and +# no binary is produced. 
+TOFINO_STAGES=16 # These describe which version of the SDE to download and where to find it SDE_COMMIT=2a6b33211c9675996dcb99fe939045506667ae94 diff --git a/.github/buildomat/packet-test-common.sh b/.github/buildomat/packet-test-common.sh index e8d81cd6..2da95153 100755 --- a/.github/buildomat/packet-test-common.sh +++ b/.github/buildomat/packet-test-common.sh @@ -17,7 +17,13 @@ if [ x$MULTICAST == x ]; then CODEGEN_FEATURES=--multicast SWADM_FEATURES=--features=multicast fi - + +if [ x$MULTICAST == x ]; then + TOFINO_STAGES=16 + else + TOFINO_STAGES=19 +fi + function cleanup { set +o errexit set +o pipefail diff --git a/README.md b/README.md index 98b8fa1e..0d3df5d2 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,11 @@ To build the p4 program: $ cargo xtask codegen [ -n ] [--sde ] ``` +*Note*: The final stage count for the binary is reported in +`target/proto/opt/oxide/dendrite/sidecar/pipe/logs/table_summary.log`. +If the requested stage allotment cannot be met, the assembler errors +out and no binary is produced. + The Tofino model is not yet available for `illumos`/`helios`. To run the compiled p4 program on the Tofino model on a Linux system: diff --git a/asic/src/softnpu/table.rs b/asic/src/softnpu/table.rs index 56d9c1fe..322b9e70 100644 --- a/asic/src/softnpu/table.rs +++ b/asic/src/softnpu/table.rs @@ -112,6 +112,26 @@ impl TableOps for Table { trace!(hdl.log, "match_data:\n{:#?}", match_data); trace!(hdl.log, "action_data:\n{:#?}", action_data); + // Route tables are idx-only in sidecar-lite and route_ttl_is_1 + // is ignored here. + // + // TODO: remove compat once https://github.com/oxidecomputer/sidecar-lite/pull/152 + // is merged and sidecar-lite updates route keys/actions accordingly. 
+ let is_route_table = matches!( + self.type_, + TableType::RouteFwdIpv4 | TableType::RouteFwdIpv6 + ); + if is_route_table { + if route_ttl_is_1(&match_data.fields) { + trace!(hdl.log, "skipping ttl==1 route entry for {name}"); + return Ok(()); + } + if action_data.action == "ttl_exceeded" { + trace!(hdl.log, "skipping ttl_exceeded action for {name}"); + return Ok(()); + } + } + let keyset_data = keyset_data(match_data.fields, self.type_); let (action, parameter_data) = match ( @@ -428,6 +448,22 @@ impl TableOps for Table { } ("rewrite", params) } + #[cfg(feature = "multicast")] + (TableType::PortMacAddressMcast, "rewrite") => { + let mut params = Vec::new(); + for arg in action_data.args { + match arg.value { + ValueTypes::U64(v) => { + let mac = v.to_le_bytes(); + params.extend_from_slice(&mac[0..6]); + } + ValueTypes::Ptr(v) => { + params.extend_from_slice(v.as_slice()); + } + } + } + ("rewrite", params) + } (TableType::NatIngressIpv4, "forward_ipv4_to") | (TableType::NatIngressIpv6, "forward_ipv6_to") | (TableType::AttachedSubnetIpv4, "forward_to_v4") @@ -573,6 +609,15 @@ impl TableOps for Table { trace!(hdl.log, "table: {name}"); trace!(hdl.log, "match_data:\n{:#?}", match_data); + let is_route_table = matches!( + self.type_, + TableType::RouteFwdIpv4 | TableType::RouteFwdIpv6 + ); + if is_route_table && route_ttl_is_1(&match_data.fields) { + trace!(hdl.log, "skipping ttl==1 route entry delete for {name}"); + return Ok(()); + } + let keyset_data = keyset_data(match_data.fields, self.type_); trace!(hdl.log, "sending request to softnpu"); @@ -632,10 +677,12 @@ fn keyset_data(match_data: Vec, table: TableType) -> Vec { serialize_value_type(&x, &mut data); keyset_data.extend_from_slice(&data[..2]); } - TableType::RouteIdxIpv4 => { - // "idx" => exact => bit<16> - serialize_value_type(&x, &mut data); - keyset_data.extend_from_slice(&data[..2]); + TableType::RouteFwdIpv4 | TableType::RouteFwdIpv6 => { + // sidecar-lite route keys are idx-only. 
+ if m.name == "idx" { + serialize_value_type(&x, &mut data); + keyset_data.extend_from_slice(&data[..2]); + } } TableType::NatIngressIpv4 => { // "dst_addr" => hdr.ipv4.dst: exact => bit<32> @@ -725,3 +772,18 @@ fn serialize_value_type_be(x: &ValueTypes, data: &mut Vec) { } } } + +fn route_ttl_is_1(fields: &[MatchEntryField]) -> bool { + fields.iter().any(|field| { + if field.name != "route_ttl_is_1" { + return false; + } + match &field.value { + MatchEntryValue::Value(ValueTypes::U64(v)) => *v != 0, + MatchEntryValue::Value(ValueTypes::Ptr(v)) => { + v.first().is_some_and(|b| *b != 0) + } + _ => false, + } + }) +} diff --git a/asic/src/tofino_common/mod.rs b/asic/src/tofino_common/mod.rs index 7e4ff064..af23636c 100644 --- a/asic/src/tofino_common/mod.rs +++ b/asic/src/tofino_common/mod.rs @@ -24,16 +24,16 @@ pub mod ports; fn table_name(type_: TableType) -> &'static str { match type_ { TableType::RouteIdxIpv4 => { - "pipe.Ingress.l3_router.Router4.lookup_idx.lookup" + "pipe.Ingress.l3_router.router4.lookup_idx.lookup" } TableType::RouteFwdIpv4 => { - "pipe.Ingress.l3_router.Router4.lookup_idx.route" + "pipe.Ingress.l3_router.router4.lookup_idx.route" } TableType::RouteIdxIpv6 => { - "pipe.Ingress.l3_router.Router6.lookup_idx.lookup" + "pipe.Ingress.l3_router.router6.lookup_idx.lookup" } TableType::RouteFwdIpv6 => { - "pipe.Ingress.l3_router.Router6.lookup_idx.route" + "pipe.Ingress.l3_router.router6.lookup_idx.route" } #[cfg(feature = "multicast")] TableType::RouteIpv4Mcast => { @@ -45,13 +45,15 @@ fn table_name(type_: TableType) -> &'static str { } TableType::ArpIpv4 => "pipe.Ingress.l3_router.Arp.tbl", TableType::NeighborIpv6 => "pipe.Ingress.l3_router.Ndp.tbl", - TableType::PortMacAddress => "pipe.Ingress.mac_rewrite.mac_rewrite", + TableType::PortMacAddress => { + "pipe.Egress.unicast_mac_rewrite.mac_rewrite" + } TableType::PortAddrIpv4 => "pipe.Ingress.filter.switch_ipv4_addr", TableType::PortAddrIpv6 => "pipe.Ingress.filter.switch_ipv6_addr", 
TableType::NatIngressIpv4 => "pipe.Ingress.nat_ingress.ingress_ipv4", TableType::NatIngressIpv6 => "pipe.Ingress.nat_ingress.ingress_ipv6", TableType::UplinkIngress => "pipe.Ingress.filter.uplink_ports", - TableType::UplinkEgress => "pipe.Ingress.egress_filter.egress_filter", + TableType::UplinkEgress => "pipe.Egress.egress_filter.egress_filter", TableType::AttachedSubnetIpv4 => { "pipe.Ingress.attached_subnet_ingress.attached_subnets_v4" } @@ -79,7 +81,9 @@ fn table_name(type_: TableType) -> &'static str { "pipe.Ingress.nat_ingress.ingress_ipv6_mcast" } #[cfg(feature = "multicast")] - TableType::PortMacAddressMcast => "pipe.Egress.mac_rewrite.mac_rewrite", + TableType::PortMacAddressMcast => { + "pipe.Egress.mcast_mac_rewrite.mac_rewrite" + } #[cfg(feature = "multicast")] TableType::McastEgressDecapPorts => { "pipe.Egress.mcast_egress.tbl_decap_ports" @@ -97,9 +101,13 @@ fn counter_table_name(id: CounterId) -> &'static str { CounterId::Service => "pipe.Ingress.services.service_ctr", CounterId::Ingress => "pipe.Ingress.ingress_ctr", CounterId::Packet => "pipe.Ingress.packet_ctr", - CounterId::Egress => "pipe.Ingress.egress_ctr", CounterId::DropPort => "pipe.Ingress.drop_port_ctr", CounterId::DropReason => "pipe.Ingress.drop_reason_ctr", + CounterId::Forwarded => "pipe.Egress.forwarded_ctr", + CounterId::Unicast => "pipe.Egress.unicast_ctr", + CounterId::MulticastLL => "pipe.Egress.link_local_mcast_ctr", + CounterId::EgressDropPort => "pipe.Egress.drop_port_ctr", + CounterId::EgressDropReason => "pipe.Egress.drop_reason_ctr", #[cfg(feature = "multicast")] CounterId::Multicast(id) => mulitcast_counter_table_name(id), } @@ -108,16 +116,9 @@ fn counter_table_name(id: CounterId) -> &'static str { #[cfg(feature = "multicast")] fn mulitcast_counter_table_name(id: MulticastCounterId) -> &'static str { match id { - MulticastCounterId::EgressDropPort => "pipe.Egress.drop_port_ctr", - MulticastCounterId::EgressDropReason => "pipe.Egress.drop_reason_ctr", - 
MulticastCounterId::Unicast => "pipe.Egress.unicast_ctr", MulticastCounterId::Multicast => "pipe.Egress.mcast_ctr", MulticastCounterId::MulticastExt => "pipe.Egress.external_mcast_ctr", - MulticastCounterId::MulticastLL => "pipe.Egress.link_local_mcast_ctr", MulticastCounterId::MulticastUL => "pipe.Egress.underlay_mcast_ctr", - MulticastCounterId::MulticastDrop => { - "pipe.Ingress.filter.drop_mcast_ctr" - } } } diff --git a/common/src/counters.rs b/common/src/counters.rs index be4d19c3..387954ac 100644 --- a/common/src/counters.rs +++ b/common/src/counters.rs @@ -205,10 +205,16 @@ pub struct FecRSCounters { pub enum CounterId { Service, Ingress, - Egress, Packet, DropPort, DropReason, + Forwarded, + Unicast, + /// Link-local IPv6 multicast (ff02::/16). Not feature-gated because + /// link-local forwarding uses standard routing, not replication groups. + MulticastLL, + EgressDropPort, + EgressDropReason, #[cfg(feature = "multicast")] Multicast(MulticastCounterId), } @@ -227,14 +233,9 @@ pub enum CounterId { )] #[cfg(feature = "multicast")] pub enum MulticastCounterId { - EgressDropPort, - EgressDropReason, - Unicast, Multicast, MulticastExt, - MulticastLL, MulticastUL, - MulticastDrop, } impl fmt::Display for CounterId { @@ -245,10 +246,14 @@ impl fmt::Display for CounterId { match self { CounterId::Service => "Service".to_string(), CounterId::Ingress => "Ingress".to_string(), - CounterId::Egress => "Egress".to_string(), CounterId::Packet => "Packet".to_string(), CounterId::DropPort => "Ingress_Drop_Port".to_string(), CounterId::DropReason => "Ingress_Drop_Reason".to_string(), + CounterId::Forwarded => "Forwarded".to_string(), + CounterId::Unicast => "Unicast".to_string(), + CounterId::MulticastLL => "Multicast_Link_Local".to_string(), + CounterId::EgressDropPort => "Egress_Drop_Port".to_string(), + CounterId::EgressDropReason => "Egress_Drop_Reason".to_string(), #[cfg(feature = "multicast")] CounterId::Multicast(id) => id.to_string(), } @@ -263,36 +268,25 @@ impl 
std::str::FromStr for CounterId { match s.to_lowercase().replace(['_'], "").as_str() { "service" => Ok(CounterId::Service), "ingress" => Ok(CounterId::Ingress), - "egress" => Ok(CounterId::Egress), "packet" => Ok(CounterId::Packet), "ingressdropport" => Ok(CounterId::DropPort), "ingressdropreason" => Ok(CounterId::DropReason), + "forwarded" => Ok(CounterId::Forwarded), + "unicast" => Ok(CounterId::Unicast), + "multicastll" | "multicastlinklocal" => Ok(CounterId::MulticastLL), + "egressdropport" => Ok(CounterId::EgressDropPort), + "egressdropreason" => Ok(CounterId::EgressDropReason), #[cfg(feature = "multicast")] x => match x { - "egressdropport" => { - Ok(CounterId::Multicast(MulticastCounterId::EgressDropPort)) - } - "egressdropreason" => Ok(CounterId::Multicast( - MulticastCounterId::EgressDropReason, - )), - "unicast" => { - Ok(CounterId::Multicast(MulticastCounterId::Unicast)) - } "multicast" => { Ok(CounterId::Multicast(MulticastCounterId::Multicast)) } "multicastext" | "multicastexternal" => { Ok(CounterId::Multicast(MulticastCounterId::MulticastExt)) } - "multicastll" | "multicastlinklocal" => { - Ok(CounterId::Multicast(MulticastCounterId::MulticastLL)) - } "multicastul" | "multicastunderlay" => { Ok(CounterId::Multicast(MulticastCounterId::MulticastUL)) } - "multicastdrop" => { - Ok(CounterId::Multicast(MulticastCounterId::MulticastDrop)) - } x => Err(format!("No such counter: {x}")), }, #[cfg(not(feature = "multicast"))] @@ -300,6 +294,7 @@ impl std::str::FromStr for CounterId { } } } + #[cfg(feature = "multicast")] impl fmt::Display for MulticastCounterId { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { @@ -307,14 +302,9 @@ impl fmt::Display for MulticastCounterId { f, "{}", match self { - MulticastCounterId::EgressDropPort => "Egress_Drop_Port", - MulticastCounterId::EgressDropReason => "Egress_Drop_Reason", - MulticastCounterId::Unicast => "Unicast", MulticastCounterId::Multicast => "Multicast", MulticastCounterId::MulticastExt => 
"Multicast_External", - MulticastCounterId::MulticastLL => "Multicast_Link_Local", MulticastCounterId::MulticastUL => "Multicast_Underlay", - MulticastCounterId::MulticastDrop => "Multicast_Drop", } ) } diff --git a/common/src/illumos.rs b/common/src/illumos.rs index 5eca9c2b..095663bf 100644 --- a/common/src/illumos.rs +++ b/common/src/illumos.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/ // -// Copyright 2025 Oxide Computer Company +// Copyright 2026 Oxide Computer Company //! Illumos-specific common modules and operations. diff --git a/dpd-client/tests/integration_tests/common.rs b/dpd-client/tests/integration_tests/common.rs index 3d50f82a..1572f1f2 100644 --- a/dpd-client/tests/integration_tests/common.rs +++ b/dpd-client/tests/integration_tests/common.rs @@ -38,6 +38,8 @@ const SHOW_HEX: u8 = 0x02; /// Admin-local IPv6 multicast prefix (ff04::/16, scope 4). pub const ADMIN_LOCAL_MULTICAST_PREFIX: u16 = 0xFF04; +pub const DEFAULT_TEST_TAG: &str = "test"; +pub const NON_MATCHING_TEST_TAG: &str = "failed"; // Timeout set on `Pcap` objects. 
// @@ -254,7 +256,8 @@ impl Switch { let drain = slog_term::FullFormat::new(decorator).build().fuse(); let drain = slog_async::Async::new(drain).build().fuse(); let log = slog::Logger::root(drain, slog::o!()); - let client_state = ClientState { tag: String::from("test"), log }; + let client_state = + ClientState { tag: String::from(DEFAULT_TEST_TAG), log }; let client = Client::new( &format!("http://{ctrl_host}:{ctrl_port}"), client_state, @@ -1443,7 +1446,9 @@ pub fn gen_arp_reply(src: Endpoint, tgt: Endpoint) -> Packet { pub mod prelude { pub use super::ADMIN_LOCAL_MULTICAST_PREFIX; + pub use super::DEFAULT_TEST_TAG; pub use super::NO_PORT; + pub use super::NON_MATCHING_TEST_TAG; pub use super::PhysPort; pub use super::SERVICE_PORT; pub use super::Switch; diff --git a/dpd-client/tests/integration_tests/loopback.rs b/dpd-client/tests/integration_tests/loopback.rs index c5e90b40..746f015f 100644 --- a/dpd-client/tests/integration_tests/loopback.rs +++ b/dpd-client/tests/integration_tests/loopback.rs @@ -21,13 +21,19 @@ async fn test_api() -> TestResult { switch .client - .loopback_ipv4_create(&Ipv4Entry { tag: "test".into(), addr: lo4 }) + .loopback_ipv4_create(&Ipv4Entry { + tag: DEFAULT_TEST_TAG.into(), + addr: lo4, + }) .await .expect("Should be able to create IPv4 loopback addr"); switch .client - .loopback_ipv6_create(&Ipv6Entry { tag: "test".into(), addr: lo6 }) + .loopback_ipv6_create(&Ipv6Entry { + tag: DEFAULT_TEST_TAG.into(), + addr: lo6, + }) .await .expect("Should be able to create IPv6 loopback addr"); @@ -47,13 +53,19 @@ async fn test_api() -> TestResult { switch .client - .loopback_ipv4_create(&Ipv4Entry { tag: "test".into(), addr: lo4 }) + .loopback_ipv4_create(&Ipv4Entry { + tag: DEFAULT_TEST_TAG.into(), + addr: lo4, + }) .await .expect("IPv4 loopback add should be idempotent"); switch .client - .loopback_ipv6_create(&Ipv6Entry { tag: "test".into(), addr: lo6 }) + .loopback_ipv6_create(&Ipv6Entry { + tag: DEFAULT_TEST_TAG.into(), + addr: lo6, + }) 
.await .expect("IPv6 loopback add should be idempotent"); diff --git a/dpd-client/tests/integration_tests/mcast.rs b/dpd-client/tests/integration_tests/mcast.rs index 1a97c6e2..d1ea4db7 100644 --- a/dpd-client/tests/integration_tests/mcast.rs +++ b/dpd-client/tests/integration_tests/mcast.rs @@ -1710,14 +1710,6 @@ async fn test_ipv6_multicast_invalid_destination_mac() -> TestResult { let ctr_baseline = switch.get_counter("multicast_invalid_mac", None).await.unwrap(); - let port_label_ingress = switch.port_label(ingress).unwrap(); - - // Check the Multicast_Drop counter baseline for the ingress port - let drop_mcast_baseline = switch - .get_counter(&port_label_ingress, Some("multicast_drop")) - .await - .unwrap(); - switch.packet_test(vec![test_pkt], expected_pkts).unwrap(); check_counter_incremented( @@ -1730,17 +1722,6 @@ async fn test_ipv6_multicast_invalid_destination_mac() -> TestResult { .await .unwrap(); - // Verify that the Multicast_Drop counter also incremented - check_counter_incremented( - switch, - &port_label_ingress, - drop_mcast_baseline, - 1, - Some("multicast_drop"), - ) - .await - .unwrap(); - cleanup_test_group(switch, get_group_ip(&created_group), TEST_TAG).await } diff --git a/dpd-client/tests/integration_tests/port_api.rs b/dpd-client/tests/integration_tests/port_api.rs index 98fe8675..069213a7 100644 --- a/dpd-client/tests/integration_tests/port_api.rs +++ b/dpd-client/tests/integration_tests/port_api.rs @@ -519,7 +519,7 @@ async fn test_reset_tag() -> TestResult { addr_compare(vec![ipv4a, ipv4b], l).unwrap(); // Send a reset with the wrong tag, and all the addresses should be set - switch.client.reset_all_tagged("fail").await.unwrap(); + switch.client.reset_all_tagged(NON_MATCHING_TEST_TAG).await.unwrap(); let l = switch .client @@ -539,7 +539,7 @@ async fn test_reset_tag() -> TestResult { addr_compare(vec![ipv4a, ipv4b], l).unwrap(); // Send a reset with the correct tag, and all the addresses should be gone - 
switch.client.reset_all_tagged("test").await.unwrap(); + switch.client.reset_all_tagged(DEFAULT_TEST_TAG).await.unwrap(); assert!( switch .client diff --git a/dpd-client/tests/integration_tests/route_ipv4.rs b/dpd-client/tests/integration_tests/route_ipv4.rs index 1ef036b4..62d79521 100644 --- a/dpd-client/tests/integration_tests/route_ipv4.rs +++ b/dpd-client/tests/integration_tests/route_ipv4.rs @@ -16,6 +16,7 @@ use crate::integration_tests::common::prelude::*; use packet::Endpoint; use packet::eth::EthQHdr; +use dpd_client::ClientInfo; use dpd_client::types; struct Router { @@ -37,7 +38,7 @@ impl Router { port_id, link_id, tgt_ip: self.ip.parse().unwrap(), - tag: "testing".into(), + tag: switch.client.inner().tag.clone(), vlan_id: self.vlan, } } @@ -346,7 +347,11 @@ async fn test_reset() -> TestResult { .into_inner(); assert_eq!(routes.items.len(), 3); - switch.client.reset_all_tagged("failed").await.unwrap(); + switch + .client + .reset_all_tagged(common::NON_MATCHING_TEST_TAG) + .await + .unwrap(); let routes = switch .client .route_ipv4_list(Some(limit), None) @@ -355,7 +360,7 @@ async fn test_reset() -> TestResult { .into_inner(); assert_eq!(routes.items.len(), 3); - switch.client.reset_all_tagged("test").await.unwrap(); + switch.client.reset_all_tagged(common::DEFAULT_TEST_TAG).await.unwrap(); let routes = switch .client .route_ipv4_list(Some(limit), None) @@ -720,28 +725,28 @@ async fn test_multipath_mixed_delete() -> TestResult { let (port_13, link_13) = switch.link_id(PhysPort(13)).unwrap(); let v4_a = types::Ipv4Route { - tag: "testing".into(), + tag: common::DEFAULT_TEST_TAG.into(), port_id: port_10, link_id: link_10, tgt_ip: "203.0.47.1".parse().unwrap(), vlan_id: None, }; let v6_b = types::Ipv6Route { - tag: "testing".into(), + tag: common::DEFAULT_TEST_TAG.into(), port_id: port_11, link_id: link_11, tgt_ip: "fe80::1".parse().unwrap(), vlan_id: None, }; let v4_c = types::Ipv4Route { - tag: "testing".into(), + tag: common::DEFAULT_TEST_TAG.into(), 
port_id: port_12, link_id: link_12, tgt_ip: "203.0.22.1".parse().unwrap(), vlan_id: None, }; let v6_d = types::Ipv6Route { - tag: "testing".into(), + tag: common::DEFAULT_TEST_TAG.into(), port_id: port_13, link_id: link_13, tgt_ip: "fe80::2".parse().unwrap(), diff --git a/dpd-client/tests/integration_tests/route_ipv6.rs b/dpd-client/tests/integration_tests/route_ipv6.rs index 923247c8..7f5f0c6b 100644 --- a/dpd-client/tests/integration_tests/route_ipv6.rs +++ b/dpd-client/tests/integration_tests/route_ipv6.rs @@ -16,6 +16,7 @@ use crate::integration_tests::common; use crate::integration_tests::common::prelude::*; use packet::eth::EthQHdr; +use dpd_client::ClientInfo; use dpd_client::types; #[derive(Debug)] @@ -38,7 +39,7 @@ impl Router { port_id, link_id, tgt_ip: self.ip.parse().unwrap(), - tag: "testing".into(), + tag: switch.client.inner().tag.clone(), vlan_id: self.vlan, } } @@ -536,7 +537,11 @@ async fn test_reset() -> TestResult { .into_inner(); assert_eq!(routes.items.len(), 3); - switch.client.reset_all_tagged("failed").await.unwrap(); + switch + .client + .reset_all_tagged(common::NON_MATCHING_TEST_TAG) + .await + .unwrap(); let routes = switch .client .route_ipv6_list(Some(limit), None) @@ -545,7 +550,7 @@ async fn test_reset() -> TestResult { .into_inner(); assert_eq!(routes.items.len(), 3); - switch.client.reset_all_tagged("test").await.unwrap(); + switch.client.reset_all_tagged(common::DEFAULT_TEST_TAG).await.unwrap(); let routes = switch .client .route_ipv6_list(Some(limit), None) @@ -568,7 +573,7 @@ async fn test_create_and_set_semantics_v6() -> TestResult { port_id, link_id, tgt_ip: "fe80::1701:d:2000:47".parse().unwrap(), - tag: "testing".into(), + tag: common::DEFAULT_TEST_TAG.into(), vlan_id: None, }; @@ -600,6 +605,7 @@ async fn test_create_and_set_semantics_v6() -> TestResult { assert_eq!(rt.len(), 1); assert_eq!(rt[0].tgt_ip, target33.tgt_ip); + switch.client.reset_all_tagged(common::DEFAULT_TEST_TAG).await.unwrap(); Ok(()) } @@ -710,3 +716,76 
@@ async fn skip_test_multipath_traffic_vlan() -> TestResult { } Ok(()) } + +// IPv6 prefixes drive TTL=1 handling per-prefix via the `skip_ttl` bit +// on the index action. Mixed ECMP target sets (service port + normal +// egress port) would cause hash-selected non-service targets to skip +// the dataplane TTL exception, so dpd rejects them at the API. +#[tokio::test] +#[ignore] +async fn test_mixed_service_port_ecmp_rejected_v6() -> TestResult { + let switch = &*get_switch().await; + let client = &switch.client; + + let cidr: Ipv6Net = "fd00:1122:3344:0500::/64".parse().unwrap(); + let (svc_port_id, svc_link_id) = switch.link_id(SERVICE_PORT).unwrap(); + let (normal_port_id, normal_link_id) = + switch.link_id(PhysPort(11)).unwrap(); + + let svc_target = types::Ipv6Route { + port_id: svc_port_id, + link_id: svc_link_id, + tgt_ip: "fd00:1122:7788:0101::4".parse().unwrap(), + tag: common::DEFAULT_TEST_TAG.into(), + vlan_id: None, + }; + let normal_target = types::Ipv6Route { + port_id: normal_port_id, + link_id: normal_link_id, + tgt_ip: "fd00:1122:7788:0102::4".parse().unwrap(), + tag: common::DEFAULT_TEST_TAG.into(), + vlan_id: None, + }; + + // Service-port-only set is fine. + client.route_ipv6_set(&build_route_add(cidr, &svc_target)).await?; + + // Adding a normal target on top would mix the set. Expect a 4xx. + let err = client + .route_ipv6_add(&build_route_add(cidr, &normal_target)) + .await + .expect_err("mixed ECMP set should be rejected"); + let dpd_client::Error::ErrorResponse(inner) = err else { + panic!("expected an error response, got: {err:?}"); + }; + assert!( + inner.status().is_client_error(), + "expected 4xx, got {}", + inner.status() + ); + + // The reverse direction is also rejected. Replace the service-port route + // with a normal-port one first. 
+ client + .route_ipv6_set(&types::Ipv6RouteUpdate { + cidr, + target: normal_target.clone(), + replace: true, + }) + .await?; + let err = client + .route_ipv6_add(&build_route_add(cidr, &svc_target)) + .await + .expect_err("mixed ECMP set should be rejected"); + let dpd_client::Error::ErrorResponse(inner) = err else { + panic!("expected an error response, got: {err:?}"); + }; + assert!( + inner.status().is_client_error(), + "expected 4xx, got {}", + inner.status() + ); + + switch.client.reset_all_tagged(common::DEFAULT_TEST_TAG).await.unwrap(); + Ok(()) +} diff --git a/dpd-client/tests/integration_tests/table_tests.rs b/dpd-client/tests/integration_tests/table_tests.rs index f764ad15..11dfe564 100644 --- a/dpd-client/tests/integration_tests/table_tests.rs +++ b/dpd-client/tests/integration_tests/table_tests.rs @@ -41,19 +41,11 @@ use crate::integration_tests::common::prelude::*; // code. If the table size appears to change dramatically, that's worth // investigating. If it only changes by an entry or two, it's fine to just // adjust the constant below to match the observed result. -// -// TODO: Multicast drops IPv4 LPM capacity to 7164 (from 8187) due to -// ingress TCAM pressure. Investigate moving MulticastRouter4/6 into the -// egress pipeline to reclaim capacity. -#[cfg(feature = "multicast")] -const IPV4_LPM_SIZE: usize = 7164; // ipv4 forwarding table -#[cfg(not(feature = "multicast"))] const IPV4_LPM_SIZE: usize = 8191; // ipv4 forwarding table - -#[cfg(feature = "multicast")] -const IPV6_LPM_SIZE: usize = 1023; // ipv6 forwarding table -#[cfg(not(feature = "multicast"))] -const IPV6_LPM_SIZE: usize = 8191; // ipv6 forwarding table +// Native v6 LPM (cuckoo hash) caps a few entries below the P4-declared +// IPV6_LPM_SIZE = 8192. The lookup table is sized without padding to keep +// the ingress stage budget. Effective hardware capacity is 8188. 
+const IPV6_LPM_SIZE: usize = 8188; // ipv6 forwarding table const SWITCH_IPV4_ADDRS_SIZE: usize = 511; // ipv4 addrs assigned to our ports const SWITCH_IPV6_ADDRS_SIZE: usize = 511; // ipv6 addrs assigned to our ports @@ -64,7 +56,7 @@ const IPV6_NEIGHBOR_SIZE: usize = 512; // ipv6 neighbor cache /// The size of the multicast table related to replication on /// admin-local (internal) multicast groups. #[cfg(feature = "multicast")] -const MULTICAST_TABLE_SIZE: usize = 1024; +const MCAST_REPLICATION_IPV6_SIZE: usize = 2048; #[cfg(feature = "multicast")] const MCAST_TAG: &str = "mcast_table_test"; // multicast group tag @@ -546,6 +538,6 @@ async fn test_multicast_replication_table_full() -> TestResult { MulticastReplicationTableTest, types::MulticastGroupUnderlayResponse, (), - >(MULTICAST_TABLE_SIZE) + >(MCAST_REPLICATION_IPV6_SIZE) .await } diff --git a/dpd/p4/constants.p4 b/dpd/p4/constants.p4 index 7f6584ff..018fb4ea 100644 --- a/dpd/p4/constants.p4 +++ b/dpd/p4/constants.p4 @@ -4,24 +4,57 @@ // // Copyright 2026 Oxide Computer Company -const bit<16> L2_ISOLATED_FLAG = 0x8000; +// Multicast MAC prefixes per RFC 1112 and RFC 2464. +const bit<24> IPV4_MCAST_MAC_PREFIX = 0x01005e; +const bit<16> IPV6_MCAST_MAC_PREFIX = 0x3333; // TODO: these all need to be bigger. Early experimentation is showing that this // is going to need to come either through ATCAM/ALPM or code restructuring. const int IPV4_NAT_TABLE_SIZE = 1024; // nat routing table const int IPV6_NAT_TABLE_SIZE = 1024; // nat routing table const int IPV4_LPM_SIZE = 8192; // ipv4 forwarding table -#ifdef MULTICAST -const int IPV6_LPM_SIZE = 1024; // ipv6 forwarding table -#else const int IPV6_LPM_SIZE = 8192; // ipv6 forwarding table -#endif +// v4 compound TTL key (forward + ttl_exceeded) keeps per-target TTL +// dispatch for mixed-ECMP support. v6 inlines TTL=1 to save a stage and +// uses 1 entry per logical route. 
+const int FWD_ENTRIES_PER_ROUTE_V4 = 2; const int IPV4_ARP_SIZE = 512; // arp cache const int IPV6_NEIGHBOR_SIZE = 512; // ipv6 neighbor cache const int SWITCH_IPV4_ADDRS_SIZE = 512; // ipv4 addrs assigned to our ports const int SWITCH_IPV6_ADDRS_SIZE = 512; // ipv6 addrs assigned to our ports -const int IPV4_MULTICAST_TABLE_SIZE = 1024; // multicast routing table(s) for IPv4 -const int IPV6_MULTICAST_TABLE_SIZE = 1024; // multicast routing table(s) for IPv6 +#ifdef MULTICAST +// Per-table sizes. Each mcast lookup is sized to its own workload rather +// than a shared global ceiling, letting p4c's table placement co-locate +// small lookups with other tables in the same stage. +// +// Every overlay group has a 1:1 mapping to an underlay group (see RFD 488, +// section MRIB Population). Overlay addresses are v4 or v6, underlay addresses +// are always v6 in the admin-local scope (ff04::/16). Omicron allocates +// underlay groups from ff04::/64 within that scope. The ingress tables key on +// the overlay address, the replication table keys on the underlay address. +// +// Today's workloads are predominantly v4 overlay multicast. The v6 ingress +// and router tables are sized for symmetry and for the admin-scoped +// overlay-to-underlay mapping, not against a v6 customer workload model. +// MCAST_REPLICATION_IPV6_SIZE bounds the number of distinct underlay v6 +// destination addresses installable in mcast_replication_ipv6. The action +// hands the packet to the Tofino PRE (mcast_grp_a/b, rid, exclusion ids), +// which performs the actual replication. 
+const int INGRESS_IPV4_MCAST_SIZE = 2048; // v4 overlay NAT-encap lookup +const int INGRESS_IPV6_MCAST_SIZE = 2048; // v6 overlay NAT-encap lookup +const int MCAST_ROUTER_IPV4_SIZE = 2048; // v4 route table, matches ingress +const int MCAST_ROUTER_IPV6_SIZE = 2048; // v6 route table, matches ingress +const int MCAST_REPLICATION_IPV6_SIZE = 2048; // underlay v6 groups (PRE replicates) +// Source filter holds (src, dst) pairs. "Any source" groups (ASM with no +// specific sources) consume 1 entry (/0 wildcard). Groups with a specific +// source list (SSM, or ASM with INCLUDE-mode sources) consume one entry +// per source, capped at MAX_SSM_SOURCE_IPS = 32 (omicron policy). 512 +// entries fit 16 fully-saturated groups or many hundreds of typical-mix +// groups. +const int MCAST_SOURCE_FILTER_IPV4_SIZE = 512; // v4 source filtering +const int MCAST_SOURCE_FILTER_IPV6_SIZE = 512; // v6 source filtering +const int MCAST_DECAP_PORTS_SIZE = 2048; // egress decap-port bitmap +#endif /* MULTICAST */ const int ATTACHED_SUBNETS_V4_SIZE = 512; // external subnets mapped to instances const int ATTACHED_SUBNETS_V6_SIZE = 512; // external subnets mapped to instances @@ -55,17 +88,26 @@ const bit<8> SVC_COUNTER_INBOUND_LL = 5; const bit<8> SVC_COUNTER_PASS = 6; const bit<32> SVC_COUNTER_MAX = 7; +#ifdef MULTICAST /* Encapped Multicast Tags */ const bit<2> MULTICAST_TAG_EXTERNAL = 0; const bit<2> MULTICAST_TAG_UNDERLAY = 1; const bit<2> MULTICAST_TAG_UNDERLAY_EXTERNAL = 2; +const bit<2> MULTICAST_TAG_INVALID = 3; // Sentinel for missing/invalid header +#endif /* MULTICAST */ +/* IPv6 multicast scope constants (16-bit prefix for parser select) */ +const bit<16> IPV6_INTERFACE_LOCAL_16 = 0xff01; // ff01::/16 +const bit<16> IPV6_LINK_LOCAL_16 = 0xff02; // ff02::/16 + +#ifdef MULTICAST /* IPv6 Address Mask and Pattern Constants */ // Reserved underlay multicast subnet (ff04::/64). This /64 within admin-local // scope is reserved for internal underlay multicast allocation.
Customer // external groups may use other admin-local /64s (e.g., ff04:0:0:1::/64). const bit<128> IPV6_UNDERLAY_MASK = 0xffffffffffffffff0000000000000000; // /64 prefix mask const bit<128> IPV6_UNDERLAY_MULTICAST_PATTERN = 0xff040000000000000000000000000000; // ff04::/64 +#endif /* MULTICAST */ /* Reasons a packet may be dropped by the p4 pipeline */ const bit<8> DROP_IPV4_SWITCH_ADDR_MISS = 0x01; @@ -86,15 +128,21 @@ const bit<8> DROP_IPV4_UNROUTEABLE = 0x0F; const bit<8> DROP_IPV6_UNROUTEABLE = 0x10; const bit<8> DROP_NAT_INGRESS_MISS = 0x11; const bit<8> DROP_NAT_EGRESS_BLOCKED = 0x12; -const bit<8> DROP_MULTICAST_NO_GROUP = 0x13; const bit<8> DROP_MULTICAST_INVALID_MAC = 0x14; -const bit<8> DROP_MULTICAST_CPU_COPY = 0x15; -const bit<8> DROP_MULTICAST_SOURCE_FILTERED = 0x16; -const bit<8> DROP_MULTICAST_PATH_FILTERED = 0x17; const bit<8> DROP_GENEVE_OPTIONS_TOO_LONG = 0x18; const bit<8> DROP_GENEVE_OPTION_MALFORMED = 0x19; const bit<8> DROP_GENEVE_OPTION_UNKNOWN = 0x1A; const bit<8> DROP_SCTP = 0x1B; + +#ifdef MULTICAST +/* Multicast-only drop reasons. Codes 0x13 and 0x15-0x17 are skipped without + * MULTICAST. Values are preserved across builds to keep drop codes stable. + */ +const bit<8> DROP_MULTICAST_NO_GROUP = 0x13; +const bit<8> DROP_MULTICAST_CPU_COPY = 0x15; +const bit<8> DROP_MULTICAST_SOURCE_FILTERED = 0x16; +const bit<8> DROP_MULTICAST_PATH_FILTERED = 0x17; +#endif /* MULTICAST */ + // MAX(DROP_xxx) + 1 const bit<32> DROP_REASON_MAX = 0x1C; - diff --git a/dpd/p4/headers.p4 b/dpd/p4/headers.p4 index 48eb6c60..7ea534b2 100644 --- a/dpd/p4/headers.p4 +++ b/dpd/p4/headers.p4 @@ -2,7 +2,7 @@ // License, v. 2.0. 
If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/ // -// Copyright 2025 Oxide Computer Company +// Copyright 2026 Oxide Computer Company const bit<16> ETHERTYPE_IPV4 = 0x0800; const bit<16> ETHERTYPE_ARP = 0x0806; @@ -185,7 +185,6 @@ header geneve_opt_mcast_h { bit<30> reserved; } - header geneve_opt_mss_h { bit<32> mss; } diff --git a/dpd/p4/metadata.p4 b/dpd/p4/metadata.p4 index c609a269..b0fd2a22 100644 --- a/dpd/p4/metadata.p4 +++ b/dpd/p4/metadata.p4 @@ -4,12 +4,98 @@ // // Copyright 2026 Oxide Computer Company +// Guard against compiler bug: RemoveMetadataInits strips explicit `= false` +// initializations, assuming parser will zero-init the PHV container. +// ComputeInitZeroContainers only marks containers for zero-init if the field +// is actually used in the parser, not just initialized. These assumptions are +// incompatible: fields initialized but only used in MAU get stale data. +// See: https://github.com/oxidecomputer/tofino-p4c/blob/ry/upstream-merge/rydocs/tofino-metadata-corruption.md +@pa_no_init("ingress", "meta.service_routed") +@pa_no_init("ingress", "meta.nat_egress_hit") +@pa_no_init("ingress", "meta.nat_ingress_hit") +@pa_no_init("ingress", "meta.uplink_ingress") +@pa_no_init("ingress", "meta.encap_needed") +@pa_no_init("ingress", "meta.icmp_recalc") +@pa_no_init("ingress", "meta.allow_source_mcast") +@pa_no_init("ingress", "meta.resolve_nexthop") +@pa_no_init("ingress", "meta.nexthop_is_v6") +@pa_no_init("ingress", "meta.route_ttl_is_1") +@pa_no_init("ingress", "meta.skip_ttl_check") +// These fields are set in the parser on some paths but not all. On paths +// that skip the set, the field is init-only and vulnerable. +@pa_no_init("ingress", "meta.is_switch_address") +@pa_no_init("ingress", "meta.is_link_local_mcastv6") + +// Force fields out of mocha containers into normal containers. 
Mocha containers +// only support whole-container-set operations, so isolated fields can have +// their other bits corrupted by stale data from previous packets. +// +// Without these pragmas the compiler may pack small metadata fields into mocha +// containers alongside unrelated fields. A whole-container write to one field +// then clobbers the others. The risk is highest for 1-bit booleans and fields +// with long live-range gaps between set and use. +// +// Both builds share ipv4_checksum_err: confirmed allocated to mocha MH0 where +// it shared a container with pkt_type, risking false checksum-error drops. +@pa_container_type("ingress", "meta.ipv4_checksum_err", "normal") + +// 1-bit ingress booleans: high risk of mocha packing. The compiler can +// pack up to 8 booleans into a single 8-bit mocha container, and a +// whole-container write to any one clobbers the rest. +@pa_container_type("ingress", "meta.dropped", "normal") +@pa_container_type("ingress", "meta.is_switch_address", "normal") +@pa_container_type("ingress", "meta.is_mcast", "normal") +@pa_container_type("ingress", "meta.allow_source_mcast", "normal") +@pa_container_type("ingress", "meta.is_link_local_mcastv6", "normal") +@pa_container_type("ingress", "meta.service_routed", "normal") +@pa_container_type("ingress", "meta.nat_egress_hit", "normal") +@pa_container_type("ingress", "meta.nat_ingress_hit", "normal") +@pa_container_type("ingress", "meta.uplink_ingress", "normal") +@pa_container_type("ingress", "meta.encap_needed", "normal") +@pa_container_type("ingress", "meta.resolve_nexthop", "normal") +@pa_container_type("ingress", "meta.route_ttl_is_1", "normal") +@pa_container_type("ingress", "meta.skip_ttl_check", "normal") +@pa_container_type("ingress", "meta.nexthop_is_v6", "normal") +@pa_container_type("ingress", "meta.icmp_recalc", "normal") + +// Wider ingress fields used by NAT encapsulation, checksum computation, +// and routing. 
Protected in both builds to avoid relying on incidental +// co-location with deparsed fields, which is fragile across compiler +// versions and PHV pressure changes. +@pa_container_type("ingress", "meta.drop_reason", "normal") +@pa_container_type("ingress", "meta.l4_src_port", "normal") +@pa_container_type("ingress", "meta.l4_dst_port", "normal") +@pa_container_type("ingress", "meta.nat_ingress_tgt", "normal") +@pa_container_type("ingress", "meta.nat_geneve_vni", "normal") +@pa_container_type("ingress", "meta.nat_inner_mac", "normal") +@pa_container_type("ingress", "meta.icmp_csum", "normal") +@pa_container_type("ingress", "meta.body_checksum", "normal") +@pa_container_type("ingress", "meta.orig_src_mac", "normal") +@pa_container_type("ingress", "meta.orig_src_ipv4", "normal") +@pa_container_type("ingress", "meta.nat_ingress_csum", "normal") +@pa_container_type("ingress", "meta.nexthop", "normal") + +// Egress bridge header fields crossing the ingress/egress boundary. +@pa_container_type("egress", "meta.bridge_hdr.ingress_port", "normal") +@pa_container_type("egress", "meta.bridge_hdr.is_mcast_routed", "normal") +@pa_container_type("egress", "meta.bridge_hdr.nat_egress_hit", "normal") +// Egress drop_reason is used in the final drop/forward decision in both +// builds. In the MULTICAST build, additional egress fields are set by +// multicast table actions and consumed later in the pipeline. +@pa_container_type("egress", "meta.drop_reason", "normal") +@pa_container_type("egress", "meta.vlan_id", "normal") +@pa_container_type("egress", "meta.port_number", "normal") +@pa_container_type("egress", "meta.ipv4_checksum_recalc", "normal") + /* Flexible bridge header for passing metadata between ingress and egress * pipelines. 
*/ @flexible header bridge_h { - PortId_t ingress_port; + PortId_t ingress_port; // 9 bits + bool is_mcast_routed; // 1 bit: packet was routed to multicast (PRE) + bool nat_egress_hit; // 1 bit: NAT egress matched, check egress filter + bit<5> reserved; // 5 bits: padding to 16-bit boundary } struct sidecar_ingress_meta_t { @@ -25,8 +111,10 @@ struct sidecar_ingress_meta_t { bool uplink_ingress; // Packet arrived on an uplink port bool encap_needed; bool resolve_nexthop; // signals nexthop needs to be resolved - ipv4_addr_t nexthop_ipv4; // ip address of next router - ipv6_addr_t nexthop_ipv6; // ip address of next router + bool route_ttl_is_1; // TTL/hop_limit equals 1 (for route lookup) + bool skip_ttl_check; // skip TTL=1 exception (service-port route) + bool nexthop_is_v6; // true when nexthop is IPv6 + ipv6_addr_t nexthop; // next hop address; IPv4 uses low bits bit<10> pkt_type; bit<8> drop_reason; // reason a packet was dropped bit<16> l4_src_port; // tcp or udp destination port @@ -74,41 +162,11 @@ struct sidecar_egress_meta_t { bit<8> port_number; // Port number for the outgoing port (0-255) } -struct route4_result_t { - /* - * The result of the multistage route selection process is an egress - * port and a nexthop address - */ - ipv4_addr_t nexthop; - ipv6_addr_t nexthop6; - PortId_t port; - - /* Did we successfully look up the route in the table? */ - bool is_hit; - bool is_v6; - - /* - * A hash of the (address,port) fields, which is used to choose between - * multiple potential routes. 
- */ - bit<8> ecmp_hash; - - /* Index into the target table of the first potential route */ - bit<16> idx; - /* Number of consecutive slots containing potential routes */ - bit<8> slots; - /* Which of those routes we should select, based the flow hash */ - bit<16> slot; -} - -struct route6_result_t { - /* - * The result of the multistage route selection process is an egress - * port and a nexthop address - */ - ipv6_addr_t nexthop; - PortId_t port; - +// Unified route result struct for both Router4 and Router6. +// A single instance is allocated in L3Router and passed to both +// controls, forcing the compiler to use the same PHV allocation +// and preventing live-range divergence under high PHV pressure. +struct route_result_t { /* Did we successfully look up the route in the table? */ bool is_hit; diff --git a/dpd/p4/parser.p4 b/dpd/p4/parser.p4 index 3a2618d0..38d80a9b 100644 --- a/dpd/p4/parser.p4 +++ b/dpd/p4/parser.p4 @@ -43,8 +43,8 @@ parser IngressParser( meta.l4_dst_port = 0; meta.l4_length = 0; meta.body_checksum = 0; - meta.nexthop_ipv4 = 0; - meta.nexthop_ipv6 = 0; + meta.nexthop = 0; + meta.nexthop_is_v6 = false; meta.orig_src_mac = 0; meta.orig_src_ipv4 = 0; meta.orig_dst_ipv4 = 0; @@ -52,9 +52,12 @@ parser IngressParser( meta.drop_reason = 0; meta.nat_ingress_csum = 0; meta.resolve_nexthop = false; + meta.route_ttl_is_1 = false; meta.bridge_hdr.setValid(); meta.bridge_hdr.ingress_port = ig_intr_md.ingress_port; + meta.bridge_hdr.is_mcast_routed = false; + meta.bridge_hdr.reserved = 0; transition port_metadata; } @@ -246,8 +249,8 @@ parser IngressParser( }); transition select(hdr.ipv6.dst_addr[127:112]) { - 16w0xff01: drop_interface_local_mcast; - 16w0xff02: set_link_local_mcast; + IPV6_INTERFACE_LOCAL_16: drop_interface_local_mcast; + IPV6_LINK_LOCAL_16: set_link_local_mcast; default: check_ipv6_mcast; } } @@ -538,7 +541,6 @@ parser EgressParser( state meta_init { meta.drop_reason = 0; - meta.bridge_hdr.setInvalid(); meta.decap_ports_0 = 0; 
meta.decap_ports_1 = 0; @@ -554,14 +556,12 @@ parser EgressParser( meta.vlan_id = 0; meta.port_number = 0; - transition parse_bridge_hdr; } state parse_bridge_hdr { pkt.extract(bridge_hdr); meta.bridge_hdr = bridge_hdr; - meta.bridge_hdr.setValid(); transition parse_ethernet; } @@ -591,8 +591,6 @@ parser EgressParser( pkt.extract(hdr.ipv4); transition select(hdr.ipv4.protocol) { - IPPROTO_ICMP: parse_icmp; - IPPROTO_TCP: parse_tcp; IPPROTO_UDP: parse_udp; default: accept; } @@ -602,24 +600,11 @@ parser EgressParser( pkt.extract(hdr.ipv6); transition select(hdr.ipv6.next_hdr) { - IPPROTO_TCP: parse_tcp; IPPROTO_UDP: parse_udp; default: accept; } } - state parse_icmp { - pkt.extract(hdr.icmp); - - transition accept; - } - - state parse_tcp { - pkt.extract(_); - - transition accept; - } - state parse_udp { pkt.extract(hdr.udp); diff --git a/dpd/p4/route_selector.p4 b/dpd/p4/route_selector.p4 index 81590c72..1cc20eb1 100644 --- a/dpd/p4/route_selector.p4 +++ b/dpd/p4/route_selector.p4 @@ -2,8 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/ // -// Copyright 2025 Oxide Computer Company - +// Copyright 2026 Oxide Computer Company action set_slot(bit<8> slot) { res.slot = (bit<16>) slot; diff --git a/dpd/p4/sidecar.p4 b/dpd/p4/sidecar.p4 index 07a694ae..58f543ad 100644 --- a/dpd/p4/sidecar.p4 +++ b/dpd/p4/sidecar.p4 @@ -48,6 +48,22 @@ const bit<9> USER_SPACE_SERVICE_PORT = 192; #define IPV6_FIELDS \ hdr.inner_ipv6 +// Common setup for sending ICMP error responses to user space. +// +// Sets up the sidecar header and routes to USER_SPACE_SERVICE_PORT. +// Callers should set meta.drop_reason and call counters as needed. 
+#define ICMP_ERROR_SETUP(type, code) \ + hdr.sidecar.sc_code = SC_ICMP_NEEDED; \ + hdr.sidecar.sc_pad = 0; \ + hdr.sidecar.sc_ingress = (bit<16>)ig_intr_md.ingress_port; \ + hdr.sidecar.sc_egress = (bit<16>)ig_tm_md.ucast_egress_port; \ + hdr.sidecar.sc_ether_type = hdr.ethernet.ether_type; \ + hdr.sidecar.sc_payload = (bit<128>)(type) << 8 | (bit<128>)(code); \ + hdr.sidecar.setValid(); \ + hdr.ethernet.ether_type = ETHERTYPE_SIDECAR; \ + meta.service_routed = true; \ + ig_tm_md.ucast_egress_port = USER_SPACE_SERVICE_PORT + // This control handles the calculation of Layer 4 payload length // by subtracting the IPv4 header length from the total packet length. // @@ -105,10 +121,6 @@ control Filter( ) { DirectCounter>(CounterType_t.PACKETS_AND_BYTES) ipv4_ctr; DirectCounter>(CounterType_t.PACKETS_AND_BYTES) ipv6_ctr; -#ifdef MULTICAST - Counter, PortId_t>(512, CounterType_t.PACKETS) drop_mcast_ctr; - bit<16> mcast_scope; -#endif /* MULTICAST */ action dropv4() { meta.drop_reason = DROP_IPV4_SWITCH_ADDR_MISS; @@ -122,11 +134,6 @@ control Filter( ipv6_ctr.count(); } - action drop_bad_mac() { - meta.drop_reason = DROP_MULTICAST_INVALID_MAC; - meta.dropped = true; - } - action claimv4() { meta.is_switch_address = true; ipv4_ctr.count(); @@ -137,7 +144,6 @@ control Filter( ipv6_ctr.count(); } - // Table of the IPv4 addresses assigned to ports on the switch. table switch_ipv4_addr { key = { @@ -203,16 +209,15 @@ control Filter( if (mac_byte4 != (bit<8>)ipv4_lower7 || mac_byte5 != ipv4_byte3 || mac_byte6 != ipv4_byte4) { - drop_bad_mac(); - drop_mcast_ctr.count(ig_intr_md.ingress_port); + meta.drop_reason = DROP_MULTICAST_INVALID_MAC; + meta.dropped = true; return; } - } else { + } else +#endif /* MULTICAST */ + { switch_ipv4_addr.apply(); } -#else /* MULTICAST */ - switch_ipv4_addr.apply(); -#endif /* MULTICAST */ } else if (hdr.ipv6.isValid()) { #ifdef MULTICAST if (meta.is_mcast) { @@ -228,8 +233,8 @@ control Filter( // registers on the device. 
if (hdr.ethernet.dst_mac[47:40] != 8w0x33 || hdr.ethernet.dst_mac[39:32] != 8w0x33) { - drop_bad_mac(); - drop_mcast_ctr.count(ig_intr_md.ingress_port); + meta.drop_reason = DROP_MULTICAST_INVALID_MAC; + meta.dropped = true; return; } @@ -242,14 +247,14 @@ control Filter( hdr.ethernet.dst_mac[23:16] != hdr.ipv6.dst_addr[23:16] || hdr.ethernet.dst_mac[15:8] != hdr.ipv6.dst_addr[15:8] || hdr.ethernet.dst_mac[7:0] != hdr.ipv6.dst_addr[7:0]) { - drop_bad_mac(); - drop_mcast_ctr.count(ig_intr_md.ingress_port); + meta.drop_reason = DROP_MULTICAST_INVALID_MAC; + meta.dropped = true; return; } } #endif /* MULTICAST */ - if (!meta.is_mcast || meta.is_link_local_mcastv6 && !meta.encap_needed) { + if (!meta.is_mcast || (meta.is_link_local_mcastv6 && !meta.encap_needed)) { switch_ipv6_addr.apply(); } } @@ -316,8 +321,11 @@ control Services( // sidecar tag, which indicates which port the request arrived on. action forward_to_userspace() { hdr.sidecar.sc_code = SC_FWD_TO_USERSPACE; + hdr.sidecar.sc_pad = 0; hdr.sidecar.sc_ingress = (bit<16>)ig_intr_md.ingress_port; + hdr.sidecar.sc_egress = 0; hdr.sidecar.sc_ether_type = hdr.ethernet.ether_type; + hdr.sidecar.sc_payload = 0; hdr.sidecar.setValid(); hdr.ethernet.ether_type = ETHERTYPE_SIDECAR; meta.service_routed = true; @@ -350,6 +358,7 @@ control Services( // packets always go to the port indicated by the sidecar header. action mcast_inbound_link_local() { hdr.sidecar.sc_code = SC_FWD_TO_USERSPACE; + hdr.sidecar.sc_pad = 0; hdr.sidecar.sc_ingress = (bit<16>)ig_intr_md.ingress_port; hdr.sidecar.sc_egress = (bit<16>)ig_tm_md.ucast_egress_port; hdr.sidecar.sc_ether_type = hdr.ethernet.ether_type; @@ -415,11 +424,21 @@ control Services( } apply { + // TODO: Could probably be simplified by hoisting the + // switch_ipv{4,6}_addr lookup earlier and dropping non-NAT + // switch-addressed packets in NatIngress. 
Not currently done because + // the current fall-through to service.apply() is how the control plane + // traffic to the switch addresses reaches userspace. if (meta.is_switch_address && hdr.geneve.isValid() && hdr.geneve.vni != 0) { meta.nat_egress_hit = true; } else { service.apply(); + // Detect link-local multicast for packets forwarded from userspace. + if (meta.service_routed && hdr.ipv6.isValid() && + hdr.ipv6.dst_addr[127:112] == IPV6_LINK_LOCAL_16) { + meta.is_link_local_mcastv6 = true; + } } } } @@ -620,7 +639,7 @@ control NatIngress ( hdr.vlan.vlan_id : exact; } actions = { mcast_forward_ipv4_to; } - const size = IPV4_MULTICAST_TABLE_SIZE; + const size = INGRESS_IPV4_MCAST_SIZE; counters = mcast_ipv4_ingress_ctr; } @@ -643,7 +662,7 @@ control NatIngress ( hdr.vlan.vlan_id : exact; } actions = { mcast_forward_ipv6_to; } - const size = IPV6_MULTICAST_TABLE_SIZE; + const size = INGRESS_IPV6_MCAST_SIZE; counters = mcast_ipv6_ingress_ctr; } #endif /* MULTICAST */ @@ -716,33 +735,28 @@ control NatIngress ( apply { icmp_dst_port.apply(); - // Note: This whole conditional could be simpler as a set of */ + // Note: This whole conditional could be simpler as a set of // `const entries`, but apply (on tables) cannot be called from actions -#ifdef MULTICAST if (hdr.ipv4.isValid()) { +#ifdef MULTICAST if (meta.is_mcast) { ingress_ipv4_mcast.apply(); - } else if (!meta.encap_needed) { + } else +#endif /* MULTICAST */ + if (!meta.encap_needed) { ingress_ipv4.apply(); } } else if (hdr.ipv6.isValid()) { - // If this is a multicast packet and not a link-local multicast, - // we need to check the multicast table +#ifdef MULTICAST + // If multicast and not link-local, check the multicast table if (meta.is_mcast && !meta.is_link_local_mcastv6) { ingress_ipv6_mcast.apply(); - } else { + } else +#endif /* MULTICAST */ + { ingress_ipv6.apply(); } } -#else /* MULTICAST */ - if (hdr.ipv4.isValid()) { - if (!meta.encap_needed) { - ingress_ipv4.apply(); - } - } else if 
(hdr.ipv6.isValid()) { - ingress_ipv6.apply(); - } -#endif /* MULTICAST */ if (ingress_hit.apply().hit) { if (hdr.ipv4.isValid()) { @@ -916,7 +930,10 @@ control NatEgress ( control RouterLookupIndex6( inout sidecar_headers_t hdr, - inout route6_result_t res + inout sidecar_ingress_meta_t meta, + in ingress_intrinsic_metadata_t ig_intr_md, + inout ingress_intrinsic_metadata_for_tm_t ig_tm_md, + inout route_result_t res ) { DirectCounter>(CounterType_t.PACKETS_AND_BYTES) index_ctr; DirectCounter>(CounterType_t.PACKETS_AND_BYTES) forward_ctr; @@ -929,22 +946,27 @@ control RouterLookupIndex6( hdr.vlan.vlan_id = vlan_id; hdr.vlan.ether_type = hdr.ethernet.ether_type; hdr.ethernet.ether_type = ETHERTYPE_VLAN; - res.port = port; - res.nexthop = nexthop; + ig_tm_md.ucast_egress_port = port; + hdr.ipv6.hop_limit = hdr.ipv6.hop_limit - 1; + meta.nexthop = nexthop; + meta.nexthop_is_v6 = true; + meta.resolve_nexthop = true; forward_ctr.count(); } action forward(PortId_t port, ipv6_addr_t nexthop) { - res.port = port; - res.nexthop = nexthop; + ig_tm_md.ucast_egress_port = port; + hdr.ipv6.hop_limit = hdr.ipv6.hop_limit - 1; + meta.nexthop = nexthop; + meta.nexthop_is_v6 = true; + meta.resolve_nexthop = true; forward_ctr.count(); } /* - * The table size is reduced by one here just to allow the integration - * test to pass. We want the lookup and forward tables to have the same - * capacity from dpd's perspective, and the "default" entry consumes a - * slot in the lookup table. + * Index 0 is reserved as the unreachable/miss sentinel set by + * `unreachable()`. The freemap allocates indices 1..IPV6_LPM_SIZE - 1, + * leaving slot 0 vacant so misses cleanly fall through. 
*/ table route { key = { res.idx: exact; } @@ -958,8 +980,8 @@ control RouterLookupIndex6( res.idx = 0; res.slots = 0; res.slot = 0; - res.port = 0; - res.nexthop = 0; + ICMP_ERROR_SETUP(ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOROUTE); + meta.drop_reason = DROP_IPV6_UNROUTEABLE; index_ctr.count(); } @@ -969,28 +991,24 @@ control RouterLookupIndex6( */ #include - action index(bit<16> idx, bit<8> slots) { + action index(bit<16> idx, bit<8> slots, bit<1> skip_ttl) { res.is_hit = true; - res.idx = idx; res.slots = slots; res.slot = 0; - - // The rest of this data is extracted from the target table at - // entry `res.idx`. - res.port = 0; - res.nexthop = 0; + meta.skip_ttl_check = (skip_ttl == 1); index_ctr.count(); } + /* + * Sized to IPV6_LPM_SIZE without padding. A +1 cushion crosses a + * hardware boundary in the v6 LPM and costs an ingress stage. + */ table lookup { key = { hdr.ipv6.dst_addr: lpm; } actions = { index; unreachable; } default_action = unreachable; - // The table size is incremented by one here just to allow the - // integration tests to pass, as this is used by the multicast - // implementation as well - const size = IPV6_LPM_SIZE + 1; + const size = IPV6_LPM_SIZE; counters = index_ctr; } @@ -1007,14 +1025,33 @@ control RouterLookupIndex6( */ select_route.apply(); res.idx = res.idx + res.slot; - route.apply(); + + if (meta.route_ttl_is_1 && !meta.skip_ttl_check) { + // TTL=1 short-circuit, inlined to avoid a dedicated + // table stage. Aggregate count surfaces through the + // DROP_IPV6_TTL_EXCEEDED drop-reason counter. + // + // Packets routed to the service port (skip_ttl_check=true) + // bypass this setup. They are not counted as TTL_EXCEEDED + // even if userspace later rejects them. If a "TTL=1 bypassed + // to userspace" signal becomes useful for diagnostics, add a + // dedicated counter rather than relaxing the bypass. 
+ ICMP_ERROR_SETUP( + ICMP6_TIME_EXCEEDED, ICMP_EXC_TTL); + meta.drop_reason = DROP_IPV6_TTL_EXCEEDED; + } else { + route.apply(); + } } } } control RouterLookupIndex4( inout sidecar_headers_t hdr, - inout route4_result_t res + inout sidecar_ingress_meta_t meta, + in ingress_intrinsic_metadata_t ig_intr_md, + inout ingress_intrinsic_metadata_for_tm_t ig_tm_md, + inout route_result_t res ) { DirectCounter>(CounterType_t.PACKETS_AND_BYTES) index_ctr; DirectCounter>(CounterType_t.PACKETS_AND_BYTES) forward_ctr; @@ -1027,9 +1064,11 @@ control RouterLookupIndex4( hdr.vlan.vlan_id = vlan_id; hdr.vlan.ether_type = hdr.ethernet.ether_type; hdr.ethernet.ether_type = ETHERTYPE_VLAN; - res.port = port; - res.nexthop = nexthop; - res.is_v6 = false; + ig_tm_md.ucast_egress_port = port; + hdr.ipv4.ttl = hdr.ipv4.ttl - 1; + meta.nexthop = (ipv6_addr_t)nexthop; + meta.nexthop_is_v6 = false; + meta.resolve_nexthop = true; forward_ctr.count(); } @@ -1041,37 +1080,51 @@ control RouterLookupIndex4( hdr.vlan.vlan_id = vlan_id; hdr.vlan.ether_type = hdr.ethernet.ether_type; hdr.ethernet.ether_type = ETHERTYPE_VLAN; - res.port = port; - res.nexthop6 = nexthop; - res.is_v6 = true; + ig_tm_md.ucast_egress_port = port; + hdr.ipv4.ttl = hdr.ipv4.ttl - 1; + meta.nexthop = nexthop; + meta.nexthop_is_v6 = true; + meta.resolve_nexthop = true; forward_ctr.count(); } action forward(PortId_t port, ipv4_addr_t nexthop) { - res.port = port; - res.nexthop = nexthop; - res.is_v6 = false; + ig_tm_md.ucast_egress_port = port; + hdr.ipv4.ttl = hdr.ipv4.ttl - 1; + meta.nexthop = (ipv6_addr_t)nexthop; + meta.nexthop_is_v6 = false; + meta.resolve_nexthop = true; forward_ctr.count(); } action forward_v6(PortId_t port, ipv6_addr_t nexthop) { - res.port = port; - res.nexthop6 = nexthop; - res.is_v6 = true; + ig_tm_md.ucast_egress_port = port; + hdr.ipv4.ttl = hdr.ipv4.ttl - 1; + meta.nexthop = nexthop; + meta.nexthop_is_v6 = true; + meta.resolve_nexthop = true; + forward_ctr.count(); + } + + action 
ttl_exceeded() { + ICMP_ERROR_SETUP(ICMP_TIME_EXCEEDED, ICMP_EXC_TTL); + meta.drop_reason = DROP_IPV4_TTL_EXCEEDED; forward_ctr.count(); } /* - * The table size is reduced by one here just to allow the integration - * test to pass. We want the lookup and forward tables to have the same - * capacity from dpd's perspective, and the "default" entry consumes a - * slot in the lookup table. + * IPv4 still uses the older compound-key encoding: each logical route + * consumes two physical entries keyed by `(idx, route_ttl_is_1)`, one + * for normal forwarding and one for the ttl_exceeded path. IPv6 was + * collapsed to one entry per logical route by moving TTL=1 handling + * inline in the apply block. IPv4 keeps the two-entry form because it + * still needs per-target TTL behavior for mixed ECMP sets. */ table route { - key = { res.idx: exact; } - actions = { forward; forward_v6; forward_vlan; forward_vlan_v6; } - const size = IPV4_LPM_SIZE - 1; - counters = forward_ctr; + key = { res.idx: exact; meta.route_ttl_is_1: exact; } + actions = { forward; forward_v6; forward_vlan; forward_vlan_v6; ttl_exceeded; } + const size = IPV4_LPM_SIZE * FWD_ENTRIES_PER_ROUTE_V4 - 1; + counters = forward_ctr; } action unreachable() { @@ -1079,8 +1132,8 @@ control RouterLookupIndex4( res.idx = 0; res.slots = 0; res.slot = 0; - res.port = 0; - res.nexthop = 0; + ICMP_ERROR_SETUP(ICMP_DEST_UNREACH, ICMP_DST_UNREACH_NET); + meta.drop_reason = DROP_IPV4_UNROUTEABLE; index_ctr.count(); } @@ -1092,15 +1145,9 @@ control RouterLookupIndex4( action index(bit<16> idx, bit<8> slots) { res.is_hit = true; - res.idx = idx; res.slots = slots; res.slot = 0; - - // The rest of this data is extracted from the target table at - // entry `res.idx`. 
- res.port = 0; - res.nexthop = 0; index_ctr.count(); } @@ -1153,10 +1200,11 @@ control Arp ( action request() { hdr.sidecar.sc_code = SC_ARP_NEEDED; + hdr.sidecar.sc_pad = 0; hdr.sidecar.sc_ingress = (bit<16>)ig_intr_md.ingress_port; hdr.sidecar.sc_egress = (bit<16>)ig_tm_md.ucast_egress_port; hdr.sidecar.sc_ether_type = hdr.ethernet.ether_type; - hdr.sidecar.sc_payload = (bit<128>)meta.nexthop_ipv4; + hdr.sidecar.sc_payload = (bit<128>)meta.nexthop; hdr.sidecar.setValid(); hdr.ethernet.ether_type = ETHERTYPE_SIDECAR; meta.service_routed = true; @@ -1168,7 +1216,11 @@ control Arp ( } table tbl { - key = { meta.nexthop_ipv4: exact; } + // @name required for complex key expressions (casts); provides + // the control-plane name used by Rust match_xlate. + key = { + (ipv4_addr_t)meta.nexthop : exact @name("nexthop"); + } actions = { drop; request; rewrite; } default_action = request; const size = IPV4_ARP_SIZE; @@ -1201,10 +1253,11 @@ control Ndp ( action request() { hdr.sidecar.sc_code = SC_NEIGHBOR_NEEDED; + hdr.sidecar.sc_pad = 0; hdr.sidecar.sc_ingress = (bit<16>)ig_intr_md.ingress_port; hdr.sidecar.sc_egress = (bit<16>)ig_tm_md.ucast_egress_port; hdr.sidecar.sc_ether_type = hdr.ethernet.ether_type; - hdr.sidecar.sc_payload = (bit<128>)meta.nexthop_ipv6; + hdr.sidecar.sc_payload = (bit<128>)meta.nexthop; hdr.sidecar.setValid(); hdr.ethernet.ether_type = ETHERTYPE_SIDECAR; meta.service_routed = true; @@ -1216,7 +1269,7 @@ control Ndp ( } table tbl { - key = { meta.nexthop_ipv6: exact; } + key = { meta.nexthop: exact; } actions = { drop; rewrite; request; } default_action = request; const size = IPV6_NEIGHBOR_SIZE; @@ -1230,33 +1283,22 @@ control Router4 ( inout sidecar_headers_t hdr, inout sidecar_ingress_meta_t meta, in ingress_intrinsic_metadata_t ig_intr_md, - inout ingress_intrinsic_metadata_for_tm_t ig_tm_md + inout ingress_intrinsic_metadata_for_tm_t ig_tm_md, + inout route_result_t fwd ) { RouterLookupIndex4() lookup_idx; Hash>(HashAlgorithm_t.CRC8) 
index_hash; - action icmp_error(bit<8> type, bit<8> code) { - hdr.sidecar.sc_code = SC_ICMP_NEEDED; - hdr.sidecar.sc_ingress = (bit<16>)ig_intr_md.ingress_port; - hdr.sidecar.sc_egress = (bit<16>)ig_tm_md.ucast_egress_port; - hdr.sidecar.sc_ether_type = hdr.ethernet.ether_type; - hdr.sidecar.sc_payload = (bit<128>)type << 8 | (bit<128>)code; - hdr.sidecar.setValid(); - hdr.ethernet.ether_type = ETHERTYPE_SIDECAR; - meta.service_routed = true; - ig_tm_md.ucast_egress_port = USER_SPACE_SERVICE_PORT; - } - apply { - route4_result_t fwd; - fwd.is_v6 = false; - fwd.nexthop6 = 0; - fwd.nexthop = 0; - fwd.port = 0; + // fwd is passed in from L3Router to share PHV allocation with Router6 fwd.is_hit = false; fwd.idx = 0; fwd.slots = 0; fwd.slot = 0; + meta.resolve_nexthop = false; + meta.nexthop = 0; + meta.nexthop_is_v6 = false; + meta.route_ttl_is_1 = hdr.ipv4.ttl == 1; // Our route selection table is 11 bits wide, and we need 5 bits // of that for our "slot count" index. Thus, we only need 6 // bits of the 8-bit hash calculated here to complete the 11-bit @@ -1268,26 +1310,7 @@ control Router4 ( meta.l4_src_port }) & 0x3f; - lookup_idx.apply(hdr, fwd); - - if (!fwd.is_hit) { - icmp_error(ICMP_DEST_UNREACH, ICMP_DST_UNREACH_NET); - // Dont set meta.dropped because we want an error packet - // to go out. - meta.drop_reason = DROP_IPV4_UNROUTEABLE; - } else if (hdr.ipv4.ttl == 1 && !IS_SERVICE(fwd.port)) { - icmp_error(ICMP_TIME_EXCEEDED, ICMP_EXC_TTL); - // Dont set meta.dropped because we want an error packet - // to go out. 
- meta.drop_reason = DROP_IPV4_TTL_EXCEEDED; - } else { - hdr.ipv4.ttl = hdr.ipv4.ttl - 1; - ig_tm_md.ucast_egress_port = fwd.port; - - meta.nexthop_ipv4 = fwd.nexthop; - meta.nexthop_ipv6 = fwd.nexthop6; - meta.resolve_nexthop = true; - } + lookup_idx.apply(hdr, meta, ig_intr_md, ig_tm_md, fwd); } } @@ -1300,18 +1323,6 @@ control MulticastRouter4( ) { DirectCounter>(CounterType_t.PACKETS_AND_BYTES) ctr; - action icmp_error(bit<8> type, bit<8> code) { - hdr.sidecar.sc_code = SC_ICMP_NEEDED; - hdr.sidecar.sc_ingress = (bit<16>)ig_intr_md.ingress_port; - hdr.sidecar.sc_egress = (bit<16>)ig_tm_md.ucast_egress_port; - hdr.sidecar.sc_ether_type = hdr.ethernet.ether_type; - hdr.sidecar.sc_payload = (bit<128>)type << 8 | (bit<128>)code; - hdr.sidecar.setValid(); - hdr.ethernet.ether_type = ETHERTYPE_SIDECAR; - meta.service_routed = true; - ig_tm_md.ucast_egress_port = USER_SPACE_SERVICE_PORT; - } - action unreachable() { ctr.count(); } @@ -1337,7 +1348,7 @@ control MulticastRouter4( } actions = { forward; forward_vlan; unreachable; } default_action = unreachable; - const size = IPV4_MULTICAST_TABLE_SIZE; + const size = MCAST_ROUTER_IPV4_SIZE; counters = ctr; } @@ -1353,12 +1364,12 @@ control MulticastRouter4( } if (!tbl.apply().hit) { - icmp_error(ICMP_DEST_UNREACH, ICMP_DST_UNREACH_NET); + ICMP_ERROR_SETUP(ICMP_DEST_UNREACH, ICMP_DST_UNREACH_NET); meta.drop_reason = DROP_IPV4_UNROUTEABLE; // Dont set meta.dropped because we want an error packet // to go out. } else if (hdr.ipv4.ttl == 1 && !meta.service_routed) { - icmp_error(ICMP_TIME_EXCEEDED, ICMP_EXC_TTL); + ICMP_ERROR_SETUP(ICMP_TIME_EXCEEDED, ICMP_EXC_TTL); meta.drop_reason = DROP_IPV4_TTL_INVALID; // Dont set meta.dropped because we want an error packet // to go out. 
@@ -1377,31 +1388,22 @@ control Router6 ( inout sidecar_headers_t hdr, inout sidecar_ingress_meta_t meta, in ingress_intrinsic_metadata_t ig_intr_md, - inout ingress_intrinsic_metadata_for_tm_t ig_tm_md + inout ingress_intrinsic_metadata_for_tm_t ig_tm_md, + inout route_result_t fwd ) { RouterLookupIndex6() lookup_idx; Hash>(HashAlgorithm_t.CRC8) index_hash; - action icmp_error(bit<8> type, bit<8> code) { - hdr.sidecar.sc_code = SC_ICMP_NEEDED; - hdr.sidecar.sc_ingress = (bit<16>)ig_intr_md.ingress_port; - hdr.sidecar.sc_egress = (bit<16>)ig_tm_md.ucast_egress_port; - hdr.sidecar.sc_ether_type = hdr.ethernet.ether_type; - hdr.sidecar.sc_payload = (bit<128>)type << 8 | (bit<128>)code; - hdr.sidecar.setValid(); - hdr.ethernet.ether_type = ETHERTYPE_SIDECAR; - meta.service_routed = true; - ig_tm_md.ucast_egress_port = USER_SPACE_SERVICE_PORT; - } - apply { - route6_result_t fwd; - fwd.nexthop = 0; - fwd.port = 0; + // fwd is passed in from L3Router to share PHV allocation with Router4 fwd.is_hit = false; fwd.idx = 0; fwd.slots = 0; fwd.slot = 0; + meta.resolve_nexthop = false; + meta.nexthop = 0; + meta.nexthop_is_v6 = false; + meta.route_ttl_is_1 = hdr.ipv6.hop_limit == 1; // Our route selection table is 11 bits wide, and we need 5 bits // of that for our "slot count" index. Thus, we only need 6 // bits of the 8-bit hash calculated here to complete the 11-bit @@ -1413,24 +1415,7 @@ control Router6 ( meta.l4_src_port }) & 0x3f; - lookup_idx.apply(hdr, fwd); - - if (!fwd.is_hit) { - icmp_error(ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOROUTE); - meta.drop_reason = DROP_IPV6_UNROUTEABLE; - // Dont set meta.dropped because we want an error packet - // to go out. - } else if (hdr.ipv6.hop_limit == 1 && !IS_SERVICE(fwd.port)) { - icmp_error(ICMP6_TIME_EXCEEDED, ICMP_EXC_TTL); - meta.drop_reason = DROP_IPV6_TTL_EXCEEDED; - // Dont set meta.dropped because we want an error packet - // to go out. 
- } else { - ig_tm_md.ucast_egress_port = fwd.port; - hdr.ipv6.hop_limit = hdr.ipv6.hop_limit - 1; - meta.resolve_nexthop = true; - meta.nexthop_ipv6 = fwd.nexthop; - } + lookup_idx.apply(hdr, meta, ig_intr_md, ig_tm_md, fwd); } } @@ -1443,18 +1428,6 @@ control MulticastRouter6 ( ) { DirectCounter>(CounterType_t.PACKETS_AND_BYTES) ctr; - action icmp_error(bit<8> type, bit<8> code) { - hdr.sidecar.sc_code = SC_ICMP_NEEDED; - hdr.sidecar.sc_ingress = (bit<16>)ig_intr_md.ingress_port; - hdr.sidecar.sc_egress = (bit<16>)ig_tm_md.ucast_egress_port; - hdr.sidecar.sc_ether_type = hdr.ethernet.ether_type; - hdr.sidecar.sc_payload = (bit<128>)type << 8 | (bit<128>)code; - hdr.sidecar.setValid(); - hdr.ethernet.ether_type = ETHERTYPE_SIDECAR; - meta.service_routed = true; - ig_tm_md.ucast_egress_port = USER_SPACE_SERVICE_PORT; - } - action unreachable() { ctr.count(); } @@ -1479,7 +1452,7 @@ control MulticastRouter6 ( } actions = { forward; forward_vlan; unreachable; } default_action = unreachable; - const size = IPV6_MULTICAST_TABLE_SIZE; + const size = MCAST_ROUTER_IPV6_SIZE; counters = ctr; } @@ -1495,12 +1468,12 @@ control MulticastRouter6 ( } if (!tbl.apply().hit) { - icmp_error(ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOROUTE); + ICMP_ERROR_SETUP(ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOROUTE); meta.drop_reason = DROP_IPV6_UNROUTEABLE; // Dont set meta.dropped because we want an error packet // to go out. } else if (hdr.ipv6.hop_limit == 1) { - icmp_error(ICMP6_TIME_EXCEEDED, ICMP_EXC_TTL); + ICMP_ERROR_SETUP(ICMP6_TIME_EXCEEDED, ICMP_EXC_TTL); meta.drop_reason = DROP_IPV6_TTL_EXCEEDED; // Dont set meta.dropped because we want an error packet // to go out. @@ -1519,59 +1492,65 @@ control L3Router( in ingress_intrinsic_metadata_t ig_intr_md, inout ingress_intrinsic_metadata_for_tm_t ig_tm_md ) { + Router4() router4; + Router6() router6; + apply { -#ifdef MULTICAST + // Shared: allocate a single route_result_t for Router4 and Router6. 
+ // This forces the compiler to use the same PHV allocation for both, + // preventing live-range divergence under high PHV pressure. + route_result_t fwd; + fwd.is_hit = false; + fwd.ecmp_hash = 0; + fwd.idx = 0; + fwd.slots = 0; + fwd.slot = 0; + if (hdr.ipv4.isValid()) { +#ifdef MULTICAST if (meta.is_mcast && !meta.is_link_local_mcastv6) { MulticastRouter4.apply(hdr, meta, ig_intr_md, ig_tm_md); - } else { - Router4.apply(hdr, meta, ig_intr_md, ig_tm_md); + } else +#endif /* MULTICAST */ + { + router4.apply(hdr, meta, ig_intr_md, ig_tm_md, fwd); } } else if (hdr.ipv6.isValid()) { +#ifdef MULTICAST if (meta.is_mcast && !meta.is_link_local_mcastv6) { MulticastRouter6.apply(hdr, meta, ig_intr_md, ig_tm_md); - } else { - Router6.apply(hdr, meta, ig_intr_md, ig_tm_md); + } else +#endif /* MULTICAST */ + { + router6.apply(hdr, meta, ig_intr_md, ig_tm_md, fwd); } } -#else /* MULTICAST */ - if (hdr.ipv4.isValid()) { - Router4.apply(hdr, meta, ig_intr_md, ig_tm_md); - } else if (hdr.ipv6.isValid()) { - Router6.apply(hdr, meta, ig_intr_md, ig_tm_md); - } -#endif /* MULTICAST */ if (meta.resolve_nexthop) { - if (meta.nexthop_ipv4 != 0) { - Arp.apply(hdr, meta, ig_intr_md, ig_tm_md); - } else { + if (meta.nexthop_is_v6) { Ndp.apply(hdr, meta, ig_intr_md, ig_tm_md); + } else { + Arp.apply(hdr, meta, ig_intr_md, ig_tm_md); } } } } -/* - * XXX: this control could be moved to the Egress pipeline if we need more space - * in the Ingress pipeline. Currently unicast packets are able to bypass that - * pipeline, which is why we've tacked it on here. We could probably also merge - * it with the MacRewrite control, as they are both per-port settings, but that - * would present some weird semantics to the control plane daemon. - */ -control EgressFilter( - inout sidecar_ingress_meta_t meta, - in ingress_intrinsic_metadata_for_tm_t ig_tm_md +// Filter NAT egress traffic by port. 
Ports not explicitly marked as uplinks +// drop guest traffic to prevent NAT'd packets from egressing on non-uplink +// ports. Placed in the egress pipeline to avoid adding a stage to ingress. +control NatEgressFilter( + inout sidecar_egress_meta_t meta, + in egress_intrinsic_metadata_t eg_intr_md ) { action guest_traffic_not_allowed() { meta.drop_reason = DROP_NAT_EGRESS_BLOCKED; - meta.dropped = true; } action guest_traffic_allowed() { } table egress_filter { - key = { ig_tm_md.ucast_egress_port : exact; } + key = { eg_intr_md.egress_port : exact; } actions = { guest_traffic_allowed; guest_traffic_not_allowed; } const size = 256; @@ -1583,39 +1562,14 @@ control EgressFilter( } } -control MacRewrite( - inout sidecar_headers_t hdr, - in PortId_t port -) { - DirectCounter>(CounterType_t.PACKETS_AND_BYTES) ctr; - - action rewrite(mac_addr_t mac) { - hdr.ethernet.src_mac = mac; - ctr.count(); - } - - table mac_rewrite { - key = { port: exact ; } - actions = { rewrite; } - - const size = 256; - counters = ctr; - } - apply { - mac_rewrite.apply(); - } -} - -#ifdef MULTICAST -/* This control is used to rewrite the source and destination MAC addresses - * for multicast packets. The destination MAC address is derived from the - * destination IP address, and the source MAC address is set based on the - * egress port the packet is being sent out on. +/* Rewrite the source MAC address based on the egress port. For multicast + * packets, also derive the destination MAC from the destination IP address. */ -control MulticastMacRewrite( +control MacRewrite( inout sidecar_headers_t hdr, - in PortId_t port + in PortId_t port, + in bool derive_dst_mac ) { DirectCounter>(CounterType_t.PACKETS_AND_BYTES) ctr; @@ -1633,46 +1587,32 @@ control MulticastMacRewrite( } apply { - if (mac_rewrite.apply().hit) { + bool hit = mac_rewrite.apply().hit; + // Derive multicast dst_mac only when src_mac rewrite succeeds. + if (hit && derive_dst_mac) { // Derive the destination MAC based on IP type. 
// IPV4: https://www.rfc-editor.org/rfc/rfc1112.html#section-6.4 // IPV6: https://www.rfc-editor.org/rfc/rfc2464.html - if (hdr.ipv4.isValid() || (!hdr.geneve.isValid() && hdr.inner_ipv4.isValid())) { + if (hdr.ipv4.isValid()) { // IPv4 multicast MAC address (01:00:5e + 23 bits of IP) - bit<48> mcast_mac = 0; - // Set the first three bytes to 01:00:5e (0x01005e) - mcast_mac = (bit<48>)0x01005e << 24; - - bit<24> ip_suffix; - // Take the last 23 bits of IPv4 address and append them - // We mask the first byte to clear the top bit - if (hdr.ipv4.isValid()) { - ip_suffix = (bit<24>)(hdr.ipv4.dst_addr & 0x007fffff); - } else { - ip_suffix = (bit<24>)(hdr.inner_ipv4.dst_addr & 0x007fffff); - } - - hdr.ethernet.dst_mac = mcast_mac | ((bit<48>)ip_suffix); - } else if (hdr.ipv6.isValid() || (!hdr.geneve.isValid() && hdr.inner_ipv6.isValid())) { + hdr.ethernet.dst_mac = + IPV4_MCAST_MAC_PREFIX ++ 1w0 ++ hdr.ipv4.dst_addr[22:0]; + } else if (!hdr.geneve.isValid() && hdr.inner_ipv4.isValid()) { + hdr.ethernet.dst_mac = + IPV4_MCAST_MAC_PREFIX ++ 1w0 ++ hdr.inner_ipv4.dst_addr[22:0]; + } else if (hdr.ipv6.isValid()) { // IPv6 multicast MAC address (33:33 + last 32 bits of IPv6) - bit<48> mcast_mac = 0; - // Set the first two bytes to 33:33 - mcast_mac = (bit<48>)0x3333 << 32; - - bit<48> ip_suffix; - // Take the last 32 bits of IPv6 address and append them - if (hdr.ipv6.isValid()) { - ip_suffix = (bit<48>)(hdr.ipv6.dst_addr[31:0]); - } else { - ip_suffix = (bit<48>)(hdr.inner_ipv6.dst_addr[31:0]); - } - - hdr.ethernet.dst_mac = mcast_mac | ip_suffix; + hdr.ethernet.dst_mac = + IPV6_MCAST_MAC_PREFIX ++ hdr.ipv6.dst_addr[31:0]; + } else if (!hdr.geneve.isValid() && hdr.inner_ipv6.isValid()) { + hdr.ethernet.dst_mac = + IPV6_MCAST_MAC_PREFIX ++ hdr.inner_ipv6.dst_addr[31:0]; } } } } +#ifdef MULTICAST /* This control is used to configure multicast packets for replication. 
* It includes actions for dropping packets with no group, allowing * source-specific multicast, and configuring multicast group IDs and hashes. @@ -1724,7 +1664,7 @@ control MulticastIngress ( mcast_ipv4_ssm_ctr.count(); } - // Drop action for IPv6 ulticast packets with no source-specific multicast + // Drop action for IPv6 multicast packets with no source-specific multicast // group. action drop_mcastv6_filtered_source() { meta.drop_reason = DROP_MULTICAST_SOURCE_FILTERED; @@ -1788,7 +1728,7 @@ control MulticastIngress ( drop_mcastv4_filtered_source; } default_action = drop_mcastv4_filtered_source; - const size = IPV4_MULTICAST_TABLE_SIZE; + const size = MCAST_SOURCE_FILTER_IPV4_SIZE; counters = mcast_ipv4_ssm_ctr; } @@ -1799,7 +1739,7 @@ control MulticastIngress ( drop_mcastv6_admin_scoped_no_group; } default_action = drop_mcastv6_admin_scoped_no_group; - const size = IPV6_MULTICAST_TABLE_SIZE; + const size = MCAST_REPLICATION_IPV6_SIZE; counters = mcast_ipv6_ctr; } @@ -1813,7 +1753,7 @@ control MulticastIngress ( drop_mcastv6_filtered_source; } default_action = drop_mcastv6_filtered_source; - const size = IPV6_MULTICAST_TABLE_SIZE; + const size = MCAST_SOURCE_FILTER_IPV6_SIZE; counters = mcast_ipv6_ssm_ctr; } @@ -1851,16 +1791,18 @@ control MulticastIngress ( NoAction; } + // Priority order: first match wins. The geneve tag entries are + // most specific (both headers valid + exact tag), followed by + // group-ID fallbacks for non-geneve multicast. const entries = { ( _, _, true, true, MULTICAST_TAG_EXTERNAL ) : invalidate_underlay_grp_and_set_decap; ( _, _, true, true, MULTICAST_TAG_UNDERLAY ) : invalidate_external_grp; ( _, _, true, true, MULTICAST_TAG_UNDERLAY_EXTERNAL ) : NoAction; ( 0, _, _, _, _ ) : invalidate_external_grp; ( _, 0, _, _, _ ) : invalidate_underlay_grp; - ( 0, 0, _, _, _ ) : invalidate_grps; } - const size = 6; + const size = 5; } // Note: SSM tables currently take one extra stage in the pipeline (17->18). 
@@ -1987,7 +1929,7 @@ control MulticastEgress ( } // Group RIDs == Group IPs - const size = IPV6_MULTICAST_TABLE_SIZE; + const size = MCAST_DECAP_PORTS_SIZE; } action set_port_number(bit<8> port_number) { @@ -2126,14 +2068,11 @@ control Ingress( NatIngress() nat_ingress; NatEgress() nat_egress; L3Router() l3_router; - EgressFilter() egress_filter; #ifdef MULTICAST MulticastIngress() mcast_ingress; #endif /* MULTICAST */ - MacRewrite() mac_rewrite; Counter, PortId_t>(512, CounterType_t.PACKETS_AND_BYTES) ingress_ctr; - Counter, PortId_t>(512, CounterType_t.PACKETS_AND_BYTES) egress_ctr; Counter, PortId_t>(512, CounterType_t.PACKETS) drop_port_ctr; Counter, bit<8>>(DROP_REASON_MAX, CounterType_t.PACKETS) drop_reason_ctr; Counter, bit<10>>(1024, CounterType_t.PACKETS) packet_ctr; @@ -2167,7 +2106,7 @@ control Ingress( if (!meta.is_mcast || meta.is_link_local_mcastv6) { services.apply(hdr, meta, ig_intr_md, ig_tm_md); #ifdef MULTICAST - } else if (meta.is_mcast && !meta.is_link_local_mcastv6) { + } else { mcast_ingress.apply(hdr, meta, ig_intr_md, ig_tm_md); #endif /* MULTICAST */ } @@ -2180,26 +2119,25 @@ control Ingress( if (!meta.dropped) { l3_router.apply(hdr, meta, ig_intr_md, ig_tm_md); } - if (!meta.dropped && meta.nat_egress_hit && - !meta.is_mcast && !meta.service_routed) { - egress_filter.apply(meta, ig_tm_md); - } } if (meta.dropped) { - // Handle dropped packets + // Handle dropped packets. Unicast packets proceed to egress for + // MAC rewrite and are counted by unicast_ctr in Egress for + // consistency. ig_dprsr_md.drop_ctl = 1; drop_port_ctr.count(ig_intr_md.ingress_port); drop_reason_ctr.count(meta.drop_reason); - } else if (!meta.is_mcast) { - egress_ctr.count(ig_tm_md.ucast_egress_port); - if (ig_tm_md.ucast_egress_port != USER_SPACE_SERVICE_PORT) { - mac_rewrite.apply(hdr, ig_tm_md.ucast_egress_port); - } - meta.bridge_hdr.setInvalid(); - ig_tm_md.bypass_egress = 1w1; } + // Pass state to egress via bridge header. 
+ if (meta.is_mcast && !meta.is_link_local_mcastv6) { + meta.bridge_hdr.is_mcast_routed = true; + } else { + meta.bridge_hdr.is_mcast_routed = false; + } + meta.bridge_hdr.nat_egress_hit = meta.nat_egress_hit; + if (meta.encap_needed) { // This works around a few things which cropped up in // supporting several concurrent Geneve options: @@ -2327,70 +2265,129 @@ control Egress( inout egress_intrinsic_metadata_for_deparser_t eg_dprsr_md, inout egress_intrinsic_metadata_for_output_port_t eg_oport_md ) { + // Separate MacRewrite instances for unicast and multicast paths. + // The P4 compiler requires each table to have a single deterministic + // next-table chain. Using one instance from multiple control flow + // paths causes "incompatible next-table chains" errors. Separate + // instances also provide distinct DirectCounters for traffic accounting. + NatEgressFilter() egress_filter; + MacRewrite() unicast_mac_rewrite; #ifdef MULTICAST - MulticastMacRewrite() mac_rewrite; + MacRewrite() mcast_mac_rewrite; MulticastEgress() mcast_egress; +#endif /* MULTICAST */ + Counter, PortId_t>(512, CounterType_t.PACKETS_AND_BYTES) forwarded_ctr; Counter, PortId_t>(512, CounterType_t.PACKETS_AND_BYTES) unicast_ctr; - Counter, PortId_t>(512, CounterType_t.PACKETS_AND_BYTES) mcast_ctr; Counter, PortId_t>(512, CounterType_t.PACKETS_AND_BYTES) link_local_mcast_ctr; - Counter, PortId_t>(512, CounterType_t.PACKETS_AND_BYTES) external_mcast_ctr; - Counter, PortId_t>(512, CounterType_t.PACKETS_AND_BYTES) underlay_mcast_ctr; Counter, PortId_t>(512, CounterType_t.PACKETS) drop_port_ctr; Counter, bit<8>>(DROP_REASON_MAX, CounterType_t.PACKETS) drop_reason_ctr; +#ifdef MULTICAST + Counter, PortId_t>(512, CounterType_t.PACKETS_AND_BYTES) mcast_ctr; + Counter, PortId_t>(512, CounterType_t.PACKETS_AND_BYTES) external_mcast_ctr; + Counter, PortId_t>(512, CounterType_t.PACKETS_AND_BYTES) underlay_mcast_ctr; +#endif /* MULTICAST */ apply { - // Check multicast egress packets by checking that RID is 
not 0. - bool is_egress_rid_mcast = eg_intr_md.egress_rid > 0; - // We track IPv6 multicast packets separately for counters. + // Link-local IPv6 multicast: ff02::/16 scope prefix. bool is_link_local_ipv6_mcast = false; if (hdr.ipv6.isValid()) { - bit<16> ipv6_prefix = (bit<16>)hdr.ipv6.dst_addr[127:112]; - is_link_local_ipv6_mcast = (ipv6_prefix == 16w0xff02); + if (hdr.ipv6.dst_addr[127:112] == IPV6_LINK_LOCAL_16) { + is_link_local_ipv6_mcast = true; + } + } + +#ifdef MULTICAST + // Multicast state from bridge header. + PortId_t ingress_port = meta.bridge_hdr.ingress_port; + bit<2> mcast_tag; + if (hdr.geneve_opts.oxg_mcast.isValid()) { + mcast_tag = hdr.geneve_opts.oxg_mcast.mcast_tag; + } else { + mcast_tag = MULTICAST_TAG_INVALID; } - bool is_mcast = is_egress_rid_mcast || is_link_local_ipv6_mcast; + bool is_mcast_routed = meta.bridge_hdr.is_mcast_routed; - if (is_egress_rid_mcast == true) { - if (meta.bridge_hdr.ingress_port == eg_intr_md.egress_port) { - // If the ingress port is the same as the egress port, drop - // the packet + if (eg_intr_md.egress_rid != 0) { + // Replicated multicast packet (egress_rid > 0 from PRE). + if (ingress_port == eg_intr_md.egress_port) { + // Drop if ingress port equals egress port (path filter). meta.drop_reason = DROP_MULTICAST_PATH_FILTERED; } else { mcast_egress.apply(hdr, meta, eg_intr_md, eg_dprsr_md); - mac_rewrite.apply(hdr, eg_intr_md.egress_port); + mcast_mac_rewrite.apply(hdr, eg_intr_md.egress_port, true); } - } else if (eg_intr_md.egress_rid == 0 && - eg_intr_md.egress_rid_first == 1) { - // Drop CPU copies (RID=0) to prevent unwanted packets on port 0 + } else if (is_mcast_routed) { + // CPU copy: routed to multicast but egress_rid == 0. + eg_dprsr_md.drop_ctl = 1; meta.drop_reason = DROP_MULTICAST_CPU_COPY; + } else { + // Unicast: check egress filter, then rewrite src_mac. 
+ if (meta.bridge_hdr.nat_egress_hit) { + egress_filter.apply(meta, eg_intr_md); + } + if (meta.drop_reason == 0 && + eg_intr_md.egress_port != USER_SPACE_SERVICE_PORT && + !is_link_local_ipv6_mcast) { + unicast_mac_rewrite.apply(hdr, eg_intr_md.egress_port, false); + } } +#else /* MULTICAST */ + // Non-multicast: check egress filter for NAT traffic, + // then rewrite src_mac. + if (meta.bridge_hdr.nat_egress_hit) { + egress_filter.apply(meta, eg_intr_md); + } + if (meta.drop_reason == 0 && + eg_intr_md.egress_port != USER_SPACE_SERVICE_PORT && + !is_link_local_ipv6_mcast) { + unicast_mac_rewrite.apply(hdr, eg_intr_md.egress_port, false); + } +#endif /* MULTICAST */ + // Shared: drop and forwarded counting if (meta.drop_reason != 0) { - // Handle dropped packets drop_port_ctr.count(eg_intr_md.egress_port); drop_reason_ctr.count(meta.drop_reason); eg_dprsr_md.drop_ctl = 1; - } else if (is_mcast == true) { - mcast_ctr.count(eg_intr_md.egress_port); + } else { + forwarded_ctr.count(eg_intr_md.egress_port); +#ifdef MULTICAST + // Multicast-specific counting. Use the mcast_tag + // local (captured before egress decap may strip + // geneve headers) rather than rechecking header + // validity. 
+ if (is_mcast_routed) { + mcast_ctr.count(eg_intr_md.egress_port); + if (mcast_tag == MULTICAST_TAG_UNDERLAY) { + underlay_mcast_ctr.count( + eg_intr_md.egress_port); + } else if (mcast_tag == MULTICAST_TAG_EXTERNAL) { + external_mcast_ctr.count( + eg_intr_md.egress_port); + } else if (mcast_tag == MULTICAST_TAG_UNDERLAY_EXTERNAL) { + underlay_mcast_ctr.count( + eg_intr_md.egress_port); + external_mcast_ctr.count( + eg_intr_md.egress_port); + } + } else if (is_link_local_ipv6_mcast) { + mcast_ctr.count(eg_intr_md.egress_port); + link_local_mcast_ctr.count(eg_intr_md.egress_port); + } else { + unicast_ctr.count(eg_intr_md.egress_port); + } +#else /* MULTICAST */ + // Non-multicast counter increments if (is_link_local_ipv6_mcast) { link_local_mcast_ctr.count(eg_intr_md.egress_port); - } else if (hdr.geneve.isValid()) { - external_mcast_ctr.count(eg_intr_md.egress_port); - } else if (hdr.geneve.isValid() && - hdr.geneve_opts.oxg_mcast.isValid() && - hdr.geneve_opts.oxg_mcast.mcast_tag == MULTICAST_TAG_UNDERLAY) { - underlay_mcast_ctr.count(eg_intr_md.egress_port); + } else { + unicast_ctr.count(eg_intr_md.egress_port); } - } else { - // non-multicast packets should bypass the egress - // pipeline, so we would expect this to be 0. 
- unicast_ctr.count(eg_intr_md.egress_port); +#endif /* MULTICAST */ } } -#else /* MULTICAST */ - apply { } -#endif /* MULTICAST */ } control EgressDeparser( @@ -2421,9 +2418,9 @@ control EgressDeparser( } pkt.emit(hdr); } -#else +#else /* MULTICAST */ apply { pkt.emit(hdr); } -#endif +#endif /* MULTICAST */ } Pipeline( diff --git a/dpd/src/counters.rs b/dpd/src/counters.rs index c1c46dfc..2f20b112 100644 --- a/dpd/src/counters.rs +++ b/dpd/src/counters.rs @@ -50,23 +50,22 @@ pub fn get_counter_ids() -> Vec { let mut base = vec![ CounterId::Service, CounterId::Ingress, - CounterId::Egress, CounterId::Packet, CounterId::DropPort, CounterId::DropReason, + CounterId::Forwarded, + CounterId::Unicast, + CounterId::MulticastLL, + CounterId::EgressDropPort, + CounterId::EgressDropReason, ]; let mut multicast; #[cfg(feature = "multicast")] { multicast = vec![ - CounterId::Multicast(MulticastCounterId::EgressDropPort), - CounterId::Multicast(MulticastCounterId::EgressDropReason), - CounterId::Multicast(MulticastCounterId::Unicast), CounterId::Multicast(MulticastCounterId::Multicast), CounterId::Multicast(MulticastCounterId::MulticastExt), - CounterId::Multicast(MulticastCounterId::MulticastLL), CounterId::Multicast(MulticastCounterId::MulticastUL), - CounterId::Multicast(MulticastCounterId::MulticastDrop), ]; } #[cfg(not(feature = "multicast"))] @@ -325,24 +324,21 @@ pub async fn get_values( let key = match counter_id { CounterId::Packet => packet_label(idx.idx), CounterId::Service => service_label(idx.idx as u8), - CounterId::Ingress | CounterId::Egress | CounterId::DropPort => { - port_label(switch, idx.idx).await + CounterId::Ingress + | CounterId::DropPort + | CounterId::Forwarded + | CounterId::Unicast + | CounterId::MulticastLL + | CounterId::EgressDropPort => port_label(switch, idx.idx).await, + CounterId::DropReason | CounterId::EgressDropReason => { + reason_label(idx.idx as u8)? 
} - CounterId::DropReason => reason_label(idx.idx as u8)?, #[cfg(feature = "multicast")] - CounterId::Multicast(MulticastCounterId::EgressDropPort) - | CounterId::Multicast(MulticastCounterId::Unicast) - | CounterId::Multicast(MulticastCounterId::Multicast) + CounterId::Multicast(MulticastCounterId::Multicast) | CounterId::Multicast(MulticastCounterId::MulticastExt) - | CounterId::Multicast(MulticastCounterId::MulticastLL) - | CounterId::Multicast(MulticastCounterId::MulticastUL) - | CounterId::Multicast(MulticastCounterId::MulticastDrop) => { + | CounterId::Multicast(MulticastCounterId::MulticastUL) => { port_label(switch, idx.idx).await } - #[cfg(feature = "multicast")] - CounterId::Multicast(MulticastCounterId::EgressDropReason) => { - reason_label(idx.idx as u8)? - } }; if let Some(key) = key { diff --git a/dpd/src/freemap.rs b/dpd/src/freemap.rs index a782d585..e8d53c06 100644 --- a/dpd/src/freemap.rs +++ b/dpd/src/freemap.rs @@ -83,6 +83,8 @@ pub struct FreeMap { log: slog::Logger, // Has the FreeMap been initialized yet? initted: bool, + // Lowest allocatable slot in the managed range + low: u16, // Size of the range being managed size: u16, // Caches of freed ranges, collected by size. The contents of a bin are not @@ -104,6 +106,7 @@ impl FreeMap { FreeMap { log, initted: false, + low: 0, size: 0, recycle_bins: BTreeMap::new(), freelist: Vec::new(), @@ -112,9 +115,16 @@ impl FreeMap { // Initialize the FreeMap if it hasn't already been initialized pub fn maybe_init(&mut self, size: u16) { + self.maybe_init_with_low(0, size); + } + + // Initialize the FreeMap if it hasn't already been initialized, allowing + // callers to reserve a prefix of the index space. + pub fn maybe_init_with_low(&mut self, low: u16, size: u16) { if !self.initted { - debug!(self.log, "initted freemap. size: {size}"); + debug!(self.log, "initted freemap. 
low: {low}, size: {size}"); self.initted = true; + self.low = low; self.size = size; self.reset(); } @@ -126,7 +136,7 @@ impl FreeMap { pub fn reset(&mut self) { debug!(self.log, "reset freemap"); self.recycle_bins = BTreeMap::new(); - self.freelist = vec![Span::new(0, self.size)]; + self.freelist = vec![Span::new(self.low, self.low + self.size)]; } // Allocate a range of the given size. On success, this call returns the diff --git a/dpd/src/link.rs b/dpd/src/link.rs index 3834a69d..ac6d4cb4 100644 --- a/dpd/src/link.rs +++ b/dpd/src/link.rs @@ -1617,6 +1617,7 @@ fn set_mac_config( mac::mcast_mac_set(switch, asic_id, mac)?; mcast::mcast_egress::add_port_mapping_entry(switch, asic_id)?; } + Ok(()) } @@ -1628,6 +1629,7 @@ fn clear_mac_config(switch: &Switch, asic_id: AsicId) -> DpdResult<()> { mac::mcast_mac_clear(switch, asic_id)?; mcast::mcast_egress::del_port_mapping_entry(switch, asic_id)?; } + Ok(()) } diff --git a/dpd/src/macaddrs.rs b/dpd/src/macaddrs.rs index 7acff4e6..79945597 100644 --- a/dpd/src/macaddrs.rs +++ b/dpd/src/macaddrs.rs @@ -427,7 +427,7 @@ impl Switch { assert_eq!(mgr.set_base_mac(temp_mac)?, None); } - // Reset ingress and egress MAC tables and Port ID table(s). + // Reset egress MAC table and Port ID table(s). 
mac::reset(self)?; #[cfg(feature = "multicast")] { diff --git a/dpd/src/main.rs b/dpd/src/main.rs index a2b37ddc..cad4cbf3 100644 --- a/dpd/src/main.rs +++ b/dpd/src/main.rs @@ -285,7 +285,8 @@ impl Switch { &log, config.asic_config.xcvr_iface.as_deref(), ); - let route_data = route::init(&log); + let route_data = route::init(&log, &asic_hdl) + .context("failed to initialize route state")?; let mac_mgmt = Mutex::new(macaddrs::MacManagement::new(&log)); #[cfg(feature = "tofino_asic")] diff --git a/dpd/src/mcast/mod.rs b/dpd/src/mcast/mod.rs index 5a685415..4417d8ae 100644 --- a/dpd/src/mcast/mod.rs +++ b/dpd/src/mcast/mod.rs @@ -457,9 +457,8 @@ pub(crate) fn add_group_internal( let underlay_group_id = scoped_underlay_id.id(); let mut added_members = Vec::new(); - // Only configure replication if there are members let replication_info = if !group_info.members.is_empty() { - let replication_info = configure_replication(external_group_id); + let replication_info = build_replication_info(external_group_id); add_ports_to_groups( s, @@ -826,7 +825,6 @@ pub(crate) fn modify_group_internal( ); let sources = None; - // Configure replication based on member count transitions let replication_info = match ( new_group_info.members.is_empty(), &group_entry.replication_info, @@ -856,17 +854,10 @@ pub(crate) fn modify_group_internal( None } (false, None) => { - // Transition from empty to members - configure replication - Some(configure_replication(group_entry.external_group_id())) - } - (false, Some(_)) => { - // Already has members and replication - keep existing - group_entry.replication_info.clone() - } - (true, None) => { - // Already empty and no replication - keep none - None + Some(build_replication_info(group_entry.external_group_id())) } + (false, Some(_)) => group_entry.replication_info.clone(), + (true, None) => None, }; // Early return for no-replication case -> just update metadata @@ -1578,7 +1569,7 @@ fn process_membership_changes( /// Default level exclusion IDs to 
0 for internal groups /// since they can only be configured internally without API calls. -fn configure_replication( +fn build_replication_info( external_group_id: MulticastGroupId, ) -> MulticastReplicationInfo { MulticastReplicationInfo { diff --git a/dpd/src/route.rs b/dpd/src/route.rs index df50344a..8bc7e779 100644 --- a/dpd/src/route.rs +++ b/dpd/src/route.rs @@ -104,6 +104,53 @@ // - There is a lot of almost-duplicated code throughout the stack to support // both IPv4 and IPv6 routes. We should look at using traits and/or // generics to coalesce common functionality into shared implementations. +// +// TTL handling +// ------------ +// Routed packets arriving with TTL==1 (IPv4) or hop_limit==1 (IPv6) must +// not be forwarded normally. The dataplane generates an ICMP time-exceeded +// (v4) or ICMPv6 time-exceeded (v6) response back to the sender. The tree +// enforces this at multiple sites: +// +// - IPv4 unicast (route_ipv4.rs + sidecar.p4): compound exact-match key +// `(idx, route_ttl_is_1)` on the route table installs separate actions +// for the TTL==1 and TTL>1 rows. +// - IPv6 multicast (sidecar.p4, MulticastRouter6): inline `hop_limit==1` +// check in the apply block generates time-exceeded after the lookup. +// - IPv6 unicast (route_ipv6.rs + sidecar.p4): per-prefix `skip_ttl` bit +// on the `index` action gates an inline `ICMP_ERROR_SETUP` invocation +// in the apply block (same pattern as v6 multicast). +// +// Service port handling +// --------------------- +// Routes whose target is `PortId::Internal(_)` (the CPU/AUX/PCIe port that +// delivers to dpd userspace) forward packets even when TTL==1, bypassing +// the normal TTL exceeded handling. Delivery to the local switch's +// userspace is not "forwarding" in the RFC sense. The packet has reached +// its destination, so TTL/hop_limit semantics do not apply. Without this, +// external traffic addressed to a switch-internal service that arrives +// with TTL==1 (e.g. 
after a long path) would get an ICMP time-exceeded +// reply instead of being delivered to the userspace handler. +// +// The discriminator is the type-level `PortId::Internal(_)` variant. The +// runtime `asic_port_id` is an opaque dpd-internal value whose numeric +// mapping to the service port varies by build and model configuration. +// +// v4/v6 encoding asymmetry +// -------------------------- +// v4 uses a compound table key `(idx, route_ttl_is_1)` and installs +// different per-target actions per TTL class. This supports ECMP groups +// that mix service-port and non-service-port targets. +// +// v6 uses a per-prefix `skip_ttl` bit on the `index` action plus an +// inline TTL-exceeded branch in the apply block. The bit gates whether +// the inline path or the normal route table runs. +// +// Trade-off in the v6 encoding: v6 ECMP groups cannot mix service-port +// targets with non-service-port targets, because the per-prefix bit +// cannot represent both behaviors simultaneously. `replace_route_targets` +// rejects such mixed sets up front. v4 has no such restriction because +// its compound key discriminates per-target. use std::collections::BTreeMap; use std::convert::TryFrom; @@ -127,6 +174,28 @@ use oxnet::{IpNet, Ipv4Net, Ipv6Net}; const MAX_TARGETS_IPV4: usize = 32; const MAX_TARGETS_IPV6: usize = 32; +// IPv4 route indices map to two physical forward-table entries because the +// route table still keys on `(idx, route_ttl_is_1)` and installs distinct +// forwarding vs. ttl_exceeded rows. IPv6 routes map to a single entry. +// TTL=1 is handled inline in the v6 ingress apply block, gated by a +// per-prefix `skip_ttl` bit on the index action. +const ROUTE_FWD_ENTRIES_PER_ROUTE_V4: u32 = 2; +const ROUTE_FWD_ENTRIES_PER_ROUTE_V6: u32 = 1; + +/// Convert a P4 forward-table size to freemap size, given the number of +/// physical entries each logical route consumes. 
+fn freemap_size_from_table( + table_size: u32, + entries_per_route: u32, +) -> DpdResult { + let logical_routes = table_size / entries_per_route; + u16::try_from(logical_routes).map_err(|_| { + DpdError::Invalid(format!( + "route table size {table_size} exceeds maximum supported" + )) + }) +} + #[derive(Clone, Debug, PartialEq, Eq)] struct Route { tag: String, @@ -263,6 +332,35 @@ pub struct RouteData { } impl RouteData { + fn new(log: &slog::Logger, asic_hdl: &asic::Handle) -> DpdResult { + let mut v4_freemap = freemap::FreeMap::new(log, "route_ipv4"); + let mut v6_freemap = freemap::FreeMap::new(log, "route_ipv6"); + + let v4_table_size = + table::Table::new(asic_hdl, TableType::RouteFwdIpv4)?.size(); + let v6_table_size = + table::Table::new(asic_hdl, TableType::RouteFwdIpv6)?.size(); + + v4_freemap.maybe_init(freemap_size_from_table( + v4_table_size, + ROUTE_FWD_ENTRIES_PER_ROUTE_V4, + )?); + v6_freemap.maybe_init_with_low( + 1, + freemap_size_from_table( + v6_table_size, + ROUTE_FWD_ENTRIES_PER_ROUTE_V6, + )?, + ); + + Ok(RouteData { + v4: BTreeMap::new(), + v6: BTreeMap::new(), + v4_freemap, + v6_freemap, + }) + } + pub fn insert( &mut self, subnet: impl Into, @@ -364,18 +462,38 @@ fn finalize_route( entry.index, entry.slots, ), - IpNet::V6(subnet) => table::route_ipv6::add_route_index( - switch, - &subnet, - entry.index, - entry.slots, - ), + IpNet::V6(subnet) => { + // If any target for this prefix routes to the user-space + // service port, suppress the dataplane TTL=1 exception so + // userspace still receives the packet. Mixed sets are + // rejected upstream, so any/all here are equivalent. 
+ let skip_ttl = entry + .targets + .iter() + .any(|t| matches!(t.route.port_id, PortId::Internal(_))); + table::route_ipv6::add_route_index( + switch, + &subnet, + entry.index, + entry.slots, + skip_ttl, + ) + } } { Ok(_) => { route_data.insert(subnet, entry); Ok(()) } - Err(_) => cleanup_route(switch, route_data, None, entry), + Err(e) => { + // `cleanup_route` returns `Ok` unconditionally on + // best-effort cleanup. Swallowing it here would make the + // outer call appear to succeed when the LPM install + // actually failed, leaving dpd's in-memory route state + // and the P4 tables out of sync. Free the resources but + // propagate the original error. + let _ = cleanup_route(switch, route_data, None, entry); + Err(e) + } } } @@ -421,6 +539,30 @@ fn replace_route_targets( return Ok(()); } + // v6 prefixes drive TTL=1 handling per-prefix via the `skip_ttl` bit + // on the index action. Mixed sets (some targets routing to the + // service port, some not) would cause hash-selected non-service + // targets to silently skip the dataplane TTL exception. Reject up + // front so the bit's semantics stay coherent across all ECMP members. + // Discriminate by `PortId::Internal`, which is the type-level CPU + // port identity. The `asic_port_id` is an opaque dpd-internal + // value whose mapping to the service port varies by build. + if subnet.is_ipv6() { + let any_service = targets + .iter() + .any(|t| matches!(t.route.port_id, PortId::Internal(_))); + let all_service = targets + .iter() + .all(|t| matches!(t.route.port_id, PortId::Internal(_))); + if any_service && !all_service { + let _ = finalize_route(switch, route_data, subnet, old_entry); + return Err(DpdError::InvalidRoute(format!( + "ipv6 prefix {subnet}: ECMP targets cannot mix the \ + service port with normal egress ports" + ))); + } + } + // Allocate space in the p4 table for the new set of targets. 
let slots = targets.len() as u8; let is_ipv4 = subnet.is_ipv4(); @@ -448,6 +590,7 @@ fn replace_route_targets( let mut idx = new_entry.index; for target in targets { + let is_service = matches!(target.route.port_id, PortId::Internal(_)); if let Err(e) = match target.route.tgt_ip { IpAddr::V4(tgt_ip) => table::route_ipv4::add_route_target( switch, @@ -455,6 +598,7 @@ fn replace_route_targets( target.asic_port_id, tgt_ip, target.route.vlan_id, + is_service, ), IpAddr::V6(tgt_ip) => { if subnet.is_ipv4() { @@ -464,6 +608,7 @@ fn replace_route_targets( target.asic_port_id, tgt_ip, target.route.vlan_id, + is_service, ) } else { table::route_ipv6::add_route_target( @@ -516,19 +661,10 @@ fn add_route_locked( ) -> DpdResult<()> { info!(switch.log, "adding route {subnet} -> {:?}", route.tgt_ip); - // Verify that the slot freelist has been initialized - let max_targets; - if subnet.is_ipv4() { - max_targets = MAX_TARGETS_IPV4; - route_data - .v4_freemap - .maybe_init(switch.table_size(TableType::RouteFwdIpv4)? as u16); - } else { - max_targets = MAX_TARGETS_IPV6; - route_data - .v6_freemap - .maybe_init(switch.table_size(TableType::RouteFwdIpv6)? as u16); - } + // Freemap sizing is established during RouteData construction so the + // delete/recovery paths can safely run before the first add. 
+ let max_targets = + if subnet.is_ipv4() { MAX_TARGETS_IPV4 } else { MAX_TARGETS_IPV6 }; // Get the old set of targets that we'll be adding to let mut targets = @@ -587,7 +723,7 @@ async fn set_route( if entry.targets.len() == 1 && entry.targets[0].route == route { Ok(()) } else if !replace { - Err(DpdError::Exists("route {cidr} already exists".into())) + Err(DpdError::Exists(format!("route {subnet} already exists"))) } else { info!(switch.log, "replacing subnet {subnet}"); let target = vec![NextHop { asic_port_id, route }]; @@ -904,11 +1040,9 @@ pub async fn reset(switch: &Switch) -> DpdResult<()> { Ok(()) } -pub fn init(log: &slog::Logger) -> RouteData { - RouteData { - v4: BTreeMap::new(), - v6: BTreeMap::new(), - v4_freemap: freemap::FreeMap::new(log, "route_ipv4"), - v6_freemap: freemap::FreeMap::new(log, "route_ipv6"), - } +pub fn init( + log: &slog::Logger, + asic_hdl: &asic::Handle, +) -> DpdResult { + RouteData::new(log, asic_hdl) } diff --git a/dpd/src/table/arp_ipv4.rs b/dpd/src/table/arp_ipv4.rs index 91ca2beb..50d5ddde 100644 --- a/dpd/src/table/arp_ipv4.rs +++ b/dpd/src/table/arp_ipv4.rs @@ -17,7 +17,7 @@ use aal::{ActionParse, MatchParse}; #[derive(MatchParse, Hash)] struct MatchKey { - #[match_xlate(name = "nexthop_ipv4")] + #[match_xlate(name = "nexthop")] ip: Ipv4Addr, } diff --git a/dpd/src/table/mcast/mcast_replication.rs b/dpd/src/table/mcast/mcast_replication.rs index 15274aec..20cc4b77 100644 --- a/dpd/src/table/mcast/mcast_replication.rs +++ b/dpd/src/table/mcast/mcast_replication.rs @@ -5,6 +5,9 @@ // Copyright 2026 Oxide Computer Company //! Table operations for multicast replication information. +//! +//! Only IPv6 replication groups are managed here. IPv4 multicast does not +//! require replication group tracking. 
use dpd_types::table; use std::net::Ipv6Addr; diff --git a/dpd/src/table/mcast/mcast_route.rs b/dpd/src/table/mcast/mcast_route.rs index cfa925d3..e383f459 100644 --- a/dpd/src/table/mcast/mcast_route.rs +++ b/dpd/src/table/mcast/mcast_route.rs @@ -13,7 +13,6 @@ //! VLAN-based access control (preventing VLAN translation) is handled by NAT //! ingress tables before encapsulation, not by route tables. -use dpd_types::table; use std::net::{Ipv4Addr, Ipv6Addr}; use aal::ActionParse; @@ -101,7 +100,7 @@ pub(crate) fn del_ipv4_entry(s: &Switch, route: Ipv4Addr) -> DpdResult<()> { pub(crate) fn ipv4_table_dump( s: &Switch, from_hardware: bool, -) -> DpdResult { +) -> DpdResult { s.table_dump::( TableType::RouteIpv4Mcast, from_hardware, @@ -112,7 +111,7 @@ pub(crate) fn ipv4_table_dump( pub(crate) fn ipv4_counter_fetch( s: &Switch, force_sync: bool, -) -> DpdResult> { +) -> DpdResult> { s.counter_fetch::(force_sync, TableType::RouteIpv4Mcast) } @@ -200,7 +199,7 @@ pub(crate) fn del_ipv6_entry(s: &Switch, route: Ipv6Addr) -> DpdResult<()> { pub(crate) fn ipv6_table_dump( s: &Switch, from_hardware: bool, -) -> DpdResult { +) -> DpdResult { s.table_dump::( TableType::RouteIpv6Mcast, from_hardware, @@ -211,7 +210,7 @@ pub(crate) fn ipv6_table_dump( pub(crate) fn ipv6_counter_fetch( s: &Switch, force_sync: bool, -) -> DpdResult> { +) -> DpdResult> { s.counter_fetch::(force_sync, TableType::RouteIpv6Mcast) } diff --git a/dpd/src/table/mod.rs b/dpd/src/table/mod.rs index 3afbd8cc..15a7cad4 100644 --- a/dpd/src/table/mod.rs +++ b/dpd/src/table/mod.rs @@ -16,7 +16,7 @@ use aal::ActionParse; use aal::MatchParse; use aal::TableOps; use common::table::TableType; -use dpd_types::table as dpd_table; +use dpd_types::table as views; pub mod arp_ipv4; pub mod attached_subnet_v4; @@ -214,7 +214,7 @@ pub fn get_entries( switch: &Switch, name: String, from_hardware: bool, -) -> DpdResult { +) -> DpdResult { match TableType::try_from(name.as_str())? 
{ TableType::RouteIdxIpv4 => { route_ipv4::index_dump(switch, from_hardware) @@ -309,7 +309,7 @@ pub fn get_counters( switch: &Switch, force_sync: bool, name: String, -) -> DpdResult> { +) -> DpdResult> { match TableType::try_from(name.as_str())? { TableType::RouteIdxIpv4 => { route_ipv4::index_counter_fetch(switch, force_sync) @@ -381,6 +381,10 @@ pub fn get_counters( mcast::mcast_route::ipv6_counter_fetch(switch, force_sync) } #[cfg(feature = "multicast")] + TableType::PortMacAddressMcast => { + mac::mcast_counter_fetch(switch, force_sync) + } + #[cfg(feature = "multicast")] TableType::McastEgressDecapPorts => { mcast::mcast_egress::bitmap_counter_fetch(switch, force_sync) } @@ -388,10 +392,6 @@ pub fn get_counters( TableType::McastEgressPortMapping => { mcast::mcast_egress::port_mapping_counter_fetch(switch, force_sync) } - #[cfg(feature = "multicast")] - TableType::PortMacAddressMcast => { - mac::mcast_counter_fetch(switch, force_sync) - } x => Err(DpdError::Other(format!( "table {x} has no associated counters" ))), diff --git a/dpd/src/table/neighbor_ipv6.rs b/dpd/src/table/neighbor_ipv6.rs index 595553dc..9b2beaa4 100644 --- a/dpd/src/table/neighbor_ipv6.rs +++ b/dpd/src/table/neighbor_ipv6.rs @@ -19,7 +19,7 @@ use common::network::MacAddr; #[derive(MatchParse, Hash)] struct MatchKey { - #[match_xlate(name = "nexthop_ipv6")] + #[match_xlate(name = "nexthop")] ip: Ipv6Addr, } diff --git a/dpd/src/table/route_ipv4.rs b/dpd/src/table/route_ipv4.rs index e5b9d79e..025b3fd2 100644 --- a/dpd/src/table/route_ipv4.rs +++ b/dpd/src/table/route_ipv4.rs @@ -18,15 +18,17 @@ use oxnet::Ipv4Net; use slog::error; use slog::info; -// Used for indentifying entries in the index->route_data table +// Used for identifying entries in the index->route_data table #[derive(MatchParse, Hash, Debug)] struct IndexKey { #[match_xlate(type = "value")] idx: u16, + #[match_xlate(name = "route_ttl_is_1", type = "value")] + route_ttl_is_1: bool, } // Route entries stored in the 
index->route_data table -#[derive(ActionParse, Debug)] +#[derive(ActionParse, Debug, Clone, Copy)] enum RouteAction { #[action_xlate(name = "forward")] Forward { port: u16, nexthop: Ipv4Addr }, @@ -36,6 +38,8 @@ enum RouteAction { ForwardVlan { port: u16, nexthop: Ipv4Addr, vlan_id: u16 }, #[action_xlate(name = "forward_vlan_v6")] ForwardVlanV6 { port: u16, nexthop: Ipv6Addr, vlan_id: u16 }, + #[action_xlate(name = "ttl_exceeded")] + TtlExceeded, } // Used to identify entries in the route->index table @@ -66,17 +70,17 @@ pub fn add_route_index( match s.table_entry_add(TableType::RouteIdxIpv4, &match_key, &action_data) { Ok(()) => { info!(s.log, "added ipv4 route index"; - "route" => %cidr, - "index" => %idx, - "slots" => %slots); + "route" => %cidr, + "index" => %idx, + "slots" => %slots); Ok(()) } Err(e) => { error!(s.log, "failed to add ipv4 route index"; - "route" => %cidr, - "index" => %idx, - "slots" => %slots, - "error" => %e); + "route" => %cidr, + "index" => %idx, + "slots" => %slots, + "error" => %e); Err(e) } } @@ -90,21 +94,25 @@ pub fn delete_route_index(s: &Switch, cidr: &Ipv4Net) -> DpdResult<()> { .map(|_| info!(s.log, "deleted ipv4 index"; "route" => %cidr)) .map_err(|e| { error!(s.log, "failed to delete ipv4 index"; - "route" => %cidr, - "error" => %e); + "route" => %cidr, + "error" => %e); e }) } -// Add a target into the route_data table at the given index +// Add a target into the route_data table at the given index. `is_service` +// reflects whether the route target is the CPU/service port, which the +// caller determines by checking `PortId::Internal(_)` on the target's +// port_id (the type-stable discriminator). 
pub fn add_route_target( s: &Switch, idx: u16, port: u16, nexthop: Ipv4Addr, vlan_id: Option, + is_service: bool, ) -> DpdResult<()> { - let match_key = IndexKey { idx }; + let match_key = IndexKey { idx, route_ttl_is_1: false }; let action_data = match vlan_id { None => RouteAction::Forward { port, nexthop }, Some(vlan_id) => { @@ -116,32 +124,70 @@ pub fn add_route_target( match s.table_entry_add(TableType::RouteFwdIpv4, &match_key, &action_data) { Ok(()) => { info!(s.log, "added ipv4 route entry"; - "index" => idx, - "port" => port, - "nexthop" => %nexthop, - "vlan_id" => ?vlan_id); - Ok(()) + "index" => idx, + "port" => port, + "nexthop" => %nexthop, + "vlan_id" => ?vlan_id); + add_ttl_entry(s, idx, &match_key, &action_data, is_service) } Err(e) => { error!(s.log, "failed to add ipv4 route entry"; - "index" => idx, - "port" => port, - "nexthop" => %nexthop, - "error" => %e); + "index" => idx, + "port" => port, + "nexthop" => %nexthop, + "error" => %e); Err(e) } } } -// Add a target into the route_data table at the given index +// Add the TTL==1 entry for a route target. +// +// For service port routes, we forward even when TTL==1 (bypassing ICMP +// TTL exceeded). For all other routes, we trigger TTL exceeded handling. +// This matches the P4 behavior: `ttl == 1 && !IS_SERVICE(fwd.port)`. 
+fn add_ttl_entry( + s: &Switch, + idx: u16, + forward_key: &IndexKey, + forward_action: &RouteAction, + is_service: bool, +) -> DpdResult<()> { + let ttl_match_key = IndexKey { idx, route_ttl_is_1: true }; + + let ttl_action = + if is_service { *forward_action } else { RouteAction::TtlExceeded }; + + if let Err(e) = + s.table_entry_add(TableType::RouteFwdIpv4, &ttl_match_key, &ttl_action) + { + error!(s.log, "failed to add ipv4 ttl entry"; + "index" => idx, + "error" => %e); + if let Err(cleanup_err) = + s.table_entry_del(TableType::RouteFwdIpv4, forward_key) + { + error!(s.log, "failed to clean up ipv4 route entry"; + "index" => idx, + "error" => %cleanup_err); + } + return Err(e); + } + Ok(()) +} + +// Add a target with IPv6 nexthop into the route_data table at the given index +// (used for v4-over-v6 routing). `is_service` is determined by the caller +// from the target's `PortId::Internal(_)` variant. pub fn add_route_target_v6( s: &Switch, idx: u16, port: u16, nexthop: Ipv6Addr, vlan_id: Option, + is_service: bool, ) -> DpdResult<()> { - let match_key = IndexKey { idx }; + let match_key = IndexKey { idx, route_ttl_is_1: false }; let action_data = match vlan_id { None => RouteAction::ForwardV6 { port, nexthop }, Some(vlan_id) => { @@ -152,36 +198,50 @@ pub fn add_route_target_v6( match s.table_entry_add(TableType::RouteFwdIpv4, &match_key, &action_data) { Ok(()) => { - info!(s.log, "added ipv4 route entry"; - "index" => idx, - "port" => port, - "nexthop" => %nexthop, - "vlan_id" => ?vlan_id); - Ok(()) + info!(s.log, "added ipv4 route entry (v6 nexthop)"; + "index" => idx, + "port" => port, + "nexthop" => %nexthop, + "vlan_id" => ?vlan_id); + add_ttl_entry(s, idx, &match_key, &action_data, is_service) } Err(e) => { - error!(s.log, "failed to add ipv4 route entry"; - "index" => idx, - "port" => port, - "nexthop" => %nexthop, - "error" => %e); + error!(s.log, "failed to add ipv4 route entry (v6 nexthop)"; + "index" => idx, + "port" => port, + "nexthop" => 
%nexthop, + "error" => %e); Err(e) } } } -// Remove the route data at the given index +// Remove the route data at the given index (both forward and ttl_exceeded entries). +// The main entry (route_ttl_is_1=false) and its TTL==1 companion are deleted in +// turn; a failure on either is logged as an error and returned. NOTE(review): the +// companion may be absent for routes that predate the compound key; confirm intent. pub fn delete_route_target(s: &Switch, idx: u16) -> DpdResult<()> { - let match_key = IndexKey { idx }; + // Delete the main entry first (route_ttl_is_1=false). + let main_key = IndexKey { idx, route_ttl_is_1: false }; + if let Err(e) = s.table_entry_del(TableType::RouteFwdIpv4, &main_key) { + error!(s.log, "failed to delete ipv4 route entry"; + "index" => %idx, + "error" => %e); + return Err(e); + } + info!(s.log, "deleted ipv4 route entry"; "index" => %idx); - s.table_entry_del(TableType::RouteFwdIpv4, &match_key) - .map(|_| info!(s.log, "deleted ipv4 route entry"; "index" => %idx)) - .map_err(|e| { - error!(s.log, "failed to delete ipv4 route entry"; - "index" => %idx, - "error" => %e); - e - }) + // Delete the TTL==1 companion entry. 
+ let ttl_key = IndexKey { idx, route_ttl_is_1: true }; + if let Err(e) = s.table_entry_del(TableType::RouteFwdIpv4, &ttl_key) { + error!(s.log, "failed to delete ipv4 route ttl==1 entry"; + "index" => %idx, + "error" => %e); + return Err(e); + } + info!(s.log, "deleted ipv4 route ttl==1 entry"; "index" => %idx); + + Ok(()) } pub fn forward_dump( @@ -220,14 +280,14 @@ pub fn reset(s: &Switch) -> DpdResult<()> { .map(|_| info!(s.log, "reset ipv4 route-index table")) .map_err(|e| { error!(s.log, "failed to clear ipv4 route-index table"; - "error" => %e); + "error" => %e); e })?; s.table_clear(TableType::RouteFwdIpv4) .map(|_| info!(s.log, "reset ipv4 route-data table")) .map_err(|e| { error!(s.log, "failed to clear ipv4 route-data table"; - "error" => %e); + "error" => %e); e }) } diff --git a/dpd/src/table/route_ipv6.rs b/dpd/src/table/route_ipv6.rs index 81dc2ec3..69409a4d 100644 --- a/dpd/src/table/route_ipv6.rs +++ b/dpd/src/table/route_ipv6.rs @@ -17,7 +17,7 @@ use oxnet::Ipv6Net; use slog::error; use slog::info; -// Used for indentifying entries in the index->route_data table +// Used for identifying entries in the index->route_data table #[derive(MatchParse, Hash, Debug)] struct IndexKey { #[match_xlate(type = "value")] @@ -25,7 +25,7 @@ struct IndexKey { } // Route entries stored in the index->route_data table -#[derive(ActionParse, Debug)] +#[derive(ActionParse, Debug, Clone, Copy)] enum RouteAction { #[action_xlate(name = "forward")] Forward { port: u16, nexthop: Ipv6Addr }, @@ -40,21 +40,27 @@ struct RouteKey { dst_addr: Ipv6Net, } -// Indexes stored in the route->index table +// Indexes stored in the route->index table. `skip_ttl` is set when any +// target for this prefix routes to the user-space service port, where the +// packet is delivered to the switch rather than forwarded onward. In that +// case the TTL=1 exception is suppressed so userspace still receives the +// packet. 
#[derive(ActionParse, Debug)] enum IndexAction { #[action_xlate(name = "index")] - Index { idx: u16, slots: u8 }, + Index { idx: u16, slots: u8, skip_ttl: u8 }, } -/// Add an entry to the route->index table +/// Add an entry to the route->index table. pub fn add_route_index( s: &Switch, cidr: &Ipv6Net, idx: u16, slots: u8, + skip_ttl: bool, ) -> DpdResult<()> { - let action_data = IndexAction::Index { idx, slots }; + let action_data = + IndexAction::Index { idx, slots, skip_ttl: skip_ttl as u8 }; let match_key = RouteKey { dst_addr: *cidr }; @@ -63,7 +69,8 @@ pub fn add_route_index( info!(s.log, "added ipv6 route index"; "route" => %cidr, "index" => %idx, - "slots" => %slots); + "slots" => %slots, + "skip_ttl" => %skip_ttl); Ok(()) } Err(e) => { @@ -71,13 +78,14 @@ pub fn add_route_index( "route" => %cidr, "index" => %idx, "slots" => %slots, + "skip_ttl" => %skip_ttl, "error" => %e); Err(e) } } } -/// Remove an entry from the route->index table +/// Remove an entry from the route->index table. pub fn delete_route_index(s: &Switch, cidr: &Ipv6Net) -> DpdResult<()> { let match_key = RouteKey { dst_addr: *cidr }; @@ -91,7 +99,9 @@ pub fn delete_route_index(s: &Switch, cidr: &Ipv6Net) -> DpdResult<()> { }) } -// Add a target into the route_data table at the given index +// Add a target into the route_data table at the given index. TTL=1 +// handling is governed at the prefix level by the `skip_ttl` bit on the +// index action, not per-target. pub fn add_route_target( s: &Switch, idx: u16, @@ -128,18 +138,21 @@ pub fn add_route_target( } } -// Remove the route data at the given index +/// Remove the route data at the given index. 
pub fn delete_route_target(s: &Switch, idx: u16) -> DpdResult<()> { let match_key = IndexKey { idx }; - - s.table_entry_del(TableType::RouteFwdIpv6, &match_key) - .map(|_| info!(s.log, "deleted ipv6 route entry"; "index" => %idx)) - .map_err(|e| { + match s.table_entry_del(TableType::RouteFwdIpv6, &match_key) { + Ok(_) => { + info!(s.log, "deleted ipv6 route entry"; "index" => %idx); + Ok(()) + } + Err(e) => { error!(s.log, "failed to delete ipv6 route entry"; "index" => %idx, "error" => %e); - e - }) + Err(e) + } + } } pub fn forward_dump( diff --git a/dpd/src/table/uplink.rs b/dpd/src/table/uplink.rs index d8f04abd..9b47a557 100644 --- a/dpd/src/table/uplink.rs +++ b/dpd/src/table/uplink.rs @@ -28,7 +28,7 @@ enum IngressAction { #[derive(MatchParse, Debug, Hash)] struct EgressMatchKey { - #[match_xlate(name = "ucast_egress_port")] + #[match_xlate(name = "egress_port")] out_port: u16, } diff --git a/packet/src/eth.rs b/packet/src/eth.rs index 0cff359a..4cbdca8d 100644 --- a/packet/src/eth.rs +++ b/packet/src/eth.rs @@ -2,7 +2,7 @@ // License, v. 2.0. 
If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/ // -// Copyright 2025 Oxide Computer Company +// Copyright 2026 Oxide Computer Company use std::fmt; diff --git a/swadm/tests/counters.rs b/swadm/tests/counters.rs index f01f7e15..9c24bb88 100644 --- a/swadm/tests/counters.rs +++ b/swadm/tests/counters.rs @@ -37,30 +37,28 @@ fn test_p4_counter_list() { // Verify output is not empty and contains expected counter information assert!(!stdout.is_empty(), "Counter list output should not be empty"); - // Expected P4 counters from dpd/src/counters.rs COUNTERS array + // Expected P4 counters from dpd/src/counters.rs + // BASE_COUNTERS let base_counters = [ "Service", "Ingress", "Packet", - "Egress", "Ingress_Drop_Port", "Ingress_Drop_Reason", + "Forwarded", + "Unicast", + "Multicast_Link_Local", + "Egress_Drop_Port", + "Egress_Drop_Reason", ]; + // MULTICAST_COUNTERS #[cfg(not(feature = "multicast"))] - let multicast_counters = Vec::new(); + let multicast_counters: Vec<&str> = Vec::new(); #[cfg(feature = "multicast")] - let multicast_counters = vec![ - "Egress_Drop_Port", - "Egress_Drop_Reason", - "Unicast", - "Multicast", - "Multicast_External", - "Multicast_Link_Local", - "Multicast_Underlay", - "Multicast_Drop", - ]; + let multicast_counters = + vec!["Multicast", "Multicast_External", "Multicast_Underlay"]; // Verify all expected counters are present in the output for counter in base_counters.iter().chain(multicast_counters.iter()) { diff --git a/tools/veth_setup.sh b/tools/veth_setup.sh index 1fb1195d..a3602425 100755 --- a/tools/veth_setup.sh +++ b/tools/veth_setup.sh @@ -4,7 +4,7 @@ # License, v. 2.0. 
If a copy of the MPL was not distributed with this # file, You can obtain one at https://mozilla.org/MPL/2.0/ # -# Copyright 2025 Oxide Computer Company +# Copyright 2026 Oxide Computer Company function config_veth() { /usr/bin/ip link set dev $1 mtu 10240 up @@ -19,7 +19,9 @@ function config_veth() { function add_port() { veth0="veth$(($1*2))" veth1="veth$(($1*2+1))" - echo Adding $veth0 and $veth1 for port $1 + if [ "${VETH_VERBOSE:-1}" -eq 1 ]; then + echo "Adding $veth0 and $veth1 for port $1" + fi if ! /usr/bin/ip link show $veth0 &> /dev/null; then /usr/bin/ip link add name $veth0 type veth peer name $veth1 &> /dev/null @@ -39,7 +41,11 @@ else ports=16 fi -echo "building veths for $ports ports" +if [ "${VETH_VERBOSE:-1}" -eq 1 ]; then + echo "building veths for $ports ports" +else + echo "veth setup: ports 0..$ports plus 125" +fi port_list="`seq 0 $ports` 125" for port in $port_list; do