From fd7144211bd79f453649cd138e69336321a9dd8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1s=20Gr=C3=BCner?= <47506558+MegaRedHand@users.noreply.github.com> Date: Mon, 4 May 2026 19:50:12 -0300 Subject: [PATCH 1/6] feat(metrics): add lean_gossip_mesh_peers gauge Tracks the number of peers currently in the gossipsub mesh, broken down by client name (resolved via the existing peer-id name registry). Defined in leanMetrics PR #35. The gauge is refreshed on a 1s interval inside the swarm adapter loop because mesh membership is held in the gossipsub behaviour and changes implicitly during heartbeats (700ms) without dedicated graft/prune events. Reset-then-repopulate ensures clients that drop out of the mesh do not leave stale label counts. --- crates/net/p2p/src/metrics.rs | 26 ++++++++++++++++++++++++++ crates/net/p2p/src/swarm_adapter.rs | 19 +++++++++++++++++-- 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/crates/net/p2p/src/metrics.rs b/crates/net/p2p/src/metrics.rs index d6d2c29b..d038b187 100644 --- a/crates/net/p2p/src/metrics.rs +++ b/crates/net/p2p/src/metrics.rs @@ -68,6 +68,15 @@ static LEAN_PEER_DISCONNECTION_EVENTS_TOTAL: LazyLock = LazyLock: .unwrap() }); +static LEAN_GOSSIP_MESH_PEERS: LazyLock = LazyLock::new(|| { + register_int_gauge_vec!( + "lean_gossip_mesh_peers", + "Number of peers in the gossipsub mesh", + &["client"] + ) + .unwrap() +}); + // --- Gossip Message Size Histograms --- static LEAN_GOSSIP_BLOCK_SIZE_BYTES: LazyLock = LazyLock::new(|| { @@ -168,3 +177,20 @@ pub fn notify_peer_disconnected(peer_id: &Option, direction: &str, reaso let name = resolve(peer_id); LEAN_CONNECTED_PEERS.with_label_values(&[name]).dec(); } + +/// Refresh the gossipsub mesh peers gauge from the current mesh peer set. +/// +/// Called periodically by the swarm adapter. The gauge is reset before +/// re-population so client labels for peers no longer in the mesh drop +/// off rather than retaining stale counts. +pub fn update_gossip_mesh_peers<'a>(peers: impl Iterator) { + let mut counts: HashMap<&'static str, i64> = HashMap::new(); + for peer_id in peers { + let name = resolve(&Some(*peer_id)); + *counts.entry(name).or_insert(0) += 1; + } + LEAN_GOSSIP_MESH_PEERS.reset(); + for (name, count) in counts { + LEAN_GOSSIP_MESH_PEERS.with_label_values(&[name]).set(count); + } +} diff --git a/crates/net/p2p/src/swarm_adapter.rs b/crates/net/p2p/src/swarm_adapter.rs index 1def5406..240e0818 100644 --- a/crates/net/p2p/src/swarm_adapter.rs +++ b/crates/net/p2p/src/swarm_adapter.rs @@ -1,13 +1,21 @@ +use std::time::Duration; + use libp2p::{ Multiaddr, PeerId, StreamProtocol, futures::StreamExt, request_response::{self, OutboundRequestId}, swarm::SwarmEvent, }; -use tokio::sync::mpsc; +use tokio::{sync::mpsc, time::MissedTickBehavior}; use tracing::{error, warn}; -use crate::{Behaviour, BehaviourEvent, req_resp::Request, req_resp::Response}; +use crate::{Behaviour, BehaviourEvent, metrics, req_resp::Request, req_resp::Response}; + +/// Interval between gossipsub mesh peer metric refreshes. +/// +/// Slightly slower than the gossipsub heartbeat (700ms) so the gauge +/// reflects post-heartbeat mesh state with minimal polling overhead. +const MESH_METRIC_REFRESH_INTERVAL: Duration = Duration::from_secs(1); pub enum SwarmCommand { Publish { @@ -106,6 +114,8 @@ async fn swarm_loop( event_tx: mpsc::UnboundedSender>, mut cmd_rx: mpsc::UnboundedReceiver, ) { + let mut mesh_metric_tick = tokio::time::interval(MESH_METRIC_REFRESH_INTERVAL); + mesh_metric_tick.set_missed_tick_behavior(MissedTickBehavior::Skip); loop { tokio::select! { event = swarm.next() => { @@ -116,6 +126,11 @@ async fn swarm_loop( let Some(cmd) = cmd else { break }; execute_command(&mut swarm, cmd); } + _ = mesh_metric_tick.tick() => { + metrics::update_gossip_mesh_peers( + swarm.behaviour().gossipsub.all_mesh_peers(), + ); + } } } error!("Swarm adapter loop exited — P2P networking is no longer functional"); From 8033d3b307db0082c448bcd4cccdae82b62b9911 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1s=20Gr=C3=BCner?= <47506558+MegaRedHand@users.noreply.github.com> Date: Thu, 7 May 2026 11:29:23 -0300 Subject: [PATCH 2/6] chore: bump time between metric refreshes to 10s --- crates/net/p2p/src/swarm_adapter.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/crates/net/p2p/src/swarm_adapter.rs b/crates/net/p2p/src/swarm_adapter.rs index 240e0818..79ee03e8 100644 --- a/crates/net/p2p/src/swarm_adapter.rs +++ b/crates/net/p2p/src/swarm_adapter.rs @@ -12,10 +12,7 @@ use tracing::{error, warn}; use crate::{Behaviour, BehaviourEvent, metrics, req_resp::Request, req_resp::Response}; /// Interval between gossipsub mesh peer metric refreshes. -/// -/// Slightly slower than the gossipsub heartbeat (700ms) so the gauge -/// reflects post-heartbeat mesh state with minimal polling overhead. -const MESH_METRIC_REFRESH_INTERVAL: Duration = Duration::from_secs(1); +const MESH_METRIC_REFRESH_INTERVAL: Duration = Duration::from_secs(10); pub enum SwarmCommand { Publish { From df1aa5801e101ead6af80232be93b0af64bc32df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1s=20Gr=C3=BCner?= <47506558+MegaRedHand@users.noreply.github.com> Date: Thu, 7 May 2026 13:04:19 -0300 Subject: [PATCH 3/6] fix(metrics): avoid scrape-window gap on mesh peers refresh MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace IntGaugeVec::reset() + repopulate with set(0) on departed labels. The reset() variant cleared all child series in one registry write, but the subsequent set() calls were separate writes — a Prometheus scrape that landed in between observed an empty gauge, which would falsely trigger any "no mesh peers" alert. Existing children are enumerated via Collector::collect() so we don't have to track the previous label set ourselves. Also acquire the NODE_NAME_REGISTRY read lock once for the whole batch instead of per peer. --- crates/common/metrics/src/lib.rs | 4 +-- crates/net/p2p/src/metrics.rs | 46 +++++++++++++++++++------------- 2 files changed, 30 insertions(+), 20 deletions(-) diff --git a/crates/common/metrics/src/lib.rs b/crates/common/metrics/src/lib.rs index 9e508bbd..83539a2e 100644 --- a/crates/common/metrics/src/lib.rs +++ b/crates/common/metrics/src/lib.rs @@ -6,8 +6,8 @@ pub mod timing; // Re-export prometheus types and macros we use pub use prometheus::{ Encoder, Error as PrometheusError, Histogram, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, - TextEncoder, gather, register_histogram, register_int_counter, register_int_counter_vec, - register_int_gauge, register_int_gauge_vec, + TextEncoder, core::Collector, gather, register_histogram, register_int_counter, + register_int_counter_vec, register_int_gauge, register_int_gauge_vec, }; // Re-export commonly used items diff --git a/crates/net/p2p/src/metrics.rs b/crates/net/p2p/src/metrics.rs index d038b187..34b86e5d 100644 --- a/crates/net/p2p/src/metrics.rs +++ b/crates/net/p2p/src/metrics.rs @@ -8,8 +8,8 @@ use std::{ use ethlambda_metrics::*; use ethlambda_types::primitives::H256; use libp2p::{ + identity::{secp256k1, Keypair}, PeerId, - identity::{Keypair, secp256k1}, }; static NODE_NAME_REGISTRY: LazyLock>> = @@ -50,6 +50,15 @@ static LEAN_CONNECTED_PEERS: LazyLock = LazyLock::new(|| { .unwrap() }); +static LEAN_GOSSIP_MESH_PEERS: LazyLock = LazyLock::new(|| { + register_int_gauge_vec!( + "lean_gossip_mesh_peers", + "Number of peers in the gossipsub mesh", + &["client"] + ) + .unwrap() +}); + static LEAN_PEER_CONNECTION_EVENTS_TOTAL: LazyLock = LazyLock::new(|| { register_int_counter_vec!( "lean_peer_connection_events_total", @@ -68,15 +77,6 @@ static LEAN_PEER_DISCONNECTION_EVENTS_TOTAL: LazyLock = LazyLock: .unwrap() }); -static LEAN_GOSSIP_MESH_PEERS: LazyLock = LazyLock::new(|| { - register_int_gauge_vec!( - "lean_gossip_mesh_peers", - "Number of peers in the gossipsub mesh", - &["client"] - ) - .unwrap() -}); - // --- Gossip Message Size Histograms --- static LEAN_GOSSIP_BLOCK_SIZE_BYTES: LazyLock = LazyLock::new(|| { @@ -179,17 +179,27 @@ pub fn notify_peer_disconnected(peer_id: &Option, direction: &str, reaso } /// Refresh the gossipsub mesh peers gauge from the current mesh peer set. -/// -/// Called periodically by the swarm adapter. The gauge is reset before -/// re-population so client labels for peers no longer in the mesh drop -/// off rather than retaining stale counts. pub fn update_gossip_mesh_peers<'a>(peers: impl Iterator) { let mut counts: HashMap<&'static str, i64> = HashMap::new(); - for peer_id in peers { - let name = resolve(&Some(*peer_id)); - *counts.entry(name).or_insert(0) += 1; + { + let registry = NODE_NAME_REGISTRY.read().unwrap(); + for peer_id in peers { + let name = registry.get(peer_id).copied().unwrap_or("unknown"); + *counts.entry(name).or_default() += 1; + } + } + // Zero out client labels that were published before but aren't in the + // current mesh, by enumerating the gauge's existing children. + for family in LEAN_GOSSIP_MESH_PEERS.collect() { + for metric in family.get_metric() { + for label in metric.get_label() { + let value = label.value(); + if !counts.contains_key(value) { + LEAN_GOSSIP_MESH_PEERS.with_label_values(&[value]).set(0); + } + } + } } - LEAN_GOSSIP_MESH_PEERS.reset(); for (name, count) in counts { LEAN_GOSSIP_MESH_PEERS.with_label_values(&[name]).set(count); } From fe9eec3d4c37505c33d7570a01aa2a87376c863b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1s=20Gr=C3=BCner?= <47506558+MegaRedHand@users.noreply.github.com> Date: Thu, 7 May 2026 13:28:16 -0300 Subject: [PATCH 4/6] chore: cargo fmt --- crates/net/p2p/src/metrics.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/net/p2p/src/metrics.rs b/crates/net/p2p/src/metrics.rs index 34b86e5d..441cb52a 100644 --- a/crates/net/p2p/src/metrics.rs +++ b/crates/net/p2p/src/metrics.rs @@ -8,8 +8,8 @@ use std::{ use ethlambda_metrics::*; use ethlambda_types::primitives::H256; use libp2p::{ - identity::{secp256k1, Keypair}, PeerId, + identity::{Keypair, secp256k1}, }; static NODE_NAME_REGISTRY: LazyLock>> = From 4c7714e83671fb14af852b1931e6f11e21967c7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1s=20Gr=C3=BCner?= <47506558+MegaRedHand@users.noreply.github.com> Date: Thu, 7 May 2026 16:19:32 -0300 Subject: [PATCH 5/6] refactor(metrics): unify mesh peers refresh into a single set() pass Seed the counts map with 0 for previously-published labels so the final set() loop covers both current and departed clients in one pass, rather than walking the existing children twice. --- crates/net/p2p/src/metrics.rs | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/crates/net/p2p/src/metrics.rs b/crates/net/p2p/src/metrics.rs index 441cb52a..9ca79401 100644 --- a/crates/net/p2p/src/metrics.rs +++ b/crates/net/p2p/src/metrics.rs @@ -180,27 +180,24 @@ pub fn notify_peer_disconnected(peer_id: &Option, direction: &str, reaso /// Refresh the gossipsub mesh peers gauge from the current mesh peer set. pub fn update_gossip_mesh_peers<'a>(peers: impl Iterator) { - let mut counts: HashMap<&'static str, i64> = HashMap::new(); + let mut counts: HashMap = HashMap::new(); { let registry = NODE_NAME_REGISTRY.read().unwrap(); for peer_id in peers { let name = registry.get(peer_id).copied().unwrap_or("unknown"); - *counts.entry(name).or_default() += 1; + *counts.entry(name.to_string()).or_default() += 1; } } - // Zero out client labels that were published before but aren't in the - // current mesh, by enumerating the gauge's existing children. + // Seed previously-published labels with 0 so departed clients fall to + // zero in the single set() pass below. for family in LEAN_GOSSIP_MESH_PEERS.collect() { for metric in family.get_metric() { for label in metric.get_label() { - let value = label.value(); - if !counts.contains_key(value) { - LEAN_GOSSIP_MESH_PEERS.with_label_values(&[value]).set(0); - } + counts.entry(label.value().to_string()).or_insert(0); } } } for (name, count) in counts { - LEAN_GOSSIP_MESH_PEERS.with_label_values(&[name]).set(count); + LEAN_GOSSIP_MESH_PEERS.with_label_values(&[&name]).set(count); } } From 8709ed8e22b9d5101a481006eeb1559e0b38d310 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1s=20Gr=C3=BCner?= <47506558+MegaRedHand@users.noreply.github.com> Date: Thu, 7 May 2026 16:54:01 -0300 Subject: [PATCH 6/6] chore: cargo fmt --- crates/net/p2p/src/metrics.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/crates/net/p2p/src/metrics.rs b/crates/net/p2p/src/metrics.rs index 9ca79401..19845258 100644 --- a/crates/net/p2p/src/metrics.rs +++ b/crates/net/p2p/src/metrics.rs @@ -198,6 +198,8 @@ pub fn update_gossip_mesh_peers<'a>(peers: impl Iterator) { } } for (name, count) in counts { - LEAN_GOSSIP_MESH_PEERS.with_label_values(&[&name]).set(count); + LEAN_GOSSIP_MESH_PEERS + .with_label_values(&[&name]) + .set(count); } }