From 0644feb74c68e024645e59738f30b5561c2b5178 Mon Sep 17 00:00:00 2001 From: Steve NCA Date: Tue, 2 Jun 2026 12:47:23 +0000 Subject: [PATCH] 0.8.0-dev6: sanitize whitespace in NATS subject target parts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hot-fix to dev5, caught seconds after the dev5 deploy on cpn-ful-netcortex1. Meraki MS switches expose interface names like 'Port 3' / 'Port 4' with literal spaces. NATS forbids whitespace in subjects, so the publisher's validator (correctly) rejected every such event, dropping link-state events for every Meraki MS port: snmp.link_state.publish_failed device=cpn-arlington-ms1 interface=Port 3 error=sensory subject target part 0='cpn-arlington-ms1|Port 3' contains whitespace The fix collapses any whitespace run in each target part to a single '_' at the SensoryPublisher boundary, so every future publisher (Meraki webhook, SNMP trap, gNMI dial-out) inherits the behavior without each one re-implementing it. The original identifier is preserved verbatim in the payload's interface/device/target fields, so consumers that need the human-readable name still have it. Dots in target parts remain a hard error — they are the NATS token separator and silently splitting would mask a programmer bug. Tests: 3 new cases (sanitization, multi-run collapse, dots-still-reject). Co-authored-by: Cursor --- CHANGELOG.md | 43 ++++++++++++++ netcortex/__init__.py | 2 +- netcortex/thalamus/sensory_publisher.py | 33 ++++++++++- pyproject.toml | 2 +- tests/thalamus/test_sensory_publisher.py | 72 ++++++++++++++++++++++++ 5 files changed, 148 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 512318f..3ee0277 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,49 @@ and this file MUST be updated together whenever `__version__` changes. --- +## [0.8.0-dev6] — Sanitize whitespace in NATS subject target parts + +Hot-fix to dev5, caught within seconds of the dev5 deploy on +`cpn-ful-netcortex1`. Meraki MS switches expose interface names like +`"Port 3"`, `"Port 4"` with literal spaces — NATS subjects forbid +whitespace, so the publisher's validator (correctly) rejected every +such event and the worker log filled with: + +``` +snmp.link_state.publish_failed device=cpn-arlington-ms1 interface=Port 3 + error=sensory subject target part 0='cpn-arlington-ms1|Port 3' + contains whitespace; NATS subjects must be whitespace-free +``` + +The fix sanitizes whitespace at the `SensoryPublisher` boundary so every +future publisher (Meraki webhook, SNMP trap, gNMI dial-out) inherits +the behavior without each one re-implementing it. + +### Changed +- `SensoryPublisher.publish` now collapses any whitespace run in each + target part to a single `_` before building the subject. The original + identifier is preserved verbatim in the payload's `interface` / + `device` / `target` fields, so consumers that need the human-readable + name still have it. Example: `Port 3` → subject token `Port_3`, + payload `interface: "Port 3"`. + +### Unchanged on purpose +- Dots in target parts remain a hard error (they are the NATS token + separator; silently splitting would mask a programmer bug). +- The `sensory_subject()` validator and all other Protocol-level + rejections are unchanged. + +### Tests +- Added 3 cases to `tests/thalamus/test_sensory_publisher.py`: + whitespace sanitization, multi-whitespace-run collapse, dots-still-reject. + +### Operational note +Once deployed, the previously-suppressed Meraki MS port events will +start flowing through the bus on the next SNMP poll cycle and (if any +are actually down) will produce `:ReflexEvent` nodes in Neo4j. + +--- + ## [0.8.0-dev5] — First sensory publisher: SNMP link-state → NATS → :ReflexEvent Closes the loop on the dev1–dev4 foundation. The bus has been live in diff --git a/netcortex/__init__.py b/netcortex/__init__.py index 7980e03..6536f62 100644 --- a/netcortex/__init__.py +++ b/netcortex/__init__.py @@ -22,4 +22,4 @@ ``CHANGELOG.md`` MUST be kept in sync whenever ``__version__`` changes. """ -__version__ = "0.8.0-dev5" +__version__ = "0.8.0-dev6" diff --git a/netcortex/thalamus/sensory_publisher.py b/netcortex/thalamus/sensory_publisher.py index 14dc347..e12f648 100644 --- a/netcortex/thalamus/sensory_publisher.py +++ b/netcortex/thalamus/sensory_publisher.py @@ -9,6 +9,11 @@ fails at startup not at runtime * tag the payload with ``recorded_at`` and ``source`` for downstream fusion / dedup without each publisher reinventing the same code +* sanitize whitespace in target parts — vendor identifiers like Meraki's + ``"Port 3"`` interface names contain spaces, which NATS subjects + forbid. We replace each whitespace run with ``_`` so the subject is + valid and the original identifier is preserved in the payload's + ``interface``/``target`` field. * emit a structured ``bus.published`` log line so an operator can see the full event flow without a debugger * swallow-and-log errors so a transient NATS hiccup doesn't crash the @@ -25,6 +30,7 @@ from __future__ import annotations import logging +import re from datetime import datetime, timezone from typing import Any @@ -33,6 +39,25 @@ _LOG = logging.getLogger(__name__) +# Matches one-or-more whitespace characters. Whitespace in NATS subjects +# is invalid; we collapse any run (regular space, tab, NBSP, etc.) to a +# single ``_`` so vendor identifiers like "Port 3" or "Tunnel 1" become +# valid subject tokens without losing the structure of the original +# identifier (operators can still read "Port_3" and recognize "Port 3"). +_WHITESPACE_RUN = re.compile(r"\s+") + + +def _sanitize_target_part(part: str) -> str: + """Make one target part NATS-subject-safe. + + Currently only collapses whitespace runs to ``_``. Dots are a hard + programmer error (they are the NATS token separator and would split + the target across tokens), so we deliberately do NOT silently + replace them — the :func:`sensory_subject` validator catches them + and raises, which is what we want. + """ + return _WHITESPACE_RUN.sub("_", part) + class SensoryPublisher: """Adapter-friendly facade over an :class:`EventBus`. @@ -76,9 +101,13 @@ async def publish( Programmer errors (unknown event_class, malformed target) raise immediately so they surface in unit tests rather than in - production logs. + production logs. Whitespace in target parts is **not** an error + — vendor identifiers commonly include spaces and we sanitize + them transparently. The original (un-sanitized) identifier is + preserved in the payload by the caller. """ - subject = sensory_subject(event_class, self._source, *target_parts) + sanitized_parts = tuple(_sanitize_target_part(p) for p in target_parts) + subject = sensory_subject(event_class, self._source, *sanitized_parts) full_payload = dict(payload or {}) # `source` echoed into the payload so downstream consumers that # don't parse the subject (or that re-emit on a derived subject) diff --git a/pyproject.toml b/pyproject.toml index 32788bd..1d68e86 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "netcortex" -version = "0.8.0.dev5" +version = "0.8.0.dev6" description = "The intelligence layer for your network — multi-dimensional graph of the network bridging Meraki, Catalyst Center, Intersight, and more with NetBox as SoT" readme = "README.md" requires-python = ">=3.12" diff --git a/tests/thalamus/test_sensory_publisher.py b/tests/thalamus/test_sensory_publisher.py index 7228b09..8a4b7bb 100644 --- a/tests/thalamus/test_sensory_publisher.py +++ b/tests/thalamus/test_sensory_publisher.py @@ -86,6 +86,78 @@ async def broken_publish(*args: Any, **kwargs: Any) -> None: await pub.publish("link_down", "r1|Gi0/1") +@pytest.mark.asyncio +async def test_publish_sanitizes_whitespace_in_target_parts() -> None: + """Vendor identifiers like Meraki's 'Port 3' must round-trip cleanly. + + Real bug caught in the dev5 deploy: NATS subjects forbid whitespace, + so without sanitization every Meraki MS port (Port 1, Port 2, ...) + drops its link-state events. The publisher collapses whitespace runs + to '_' so the subject is valid; the payload retains the original + interface name for consumers that care about the human-readable form. + """ + bus = InMemoryEventBus() + pub = SensoryPublisher(bus, source="snmp_poll") + seen: list[Any] = [] + + async def consume() -> None: + async for msg in bus.subscribe("sensory.link_down.>"): + seen.append(msg) + break + + import asyncio + task = asyncio.create_task(consume()) + await asyncio.sleep(0.01) + + await pub.publish( + "link_down", "cpn-arlington-ms1|Port 3", + payload={"device": "cpn-arlington-ms1", "interface": "Port 3"}, + ) + await asyncio.wait_for(task, timeout=1.0) + + assert seen[0].subject == "sensory.link_down.snmp_poll.cpn-arlington-ms1|Port_3" + # Original whitespace preserved in payload for human-readable display. + assert seen[0].payload["interface"] == "Port 3" + + +@pytest.mark.asyncio +async def test_publish_collapses_multiple_whitespace_runs() -> None: + """Tabs, NBSP, and multi-space runs all collapse to a single '_'.""" + bus = InMemoryEventBus() + pub = SensoryPublisher(bus, source="snmp_poll") + seen: list[Any] = [] + + async def consume() -> None: + async for msg in bus.subscribe("sensory.link_down.>"): + seen.append(msg) + break + + import asyncio + task = asyncio.create_task(consume()) + await asyncio.sleep(0.01) + + await pub.publish( + "link_down", "r1|MS130- 12X\tport", + ) + await asyncio.wait_for(task, timeout=1.0) + + assert seen[0].subject == "sensory.link_down.snmp_poll.r1|MS130-_12X_port" + + +@pytest.mark.asyncio +async def test_publish_still_rejects_dots_in_target_parts() -> None: + """Dots are the NATS token separator — they must remain a hard error. + + Whitespace gets sanitized, dots do NOT. A caller passing 'a.b' as + one target part almost certainly meant 'a' and 'b' as two parts + and silently merging would mask the bug. + """ + bus = InMemoryEventBus() + pub = SensoryPublisher(bus, source="snmp_poll") + with pytest.raises(ValueError, match="contains '.'"): + await pub.publish("link_down", "device.with.dots") + + @pytest.mark.asyncio async def test_publish_preserves_caller_supplied_source_and_recorded_at() -> None: """Caller-supplied source/recorded_at win over the defaults.