diff --git a/packages/device-connect-agent-tools/pyproject.toml b/packages/device-connect-agent-tools/pyproject.toml
index da69ac7..9b603d0 100644
--- a/packages/device-connect-agent-tools/pyproject.toml
+++ b/packages/device-connect-agent-tools/pyproject.toml
@@ -8,7 +8,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "device-connect-agent-tools"
-version = "0.2.3"
+version = "0.2.4"
 description = "Framework-agnostic tools for Device Connect — discover and invoke IoT devices over NATS/Zenoh"
 readme = "README.md"
 requires-python = ">=3.11"
diff --git a/packages/device-connect-edge/device_connect_edge/device.py b/packages/device-connect-edge/device_connect_edge/device.py
index c776d1a..b4d52d9 100644
--- a/packages/device-connect-edge/device_connect_edge/device.py
+++ b/packages/device-connect-edge/device_connect_edge/device.py
@@ -61,6 +61,7 @@ async def capture_image(self, resolution: str = "1080p") -> dict:
 import json
 import logging
 import os
+import random
 import re
 import time
 import uuid
@@ -90,6 +91,44 @@ async def capture_image(self, resolution: str = "1080p") -> dict:
 logger = logging.getLogger(__name__)
 
 
+def _env_float(name: str, default: float) -> float:
+    """Return env var *name* as a finite float, or *default* if unset, empty or unusable."""
+    try:
+        parsed = float(os.getenv(name) or "")
+    except ValueError:
+        return default
+    # float() also accepts "nan"/"inf", which are meaningless as a timeout or
+    # jitter bound (an infinite jitter would sleep forever): treat as garbage.
+    return parsed if abs(parsed) < float("inf") else default
+
+
+# Registration knobs. At fleet scale a 2s request timeout combined with N
+# phones starting in lockstep produces congestion collapse on the
+# registry: every queued-but-late reply triggers a retry that re-enters
+# the queue. A larger timeout lets the registry catch up before any
+# retry fires, and an up-front jitter spreads the initial herd so the
+# registry never sees a synchronized burst in the first place. Both are
+# env-tunable (jitter can be disabled by setting it to 0). The 2s jitter
+# default is a compromise: it decorrelates ~1000 devices into ~500/sec
+# (much better than lockstep) while staying tolerable for single-device
+# development. Operators at fleet scale should bump this via
+# DEVICE_CONNECT_REGISTER_JITTER=10 (or higher) to spread the herd
+# further.
+#
+# Lease-TTL interaction: the registry creates the etcd lease at the
+# moment _do_register runs, so if a slow registry takes ~timeout
+# seconds to reply, the lease can be near-expired before the heartbeat
+# loop emits its first beat (`run()` awaits _register before starting
+# the heartbeat task). With the 15s timeout default and the 15s `ttl`
+# default that race is real; it self-heals — the next heartbeat fires
+# `has_lease()=False` on the registry and triggers a requestRegistration
+# round-trip — but operators raising DEVICE_CONNECT_REGISTER_TIMEOUT
+# (or running a stressed registry where requests routinely take >ttl/3)
+# should raise `ttl` in lockstep or shorten `heartbeat_interval` so
+# the first beat lands inside the lease window.
+_REGISTER_REQUEST_TIMEOUT = _env_float("DEVICE_CONNECT_REGISTER_TIMEOUT", 15.0)
+_REGISTER_STARTUP_JITTER = _env_float("DEVICE_CONNECT_REGISTER_JITTER", 2.0)
+
 
 def build_rpc_response(id_: str, result: Any) -> bytes:
     return json.dumps({"jsonrpc": "2.0", "id": id_, "result": result}).encode()
@@ -974,6 +1013,19 @@ async def _register(self, force: bool = False) -> None:
                 self._logger.debug("Registration completed by another task, skipping")
                 return
 
+            # Spread the herd. With 1000+ phones spinning up in lockstep
+            # the registry sees a single synchronized burst that times
+            # out most callers and amplifies into a retry storm. A small
+            # randomized delay before the first request decorrelates the
+            # arrivals; subsequent retries already have exponential
+            # backoff so we only jitter once per _register call.
+            if _REGISTER_STARTUP_JITTER > 0:
+                jitter = random.uniform(0, _REGISTER_STARTUP_JITTER)
+                self._logger.debug(
+                    "Pre-registration jitter: sleeping %.2fs before first request", jitter,
+                )
+                await asyncio.sleep(jitter)
+
             delay = 1 # initial retry delay in seconds
             while True:
                 req_id = f"{self.device_id}-{int(time.time()*1000)}"
@@ -983,7 +1035,7 @@ async def _register(self, force: bool = False) -> None:
                     response_data = await self.messaging.request(
                         f"device-connect.{self.tenant}.registry",
                         json.dumps({"jsonrpc": "2.0", "id": req_id, "method": "registerDevice", "params": params}).encode(),
-                        timeout=2,
+                        timeout=_REGISTER_REQUEST_TIMEOUT,
                     )
                     self._handle_registration_reply(response_data)
                     # Note: device/online event is published by the registry service
@@ -1762,7 +1814,10 @@ async def _setup_agentic_driver(self) -> None:
         if not isinstance(self._driver, DeviceDriver):
             return
 
-        self._logger.info("Setting up DeviceDriver D2D capabilities")
+        self._logger.info(
+            "Setting up DeviceDriver inter-device messaging "
+            "(router, registry, @on subscriptions)"
+        )
 
         # Create and set D2D router (inline — no orchestration dependency).
         router = _RemoteInvoker(
@@ -1796,7 +1851,11 @@ async def _setup_agentic_driver(self) -> None:
 
         # Set up event subscriptions
         await self._driver.setup_subscriptions()
-        self._logger.info("DeviceDriver D2D setup complete")
+        registry_kind = "D2DRegistry" if self._d2d_mode else "RegistryClient"
+        self._logger.info(
+            "DeviceDriver inter-device messaging ready (registry=%s)",
+            registry_kind,
+        )
 
     async def _teardown_agentic_driver(self) -> None:
         """Teardown DeviceDriver subscriptions if applicable."""
@@ -1825,10 +1884,30 @@ async def _resubscribe_after_reconnect(self) -> None:
         Uses ``_subscription_lock`` to prevent concurrent invocations
         from rapid reconnects.
         """
-        if not self._subscription_lock.acquire_nowait():
+        # Review notes (do not re-litigate without reading these):
+        #
+        # 1. ``asyncio.Lock`` does NOT have ``acquire_nowait()``. That
+        #    was a latent bug in the original implementation — the
+        #    method exists on trio/anyio locks, not asyncio's. At fleet
+        #    scale during a reconnect storm it raised ``AttributeError``
+        #    on every reconnect and silently killed @on resubscription.
+        #    See commit 1716f8d.
+        #
+        # 2. The ``locked() then await acquire()`` pattern below looks
+        #    like a TOCTOU race but is safe under single-loop asyncio:
+        #    ``Lock.locked()`` is synchronous and ``Lock.acquire()``
+        #    has a fast path that returns without yielding when the
+        #    lock is free. Two concurrent callers cannot both observe
+        #    ``locked() is False`` between the check and the take
+        #    because there is no event-loop yield in that window.
+        #    If the lock is ever swapped for a non-asyncio primitive
+        #    (anyio, trio, threading) this reasoning no longer holds —
+        #    use that primitive's own non-blocking acquire instead.
+        if self._subscription_lock.locked():
             self._logger.debug("Subscription re-establishment already in progress, skipping")
             return
 
+        await self._subscription_lock.acquire()
         try:
             delay = 1
             while True:
diff --git a/packages/device-connect-edge/device_connect_edge/drivers/base.py b/packages/device-connect-edge/device_connect_edge/drivers/base.py
index 73a596b..93d2a00 100644
--- a/packages/device-connect-edge/device_connect_edge/drivers/base.py
+++ b/packages/device-connect-edge/device_connect_edge/drivers/base.py
@@ -1049,16 +1049,47 @@ async def wait_for_device(
     def _collect_event_subscriptions(self) -> List[Dict[str, Any]]:
         """Collect all @on decorated methods.
 
+        Scans single-underscore-prefixed methods as well as public ones so
+        drivers can keep ``@on`` handlers conventionally private without
+        them silently becoming no-ops. Dunders are still skipped.
+
         Returns:
             List of subscription definitions
+
+        Review notes (do not re-litigate without reading):
+        - Skipping all ``_``-prefixed attrs (the original behavior)
+          silently dropped ``@on async def _on_foo`` handlers — Python
+          convention puts callbacks behind ``_`` and drivers expected
+          that to work. Fixed in 0673652.
+        - The ``_is_event_subscription`` marker check below is the
+          authoritative filter; the name prefix is *only* used to skip
+          dunders so we don't resolve descriptors like ``__class__``.
         """
         subscriptions = []
 
+        # We iterate ``dir(self)`` rather than ``__dict__`` so handlers
+        # inherited from a base class are still picked up. The trade-off
+        # is that ``getattr`` here will invoke ``@property`` descriptors,
+        # which may have side effects on driver subclasses (the @on
+        # decorator only marks methods, but properties live in the same
+        # namespace). We swallow exceptions from the resolve step so a
+        # broken / lazy property never breaks subscription setup for an
+        # unrelated handler. ``inspect.getattr_static`` would avoid this
+        # entirely but also bypasses descriptors we *do* want resolved
+        # (classmethod / staticmethod) -- so dynamic ``getattr`` plus a
+        # narrow try/except is the right balance here.
         for attr_name in dir(self):
-            if attr_name.startswith("_"):
+            if attr_name.startswith("__"):
+                continue
+
+            try:
+                attr = getattr(self, attr_name, None)
+            except Exception:
+                # A property raised. Not a subscription candidate (the
+                # @on decorator marks methods, not descriptors) so skip
+                # silently rather than failing the whole driver.
                 continue
 
-            attr = getattr(self, attr_name, None)
             if attr is None or not callable(attr):
                 continue
 
@@ -1225,6 +1256,24 @@ async def _setup_subscription(self, sub: Dict[str, Any]) -> None:
 
         logger.info("[%s] Subscribing to: %s", self_id, subject)
 
+        # device_type filtering relies on the D2D peer cache to resolve the
+        # source device's type. In portal/registry mode there is no peer
+        # cache, so the cache miss path passes the event through unfiltered.
+        # Warn at setup (once per subscription, not per event) so subscribers
+        # don't silently see other device types; filter strictly in-handler.
+        if (
+            device_type
+            and not is_lifecycle
+            and getattr(self._device, "_d2d_collector", None) is None
+        ):
+            logger.warning(
+                "[%s] @on(device_type=%r) on %s: device_type filtering is "
+                "best-effort in registry/portal mode. The wildcard broker "
+                "subject delivers every device's matching event; add an "
+                "in-handler type check if you need strict filtering.",
+                self_id, device_type, subject,
+            )
+
         # Use subscribe_with_subject to get the matched subject in callback
         # This allows extracting device_id from wildcard subscriptions
         messaging_client = self._router._messaging
diff --git a/packages/device-connect-edge/device_connect_edge/registry_client.py b/packages/device-connect-edge/device_connect_edge/registry_client.py
index 91dad47..cabf789 100644
--- a/packages/device-connect-edge/device_connect_edge/registry_client.py
+++ b/packages/device-connect-edge/device_connect_edge/registry_client.py
@@ -30,15 +30,24 @@
 import asyncio
 import json
 import logging
+import os
 import time
 import uuid
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 from device_connect_edge.messaging.base import MessagingClient
 from device_connect_edge.messaging.exceptions import RequestTimeoutError
 
 logger = logging.getLogger(__name__)
 
+# Per-page chunk size when the client transparently iterates the full fleet.
+# 100 typical (~6 KB) device records is ~600 KB, under the default NATS
+# max_payload of 1 MB; the worst observed records (~10 KB each) reach ~1 MB,
+# so rich-schema fleets should lower DEVICE_CONNECT_LIST_PAGE_SIZE. Only a
+# plain positive decimal integer is honoured; anything else falls back to 100.
+_raw_page_size = os.getenv("DEVICE_CONNECT_LIST_PAGE_SIZE", "").strip()
+_DEFAULT_LIST_PAGE_SIZE = (int(_raw_page_size) if _raw_page_size.isdecimal() else 0) or 100
+
 
 class RegistryClient:
     """JSON-RPC client for the device registry service.
@@ -157,8 +166,103 @@ async def list_devices(
                     self._cache, device_type, location, capabilities,
                 )
 
+        # Page through the registry transparently so the wire never carries
+        # a fleet-sized reply (NATS default max_payload is 1 MB and was
+        # being exceeded at ~1400 devices). Older servers that don't
+        # understand ``limit`` just return everything in one reply with
+        # ``next_offset`` absent, so the loop exits after a single
+        # iteration — fully backward compatible.
+        devices: List[Dict[str, Any]] = []
+        offset = 0
+        while True:
+            page, next_offset, _total = await self._list_devices_page(
+                device_type=device_type,
+                location=location,
+                capabilities=capabilities,
+                offset=offset,
+                limit=_DEFAULT_LIST_PAGE_SIZE,
+                timeout=timeout,
+            )
+            devices.extend(page)
+            if next_offset is None:
+                break
+            # Defense-in-depth: a buggy or future server returning a
+            # non-advancing cursor would loop forever otherwise. Break
+            # with a warning so a fleet-scale incident becomes a
+            # recoverable log line.
+            if next_offset <= offset:
+                logger.warning(
+                    "Registry returned non-advancing next_offset=%s (current offset=%s); "
+                    "stopping page walk to avoid infinite loop",
+                    next_offset, offset,
+                )
+                break
+            offset = next_offset
+        logger.debug("Discovered %d devices from registry", len(devices))
+
+        # Update cache (store unfiltered if we fetched without filters)
+        if (
+            self._cache_ttl > 0
+            and device_type is None
+            and location is None
+            and not capabilities
+        ):
+            self._cache = devices
+            self._cache_time = time.time()
+
+        return devices
+
+    async def list_devices_page(
+        self,
+        *,
+        offset: int = 0,
+        limit: int = _DEFAULT_LIST_PAGE_SIZE,
+        device_type: Optional[str] = None,
+        location: Optional[str] = None,
+        capabilities: Optional[List[str]] = None,
+        timeout: Optional[float] = None,
+    ) -> Tuple[List[Dict[str, Any]], Optional[int], int]:
+        """Fetch a single page of devices with pagination metadata.
+
+        Use this when you want to display a paged UI or stream results;
+        most callers should stick with :meth:`list_devices`, which loops
+        internally and returns the full fleet.
+
+        Returns:
+            ``(devices, next_offset, total_matched)`` where ``next_offset``
+            is ``None`` on the final page.
+
+        ACL caveat:
+            When the registry has ACLs enabled, server-side ACL filtering
+            runs *after* slicing. As a result ``len(devices)`` for a
+            given page may be smaller than ``limit`` even when more
+            pages follow, and ``total_matched`` is the unfiltered total
+            (before the caller's ACL applies). UIs should treat
+            ``total_matched`` as an upper bound on what the caller will
+            ever see, and must use ``next_offset is None`` — never
+            ``len(devices) < limit`` — to detect the final page.
+        """
+        return await self._list_devices_page(
+            device_type=device_type,
+            location=location,
+            capabilities=capabilities,
+            offset=offset,
+            limit=limit,
+            timeout=timeout,
+        )
+
+    async def _list_devices_page(
+        self,
+        *,
+        device_type: Optional[str],
+        location: Optional[str],
+        capabilities: Optional[List[str]],
+        offset: int,
+        limit: int,
+        timeout: Optional[float],
+    ) -> Tuple[List[Dict[str, Any]], Optional[int], int]:
         subject = f"device-connect.{self._tenant}.discovery"
-        params: Dict[str, Any] = {}
+        params: Dict[str, Any] = {"offset": int(offset), "limit": int(limit)}
         if device_type:
             params["device_type"] = device_type
         if location:
@@ -167,20 +271,12 @@ async def list_devices(
             params["capabilities"] = capabilities
 
         result = await self._request(
-            subject,
-            "discovery/listDevices",
-            params if params else None,
-            timeout,
+            subject, "discovery/listDevices", params, timeout,
         )
         devices = result.get("devices", [])
-        logger.debug("Discovered %d devices from registry", len(devices))
-
-        # Update cache (store unfiltered if we fetched without filters)
-        if self._cache_ttl > 0 and not params:
-            self._cache = devices
-            self._cache_time = time.time()
-
-        return devices
+        next_offset = result.get("next_offset")
+        total = result.get("total_matched", len(devices))
+        return devices, next_offset, total
 
     async def get_device(
         self,
diff --git a/packages/device-connect-edge/pyproject.toml b/packages/device-connect-edge/pyproject.toml
index 8a7a733..2b4a056 100644
--- a/packages/device-connect-edge/pyproject.toml
+++ b/packages/device-connect-edge/pyproject.toml
@@ -8,7 +8,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "device-connect-edge"
-version = "0.2.3"
+version = "0.2.4"
 description = "Device Connect Edge — lightweight edge device runtime with Zenoh/NATS messaging and D2D communication"
 readme = "README.md"
 requires-python = ">=3.11"
diff --git a/packages/device-connect-edge/tests/test_device.py b/packages/device-connect-edge/tests/test_device.py
index 14d03fa..a0fb1b8 100644
--- a/packages/device-connect-edge/tests/test_device.py
+++ b/packages/device-connect-edge/tests/test_device.py
@@ -621,3 +621,57 @@ async def test_request_registration_returns_payload(self):
         assert "identity" in result
         assert "status" in result
         assert "ts" in result["status"]
+
+
+# ── Registration startup jitter ───────────────────────────────────
+
+class TestRegisterStartupJitter:
+    """The pre-registration jitter exists to decorrelate ~1000 phones
+    that boot in lockstep. ``DEVICE_CONNECT_REGISTER_JITTER=0`` is the
+    documented escape hatch for single-device dev (no sleep, no random
+    call); the tests pin both branches of that gate."""
+
+    def _make_runtime(self):
+        rt = DeviceRuntime(
+            driver=StubDriver(),
+            device_id="cam-jit-1",
+            messaging_urls=["nats://localhost:4222"],
+        )
+        rt.messaging = AsyncMock()
+        # _handle_registration_reply expects a valid reply; short-circuit.
+        rt._handle_registration_reply = lambda _data: None
+        rt.messaging.request = AsyncMock(
+            return_value=json.dumps({
+                "jsonrpc": "2.0", "id": "x",
+                "result": {"registration_id": "r1", "device_ttl": 30},
+            }).encode(),
+        )
+        return rt
+
+    @pytest.mark.asyncio
+    async def test_jitter_zero_skips_sleep_and_random(self):
+        rt = self._make_runtime()
+        with patch("device_connect_edge.device._REGISTER_STARTUP_JITTER", 0), \
+             patch("device_connect_edge.device.random.uniform") as mock_uniform, \
+             patch("device_connect_edge.device.asyncio.sleep", new_callable=AsyncMock) as mock_sleep:
+            await rt._register(force=True)
+
+        # JITTER=0 must not call random.uniform at all (this is the
+        # contract for single-device dev / deterministic tests).
+        mock_uniform.assert_not_called()
+        # asyncio.sleep is only called from the retry path; since the
+        # registry replied OK on the first try, sleep must not fire.
+        mock_sleep.assert_not_called()
+
+    @pytest.mark.asyncio
+    async def test_jitter_positive_sleeps_once_before_first_request(self):
+        rt = self._make_runtime()
+        with patch("device_connect_edge.device._REGISTER_STARTUP_JITTER", 4.0), \
+             patch("device_connect_edge.device.random.uniform", return_value=1.23) as mock_uniform, \
+             patch("device_connect_edge.device.asyncio.sleep", new_callable=AsyncMock) as mock_sleep:
+            await rt._register(force=True)
+
+        mock_uniform.assert_called_once_with(0, 4.0)
+        # Exactly one sleep (the jitter) — registry reply succeeded on
+        # the first try so the retry-backoff sleep doesn't fire.
+        mock_sleep.assert_awaited_once_with(1.23)
diff --git a/packages/device-connect-edge/tests/test_drivers.py b/packages/device-connect-edge/tests/test_drivers.py
index 9099930..b17b4df 100644
--- a/packages/device-connect-edge/tests/test_drivers.py
+++ b/packages/device-connect-edge/tests/test_drivers.py
@@ -392,6 +392,114 @@ async def disconnect(self):
         subs = driver._collect_event_subscriptions()
         assert len(subs) == 2
 
+    def test_underscore_prefixed_handler_is_still_collected(self):
+        """Single-underscore @on handlers must not silently become no-ops."""
+        class MyDriver(DeviceDriver):
+            device_type = "test"
+
+            @on(device_type="phone", event_name="state_changed")
+            async def _on_phone_state(self, device_id, event_name, payload):
+                pass
+
+            async def connect(self):
+                pass
+
+            async def disconnect(self):
+                pass
+
+        driver = MyDriver()
+        subs = driver._collect_event_subscriptions()
+        assert len(subs) == 1
+        assert subs[0]["device_type"] == "phone"
+        assert subs[0]["event_name"] == "state_changed"
+
+    def test_collector_survives_raising_property(self):
+        """A driver subclass with a @property that raises must not break
+        subscription collection. ``dir()`` surfaces every attribute on
+        the class, and ``getattr`` will invoke descriptors — a buggy or
+        lazy-init property would otherwise crash setup_subscriptions for
+        unrelated handlers."""
+        class MyDriver(DeviceDriver):
+            device_type = "test"
+
+            @property
+            def _not_ready_yet(self):
+                # Simulates a property that depends on connect() having
+                # run, or a hardware probe that fails until init.
+                raise RuntimeError("not ready")
+
+            @on(device_type="phone", event_name="state_changed")
+            async def on_phone_state(self, device_id, event_name, payload):
+                pass
+
+            async def connect(self):
+                pass
+
+            async def disconnect(self):
+                pass
+
+        driver = MyDriver()
+        subs = driver._collect_event_subscriptions()
+        # Property side-effect was tolerated; the real handler still
+        # registered.
+        assert len(subs) == 1
+        assert subs[0]["event_name"] == "state_changed"
+
+    @pytest.mark.asyncio
+    async def test_portal_mode_device_type_filter_warns_at_setup(self, caplog):
+        """In portal/registry mode there is no D2D peer cache to resolve
+        a source device's type, so ``@on(device_type=...)`` filtering
+        silently passes events from other device types through. The
+        driver must emit a single setup-time WARNING so the subscriber
+        sees the gotcha once, not on every event."""
+
+        class MyDriver(DeviceDriver):
+            device_type = "test"
+
+            @on(device_type="camera", event_name="motion")
+            async def on_motion(self, device_id, event_name, payload):
+                pass
+
+            async def connect(self):
+                pass
+
+            async def disconnect(self):
+                pass
+
+        driver = MyDriver()
+
+        mock_messaging = AsyncMock()
+        mock_messaging.subscribe_with_subject = AsyncMock(return_value=MagicMock())
+
+        class FakeRouter:
+            def __init__(self):
+                self._messaging = mock_messaging
+                self._tenant = "default"
+
+        # Portal/registry mode: _device is set, but _d2d_collector is None.
+        # Use a plain object — MagicMock would auto-generate
+        # ``_is_event_subscription`` truthy values and leak phantom
+        # subscriptions into _collect_event_subscriptions.
+        class FakeDevice:
+            _d2d_collector = None
+        driver._device = FakeDevice()
+        driver._device_id = "watcher-1"
+        driver._router = FakeRouter()
+
+        with caplog.at_level("WARNING", logger="device_connect_edge.drivers.base"):
+            await driver.setup_subscriptions()
+
+        warnings = [r for r in caplog.records if r.levelname == "WARNING"]
+        matching = [
+            r for r in warnings
+            if "device_type filtering is" in r.message
+            and "best-effort" in r.message
+        ]
+        assert len(matching) == 1, (
+            f"expected exactly one portal-mode warning, got "
+            f"{[r.message for r in warnings]}"
+        )
+
 
 # ── setup_subscriptions error isolation ───────────────────────────
 
diff --git a/packages/device-connect-edge/tests/test_registry_client.py b/packages/device-connect-edge/tests/test_registry_client.py
index b295924..66dec01 100644
--- a/packages/device-connect-edge/tests/test_registry_client.py
+++ b/packages/device-connect-edge/tests/test_registry_client.py
@@ -99,3 +99,180 @@ async def test_request_raises_after_all_retries_exhausted(self, mock_sleep):
 
         assert messaging.request.call_count == 3
 
+
+class TestListDevicesPagination:
+    """Verify list_devices transparently pages through the registry."""
+
+    @staticmethod
+    def _paged_responses(total: int, page_size: int):
+        """Build the sequence of NATS reply bytes the server would emit."""
+        devices = [{"device_id": f"dev-{i:04d}"} for i in range(total)]
+        responses = []
+        for start in range(0, total, page_size):
+            page = devices[start:start + page_size]
+            end = start + page_size
+            next_offset = end if end < total else None
+            responses.append(json.dumps({
+                "jsonrpc": "2.0",
+                "id": "rpc-test",
+                "result": {
+                    "devices": page,
+                    "next_offset": next_offset,
+                    "total_matched": total,
+                },
+            }).encode())
+        if not responses:
+            # Empty fleet: still need one round-trip
+            responses.append(json.dumps({
+                "jsonrpc": "2.0",
+                "id": "rpc-test",
+                "result": {"devices": [], "next_offset": None, "total_matched": 0},
+            }).encode())
+        return responses
+
+    @pytest.mark.asyncio
+    async def test_list_devices_pages_through_full_fleet(self):
+        """1400 devices should arrive across multiple round-trips."""
+        client, messaging = _make_client()
+        messaging.request = AsyncMock(side_effect=self._paged_responses(1400, 100))
+
+        devices = await client.list_devices()
+
+        assert len(devices) == 1400
+        assert [d["device_id"] for d in devices] == [
+            f"dev-{i:04d}" for i in range(1400)
+        ]
+        # 1400 / 100 = 14 round-trips
+        assert messaging.request.call_count == 14
+
+    @pytest.mark.asyncio
+    async def test_list_devices_passes_offset_and_limit_in_params(self):
+        """Each request must carry the pagination params on the wire."""
+        client, messaging = _make_client()
+        messaging.request = AsyncMock(side_effect=self._paged_responses(250, 100))
+
+        await client.list_devices()
+
+        offsets = []
+        limits = []
+        for call_args in messaging.request.call_args_list:
+            payload = json.loads(call_args.args[1])
+            offsets.append(payload["params"]["offset"])
+            limits.append(payload["params"]["limit"])
+
+        assert offsets == [0, 100, 200]
+        assert all(lim == 100 for lim in limits)
+
+    @pytest.mark.asyncio
+    async def test_list_devices_legacy_server_single_reply(self):
+        """Server without pagination (no next_offset) terminates after 1 call."""
+        client, messaging = _make_client()
+        # Legacy reply shape: devices only, no pagination metadata.
+        legacy = json.dumps({
+            "jsonrpc": "2.0",
+            "id": "rpc-test",
+            "result": {"devices": [{"device_id": "a"}, {"device_id": "b"}]},
+        }).encode()
+        messaging.request = AsyncMock(return_value=legacy)
+
+        devices = await client.list_devices()
+
+        assert len(devices) == 2
+        # next_offset absent => loop exits after one request
+        assert messaging.request.call_count == 1
+
+    @pytest.mark.asyncio
+    async def test_list_devices_page_returns_metadata(self):
+        """list_devices_page exposes next_offset and total_matched to caller."""
+        client, messaging = _make_client()
+        reply = json.dumps({
+            "jsonrpc": "2.0",
+            "id": "rpc-test",
+            "result": {
+                "devices": [{"device_id": "a"}, {"device_id": "b"}],
+                "next_offset": 2,
+                "total_matched": 10,
+            },
+        }).encode()
+        messaging.request = AsyncMock(return_value=reply)
+
+        page, next_offset, total = await client.list_devices_page(
+            offset=0, limit=2,
+        )
+
+        assert len(page) == 2
+        assert next_offset == 2
+        assert total == 10
+
+    @pytest.mark.asyncio
+    async def test_list_devices_forwards_filters(self):
+        """device_type / location filters must accompany pagination params."""
+        client, messaging = _make_client()
+        messaging.request = AsyncMock(side_effect=self._paged_responses(0, 100))
+
+        await client.list_devices(device_type="camera", location="lab-A")
+
+        payload = json.loads(messaging.request.call_args.args[1])
+        assert payload["params"]["device_type"] == "camera"
+        assert payload["params"]["location"] == "lab-A"
+        assert payload["params"]["offset"] == 0
+        assert payload["params"]["limit"] == 100
+
+    @pytest.mark.asyncio
+    async def test_list_devices_handles_empty_page_with_next_offset(self):
+        """ACL filtering can yield an empty page mid-walk with next_offset
+        still pointing forward; the loop must advance, not stall."""
+        client, messaging = _make_client()
+        responses = [
+            # Page 0: ACL filtered everything out, but more pages follow.
+            json.dumps({
+                "jsonrpc": "2.0",
+                "id": "rpc-test",
+                "result": {"devices": [], "next_offset": 100, "total_matched": 200},
+            }).encode(),
+            # Page 1: some visible devices, final page.
+            json.dumps({
+                "jsonrpc": "2.0",
+                "id": "rpc-test",
+                "result": {
+                    "devices": [{"device_id": "visible-1"}],
+                    "next_offset": None,
+                    "total_matched": 200,
+                },
+            }).encode(),
+        ]
+        messaging.request = AsyncMock(side_effect=responses)
+
+        devices = await client.list_devices()
+
+        assert [d["device_id"] for d in devices] == ["visible-1"]
+        assert messaging.request.call_count == 2
+        # Second request must use next_offset from the first reply.
+        second_payload = json.loads(messaging.request.call_args_list[1].args[1])
+        assert second_payload["params"]["offset"] == 100
+
+    @pytest.mark.asyncio
+    async def test_list_devices_breaks_on_non_advancing_next_offset(self, caplog):
+        """A buggy server returning next_offset <= current offset must not
+        spin the client forever — the page loop bails with a warning."""
+        client, messaging = _make_client()
+        # Server bug: keeps returning the same offset.
+        repeating = json.dumps({
+            "jsonrpc": "2.0",
+            "id": "rpc-test",
+            "result": {
+                "devices": [{"device_id": "a"}],
+                "next_offset": 0,
+                "total_matched": 100,
+            },
+        }).encode()
+        messaging.request = AsyncMock(return_value=repeating)
+
+        with caplog.at_level("WARNING"):
+            devices = await client.list_devices()
+
+        assert len(devices) == 1
+        assert messaging.request.call_count == 1
+        assert any(
+            "non-advancing next_offset" in rec.message for rec in caplog.records
+        )
diff --git a/packages/device-connect-server/device_connect_server/portal/app.py b/packages/device-connect-server/device_connect_server/portal/app.py
index 5625140..d68cacb 100644
--- a/packages/device-connect-server/device_connect_server/portal/app.py
+++ b/packages/device-connect-server/device_connect_server/portal/app.py
@@ -73,13 +73,26 @@ async def auth_middleware(request: web.Request, handler):
     session = await _get_session(request)
     if not session.get("username"):
         # Preserve the requested URL so post-login redirect lands the user
-        # back on (e.g.) the CLI approval page.
-        next_url = path
-        if request.query_string:
-            next_url = f"{path}?{request.query_string}"
-        from urllib.parse import quote
-        login_url = "/login?next=" + quote(next_url, safe="") if path != "/login" else "/login"
-        if request.headers.get("HX-Request"):
+        # back on (e.g.) the CLI approval page — but only for top-level
+        # HTML navigations. Background htmx polls and JSON fetches under
+        # /api/ return HTML fragments or JSON, not full pages, so using
+        # them as the post-login destination dumps the user onto a
+        # chrome-less fragment. The dashboard's 10s poll on
+        # /api/devices/live was the original repro: portal restart ->
+        # session lost -> next poll redirected to /login with the poll
+        # URL as ``next`` -> after login the user landed on the raw
+        # fragment instead of the dashboard.
+        is_htmx = request.headers.get("HX-Request") == "true"
+        is_api = path.startswith("/api/")
+        if is_htmx or is_api:
+            login_url = "/login"
+        else:
+            next_url = path
+            if request.query_string:
+                next_url = f"{path}?{request.query_string}"
+            from urllib.parse import quote
+            login_url = "/login?next=" + quote(next_url, safe="") if path != "/login" else "/login"
+        if is_htmx:
             resp = web.Response(status=200)
             resp.headers["HX-Redirect"] = login_url
             return resp
@@ -179,6 +192,10 @@ def create_app() -> web.Application:
 
     # Seed admin on startup
     app.on_startup.append(_on_startup)
+    # Close the cached NATS invoke client on shutdown. Without this the
+    # socket leaks at graceful exit because the client is module-level
+    # state in nats_rpc, not tied to the aiohttp Application lifecycle.
+    app.on_cleanup.append(_on_cleanup)
 
     return app
 
@@ -190,3 +207,12 @@ async def _on_startup(app: web.Application):
         ensure_admin()
     except Exception as e:
         logger.warning("Could not seed admin account (etcd may not be ready): %s", e)
+
+
+async def _on_cleanup(app: web.Application):
+    """aiohttp cleanup hook: close the module-scope NATS invoke client."""
+    try:
+        from .services.nats_rpc import close_invoke_client
+        await close_invoke_client()
+    except Exception as exc:
+        logger.warning("Error closing cached NATS invoke client: %s", exc)
diff --git a/packages/device-connect-server/device_connect_server/portal/services/credentials.py b/packages/device-connect-server/device_connect_server/portal/services/credentials.py
index 58a7687..6fb118e 100644
--- a/packages/device-connect-server/device_connect_server/portal/services/credentials.py
+++ b/packages/device-connect-server/device_connect_server/portal/services/credentials.py
@@ -64,6 +64,24 @@ def get_credential_data(filename: str) -> dict | None:
         return None
 
 
+def delete_credential(filename: str) -> bool:
+    """Remove a credential file from disk.
+
+    Returns True if deleted; False if absent, already gone, or on error.
+    """
+    path = get_credential(filename)  # same path-traversal guard as reads
+    if not path:
+        return False
+    try:
+        path.unlink()
+    except FileNotFoundError:  # lost a race with a concurrent delete
+        return False
+    except OSError:
+        logger.exception("failed to remove credential %s", filename)
+        return False
+    return True
+
+
 def get_tenants_summary() -> dict[str, dict]:
     """Get a summary of all tenants and their device counts.
 
diff --git a/packages/device-connect-server/device_connect_server/portal/services/nats_rpc.py b/packages/device-connect-server/device_connect_server/portal/services/nats_rpc.py
index 3c03ce0..82a61b3 100644
--- a/packages/device-connect-server/device_connect_server/portal/services/nats_rpc.py
+++ b/packages/device-connect-server/device_connect_server/portal/services/nats_rpc.py
@@ -4,8 +4,10 @@
 
 """NATS helpers: RPC invocation and event streaming."""
 
+import asyncio
 import json
 import logging
+import time
 import uuid
 from pathlib import Path
 
@@ -18,6 +20,56 @@
 # Registry credentials (privileged, can reach all tenants)
 _REGISTRY_CREDS = Path(config.CREDS_DIR) / "registry.creds.json"
 
+# Long-lived client reused across all invoke() calls. The portal used to
+# open and close a fresh NATS connection per RPC, which added a TCP +
+# JWT-auth handshake to every dashboard "Run" click. One connection can
+# serve concurrent requests (nats-py routes each reply to its caller via
+# a per-request inbox token), so a single cached client is enough.
+_invoke_client: "nats.aio.client.Client | None" = None
+
+# Exception types that mean "the cached NATS client is no longer usable"
+# — i.e. the next request must reconnect. We deliberately do NOT include
+# every nats.errors.Error subclass: BadSubjectError, MaxPayloadError,
+# AuthorizationError etc. are caller / payload bugs that don't kill the
+# connection, so dropping the client on them would churn the socket on
+# every malformed request. Native OSError / ConnectionError covers
+# socket-level failures the NATS client may not have wrapped yet.
+#
+# Review notes (do not re-litigate without reading these):
+# - ``ConnectionReconnectingError`` is intentionally absent: it means the
+#   client is *already* reconnecting itself. Dropping + close()-ing in
+#   that state preempts the nats-py reconnect machinery, forces a fresh
+#   handshake on every queued request, and amplifies broker flaps. Let
+#   the existing client recover; the next ``nc.request`` either succeeds
+#   post-reconnect or raises something more terminal that *is* in this
+#   set. Past review round suggested adding it -- don't.
+# - ``ProtocolError`` and ``NoRespondersError`` are payload-level signals
+#   over a healthy socket; covered by their own branches / left to the
+#   default handler without dropping the client. See ``test_nats_rpc``.
+_TRANSPORT_FATAL_ERRORS: tuple = (
+    nats.errors.ConnectionClosedError,
+    nats.errors.ConnectionDrainingError,
+    nats.errors.StaleConnectionError,
+    nats.errors.NoServersError,
+    nats.errors.OutboundBufferLimitError,
+    nats.errors.SecureConnFailedError,
+    ConnectionError,
+    OSError,
+)
+# Lock is created lazily inside _get_invoke_lock() rather than at import
+# time. On Python < 3.10 asyncio.Lock() captures the current event loop
+# when constructed (3.10+ defers that to first contended use), so building
+# it at import could bind it to a loop the portal never runs on.
+_invoke_client_lock: "asyncio.Lock | None" = None
+
+
+def _get_invoke_lock() -> asyncio.Lock:
+    """Hand out the shared invoke lock, building it on the first call."""
+    global _invoke_client_lock
+    # A Lock instance is always truthy, so ``or`` only fires while unset.
+    _invoke_client_lock = _invoke_client_lock or asyncio.Lock()
+    return _invoke_client_lock
+
 
 def _load_creds() -> dict:
     """Load registry credentials for NATS auth."""
@@ -27,6 +79,47 @@ def _load_creds() -> dict:
     return {}
 
 
+async def _get_invoke_client():
+    """Return the shared NATS RPC client, (re)connecting it when necessary."""
+    global _invoke_client
+    async with _get_invoke_lock():
+        if (cached := _invoke_client) is None or cached.is_closed:
+            cached = _invoke_client = await connect()
+            logger.info("invoke client connected; will be reused across requests")
+        return cached
+
+
+async def _drop_invoke_client() -> None:
+    """Forget the cached client and close it on a best-effort basis.
+
+    The global is cleared under the lock so the next invoke() opens a
+    fresh connection; the close itself runs after the lock is released.
+    Errors from ``close()`` are logged at debug level and swallowed: the
+    connection is already presumed broken, releasing sockets is a bonus.
+    """
+    global _invoke_client
+    async with _get_invoke_lock():
+        stale, _invoke_client = _invoke_client, None
+    if stale is None:
+        return
+    try:
+        await stale.close()
+    except Exception:
+        logger.debug("ignored error closing stale invoke client", exc_info=True)
+
+
+async def close_invoke_client() -> None:
+    """Close the cached invoke client at app shutdown.
+
+    Intended for ``aiohttp.web.Application.on_cleanup``: the cached
+    client is module-level state, not tied to the app's lifecycle, so
+    nothing else releases its socket on graceful shutdown. Safe to call
+    repeatedly — ``_drop_invoke_client`` clears the global before
+    closing, so a second call finds nothing to close.
+    """
+    await _drop_invoke_client()
+
+
 async def connect():
     """Return a connected NATS client using registry credentials."""
     creds = _load_creds()
@@ -55,23 +148,42 @@ def _sign(nonce):
 
 async def invoke(tenant: str, device_id: str, function: str, params: dict, timeout: float = 5.0) -> dict:
     """Send a JSON-RPC request to a device and return the response."""
-    nc = await connect()
+    t0 = time.monotonic()
+    subject = f"device-connect.{tenant}.{device_id}.cmd"
+    payload = {
+        "jsonrpc": "2.0",
+        "id": str(uuid.uuid4()),
+        "method": function,
+        "params": params,
+    }
     try:
-        subject = f"device-connect.{tenant}.{device_id}.cmd"
-        payload = {
-            "jsonrpc": "2.0",
-            "id": str(uuid.uuid4()),
-            "method": function,
-            "params": params,
-        }
-
+        nc = await _get_invoke_client()
         msg = await nc.request(subject, json.dumps(payload).encode(), timeout=timeout)
+        logger.info(
+            "invoke %s/%s.%s ok in %.1fms",
+            tenant, device_id, function, (time.monotonic() - t0) * 1000,
+        )
         return json.loads(msg.data)
     except nats.errors.NoRespondersError:
+        logger.warning(
+            "invoke %s/%s.%s no-responders in %.1fms",
+            tenant, device_id, function, (time.monotonic() - t0) * 1000,
+        )
         return {"error": {"code": -1, "message": f"Device {device_id} is not responding"}}
     except nats.errors.TimeoutError:
+        logger.warning(
+            "invoke %s/%s.%s timeout in %.1fms",
+            tenant, device_id, function, (time.monotonic() - t0) * 1000,
+        )
         return {"error": {"code": -2, "message": f"Request timed out after {timeout}s"}}
     except Exception as e:
+        # Only drop the cached client on transport-level failures so a
+        # payload / programmer bug (BadSubject, MaxPayload, KeyError in
+        # our own code, ...) doesn't churn the connection on every call.
+        if isinstance(e, _TRANSPORT_FATAL_ERRORS):
+            await _drop_invoke_client()
+        logger.exception(
+            "invoke %s/%s.%s error in %.1fms: %s",
+            tenant, device_id, function, (time.monotonic() - t0) * 1000, e,
+        )
         return {"error": {"code": -3, "message": str(e)}}
-    finally:
-        await nc.close()
diff --git a/packages/device-connect-server/device_connect_server/portal/services/registry_client.py b/packages/device-connect-server/device_connect_server/portal/services/registry_client.py
index 4599420..2789567 100644
--- a/packages/device-connect-server/device_connect_server/portal/services/registry_client.py
+++ b/packages/device-connect-server/device_connect_server/portal/services/registry_client.py
@@ -37,6 +37,28 @@ def _etcd_client():
     return Etcd3Client(host=config.ETCD_HOST, port=config.ETCD_PORT)
 
 
+def format_live_device(data: dict) -> dict:
+    """Convert a raw etcd device record into a dashboard row dict.
+
+    Used by list_live_devices (table render) and by the per-device
+    row-html endpoint, so a row appended mid-session is shaped exactly
+    like the rows rendered up front. Missing or null ``status`` /
+    ``identity`` / ``registry`` sections are treated as empty.
+    """
+    status = data.get("status") or {}
+    identity = data.get("identity") or {}
+    registry = data.get("registry") or {}
+    return dict(
+        device_id=data.get("device_id", "unknown"),
+        device_type=identity.get("device_type", "unknown"),
+        status=status.get("availability", "unknown"),
+        location=status.get("location", ""),
+        last_seen=_format_ts(status.get("ts")) or registry.get("registered_at", ""),
+        capabilities=data.get("capabilities", {}),
+        _raw=data,
+    )
+
+
 def list_live_devices(tenant: str) -> list[dict]:
     """Query etcd for all registered devices in a tenant namespace.
 
@@ -47,25 +69,14 @@ def list_live_devices(tenant: str) -> list[dict]:
 
     results = client.get_prefix(prefix)
     devices = []
-    for raw, meta in results:
+    for raw, _meta in results:
         try:
             if isinstance(raw, bytes):
                 raw = raw.decode()
             data = json.loads(raw)
-            status = data.get("status") or {}
-            identity = data.get("identity") or {}
-            reg = data.get("registry") or {}
-            devices.append({
-                "device_id": data.get("device_id", "unknown"),
-                "device_type": identity.get("device_type", "unknown"),
-                "status": status.get("availability", "unknown"),
-                "location": status.get("location", ""),
-                "last_seen": _format_ts(status.get("ts")) or reg.get("registered_at", ""),
-                "capabilities": data.get("capabilities", {}),
-                "_raw": data,
-            })
         except (json.JSONDecodeError, TypeError):
             continue
+        devices.append(format_live_device(data))
 
     return devices
 
diff --git a/packages/device-connect-server/device_connect_server/portal/templates/admin/tenant_detail.html b/packages/device-connect-server/device_connect_server/portal/templates/admin/tenant_detail.html
index eb71d7d..878d80d 100644
--- a/packages/device-connect-server/device_connect_server/portal/templates/admin/tenant_detail.html
+++ b/packages/device-connect-server/device_connect_server/portal/templates/admin/tenant_detail.html
@@ -11,15 +11,15 @@ <h1 class="text-2xl font-bold text-gray-900">{{ viewing_as }}'s Dashboard</h1>
 <div class="grid grid-cols-1 md:grid-cols-3 gap-4 mb-8">
   <div class="bg-white rounded-xl border border-gray-200 p-5">
     <div class="text-sm font-medium text-gray-500">Credentials Created</div>
-    <div class="text-3xl font-bold text-gray-900 mt-1">{{ creds_count }}</div>
+    <div class="text-3xl font-bold text-gray-900 mt-1"><span id="creds-count">{{ creds_count }}</span></div>
   </div>
   <div class="bg-white rounded-xl border border-gray-200 p-5">
     <div class="text-sm font-medium text-gray-500">Devices Online</div>
-    <div class="text-3xl font-bold text-green-600 mt-1">{{ online_count }}</div>
+    <div class="text-3xl font-bold text-green-600 mt-1"><span id="online-count">{{ online_count }}</span></div>
   </div>
   <div class="bg-white rounded-xl border border-gray-200 p-5">
     <div class="text-sm font-medium text-gray-500">Devices Registered</div>
-    <div class="text-3xl font-bold text-gray-900 mt-1">{{ registered_count }}</div>
+    <div class="text-3xl font-bold text-gray-900 mt-1"><span id="registered-count">{{ registered_count }}</span></div>
   </div>
 </div>
 
@@ -51,9 +51,12 @@ <h2 class="text-lg font-semibold text-gray-900">Live Devices</h2>
       Auto-refreshing
     </span>
   </div>
+  {# Initial table render only — htmx fires `load` once and the
+     fragment lands here. From then on, /api/devices/live.json drives
+     in-place cell updates. See dashboard.html for the rationale. #}
   <div id="live-devices"
        hx-get="/api/devices/live?tenant={{ viewing_as }}"
-       hx-trigger="load, every 3s"
+       hx-trigger="load"
        hx-swap="innerHTML">
     <div class="px-5 py-8 text-center text-gray-400 text-sm">Loading devices...</div>
   </div>
@@ -61,7 +64,33 @@ <h2 class="text-lg font-semibold text-gray-900">Live Devices</h2>
 
 <script>
 var _tenantParam = '?tenant={{ viewing_as }}';
-window._expandedDevices = window._expandedDevices || new Set();
+window._deviceCapsHash = window._deviceCapsHash || {};
+
+// Lazy-load a detail pane once. Redirected replies (e.g. expired session -> /login) count as failures.
+function loadDetailIfNeeded(row, deviceId) {
+  var url = row.getAttribute('data-detail-url');
+  var slot = row.querySelector('.detail-content');
+  if (!url || !slot || slot.dataset.loaded === 'true' || slot.dataset.loading === 'true') return;
+  slot.dataset.loading = 'true';
+  fetch(url, {credentials: 'same-origin'})
+    .then(function(r) {
+      if (!r.ok || r.redirected) throw new Error(r.redirected ? 'redirected (session expired?)' : 'HTTP ' + r.status);
+      return r.text();
+    })
+    .then(function(html) {
+      slot.innerHTML = html;
+      slot.dataset.loaded = 'true';
+      delete slot.dataset.loading;
+    })
+    .catch(function(err) {
+      delete slot.dataset.loading;
+      slot.innerHTML = '';
+      var p = document.createElement('p');
+      p.className = 'text-xs text-red-500';
+      p.textContent = 'Failed to load details: ' + (err && err.message ? err.message : err);
+      slot.appendChild(p);
+    });
+}
 
 function toggleDetail(deviceId) {
   var row = document.getElementById('detail-' + deviceId);
@@ -70,15 +99,142 @@ <h2 class="text-lg font-semibold text-gray-900">Live Devices</h2>
   var opening = row.classList.contains('hidden');
   row.classList.toggle('hidden');
   if (chevron) chevron.style.transform = opening ? 'rotate(90deg)' : '';
-  if (opening) window._expandedDevices.add(deviceId);
-  else window._expandedDevices.delete(deviceId);
+  if (opening) loadDetailIfNeeded(row, deviceId);
 }
 
-document.getElementById('live-devices').addEventListener('htmx:beforeRequest', function(evt) {
-  if (window._expandedDevices.size > 0) {
-    evt.preventDefault();
+// --- In-place poll (replaces the old table-wide htmx swap). See
+//     dashboard.html for the full rationale. ---------------------
+
+function statusPillHtml(status) {
+  if (status === 'available') {
+    return '<span class="inline-flex items-center px-2 py-0.5 rounded-full text-xs font-medium bg-green-100 text-green-800">' +
+             '<span class="w-1.5 h-1.5 bg-green-500 rounded-full mr-1 pulse-dot"></span>' +
+             'online' +
+           '</span>';
   }
-});
+  var safe = String(status == null ? 'unknown' : status)
+    .replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
+  return '<span class="inline-flex items-center px-2 py-0.5 rounded-full text-xs font-medium bg-gray-100 text-gray-600">' +
+           '<span class="w-1.5 h-1.5 bg-gray-400 rounded-full mr-1"></span>' +
+           safe +
+         '</span>';
+}
+
+function updateRowFromJson(d) {
+  var summaryRow = document.getElementById('summary-' + d.device_id);
+  if (!summaryRow) return false;
+  var pill = summaryRow.querySelector('.cell-status');
+  if (pill) pill.innerHTML = statusPillHtml(d.status);
+  var where = summaryRow.querySelector('.cell-location');
+  if (where) where.textContent = d.location || '—';
+  var lastSeen = summaryRow.querySelector('.cell-lastseen');
+  if (lastSeen) lastSeen.textContent = d.last_seen || '—';
+
+  // A changed capabilities hash invalidates an already-loaded, currently
+  // expanded detail pane; collapsed or never-loaded panes are left alone.
+  var knownHash = window._deviceCapsHash[d.device_id];
+  var capsChanged = knownHash && knownHash !== d.capabilities_hash;
+  var panel = capsChanged ? document.getElementById('detail-' + d.device_id) : null;
+  if (panel && !panel.classList.contains('hidden')) {
+    var slot = panel.querySelector('.detail-content');
+    if (slot && slot.dataset.loaded === 'true') {
+      // Close any open stream first: the refetched panel starts empty.
+      if (typeof closeEventLog === 'function') closeEventLog(d.device_id);
+      delete slot.dataset.loaded;
+      delete slot.dataset.loading;
+      loadDetailIfNeeded(panel, d.device_id);
+    }
+  }
+  window._deviceCapsHash[d.device_id] = d.capabilities_hash;
+  return true;
+}
+
+// See dashboard.html for the orphan-row race this guards against.
+window._pendingInserts = window._pendingInserts || {};
+
+function insertNewDeviceRow(deviceId, capsHash) {
+  if (window._pendingInserts[deviceId]) return;
+  window._pendingInserts[deviceId] = true;
+  fetch('/api/devices/' + encodeURIComponent(deviceId) + '/row-html' + _tenantParam,
+        { credentials: 'same-origin' })
+    // A redirected reply is most likely the login page (expired session), not a row.
+    .then(function(r) { return r.ok && !r.redirected ? r.text() : null; })
+    .then(function(html) {
+      // Cancelled by an interleaved removeDeviceRow — discard.
+      if (!window._pendingInserts[deviceId]) return;
+      delete window._pendingInserts[deviceId];
+      if (!html || document.getElementById('summary-' + deviceId)) return;
+      var liveTbody = document.querySelector('#live-devices tbody');
+      if (!liveTbody) return;
+      var holder = document.createElement('tbody');
+      holder.innerHTML = html;
+      while (holder.firstChild) liveTbody.appendChild(holder.firstChild);
+      window._deviceCapsHash[deviceId] = capsHash;
+    })
+    .catch(function() { delete window._pendingInserts[deviceId]; });
+}
+
+function removeDeviceRow(deviceId) {
+  ['summary-', 'detail-'].forEach(function(prefix) {
+    var el = document.getElementById(prefix + deviceId);
+    if (el && el.parentElement) el.parentElement.removeChild(el);
+  });
+  delete window._deviceCapsHash[deviceId];
+  // Clear the pending flag so an in-flight row fetch discards its result.
+  delete window._pendingInserts[deviceId];
+  if (typeof closeEventLog === 'function') closeEventLog(deviceId);
+}
+
+function reloadLiveFragment() {
+  fetch('/api/devices/live' + _tenantParam, { credentials: 'same-origin' })
+    // Skip redirected replies: most likely the login page, not the fragment.
+    .then(function(r) { return r.ok && !r.redirected ? r.text() : null; })
+    .then(function(html) {
+      var ld = html == null ? null : document.getElementById('live-devices');
+      if (ld) ld.innerHTML = html;
+    })
+    .catch(function() {});
+}
+
+function pollDevices() {  // fetch live.json, then refresh counters and reconcile table rows in place
+  fetch('/api/devices/live.json' + _tenantParam, { credentials: 'same-origin' })
+    .then(function(r) { return r.ok ? r.json() : null; })
+    .then(function(data) {
+      if (!data) return;
+      var oc = document.getElementById('online-count');
+      if (oc) oc.textContent = data.counts.online;
+      var rc = document.getElementById('registered-count');
+      if (rc) rc.textContent = data.counts.registered;
+      var cc = document.getElementById('creds-count');
+      if (cc) cc.textContent = data.counts.creds;
+
+      var tbody = document.querySelector('#live-devices tbody');
+      var jsonHasDevices = data.devices.length > 0;
+      if (jsonHasDevices !== !!tbody) {  // table presence disagrees with the JSON: re-fetch the whole fragment
+        reloadLiveFragment();
+        return;
+      }
+      if (!tbody) return;  // no devices and no table: nothing to reconcile
+
+      var seen = {};  // device ids reported by this poll
+      data.devices.forEach(function(d) {
+        seen[d.device_id] = true;
+        if (!updateRowFromJson(d)) {  // false means no summary row exists yet for this id
+          insertNewDeviceRow(d.device_id, d.capabilities_hash);
+        }
+      });
+      Array.prototype.forEach.call(
+        tbody.querySelectorAll('tr[id^="summary-"]'),
+        function(row) {
+          var id = row.id.substring('summary-'.length);
+          if (!seen[id]) removeDeviceRow(id);  // rendered row the server no longer reports
+        },
+      );
+    })
+    .catch(function() { /* silent: the next interval tick retries */ });
+}
+
+setInterval(pollDevices, 10000);
 
 // --- RPC Invoke ---
 window._invokeState = {};
@@ -158,13 +314,17 @@ <h2 class="text-lg font-semibold text-gray-900">Live Devices</h2>
 }
 
 // --- Event Log (SSE) ---
+// Panel lives in the device's detail row (devices/_live_detail.html).
+// See dashboard.html for the full rationale.
 window._eventSources = {};
 
 function openEventLog(deviceId, eventName) {
   closeEventLog(deviceId);
+
   var panel = document.getElementById('eventlog-' + deviceId);
   var nameEl = document.getElementById('eventlog-name-' + deviceId);
   var entries = document.getElementById('eventlog-entries-' + deviceId);
+  if (!panel || !nameEl || !entries) return;
 
   nameEl.textContent = eventName;
   entries.innerHTML = '<p class="text-gray-400 italic">Listening for events...</p>';
diff --git a/packages/device-connect-server/device_connect_server/portal/templates/auth/cli_approve.html b/packages/device-connect-server/device_connect_server/portal/templates/auth/cli_approve.html
index e75824a..53fd8c5 100644
--- a/packages/device-connect-server/device_connect_server/portal/templates/auth/cli_approve.html
+++ b/packages/device-connect-server/device_connect_server/portal/templates/auth/cli_approve.html
@@ -8,6 +8,10 @@
 </head>
 <body class="h-full flex items-center justify-center">
   <div class="w-full max-w-lg">
+    <div class="mb-6 rounded-lg p-3 bg-amber-50 border border-amber-300 text-amber-900 text-sm text-center">
+      <p class="font-semibold">Testing version only</p>
+      <p class="text-xs mt-1">This is not a production version. Do not use with real or sensitive data.</p>
+    </div>
     <div class="text-center mb-8">
       <div class="inline-flex items-center justify-center w-12 h-12 bg-indigo-600 rounded-xl mb-4">
         <svg class="w-7 h-7 text-white" fill="none" stroke="currentColor" viewBox="0 0 24 24">
diff --git a/packages/device-connect-server/device_connect_server/portal/templates/base.html b/packages/device-connect-server/device_connect_server/portal/templates/base.html
index 3193190..4d318e1 100644
--- a/packages/device-connect-server/device_connect_server/portal/templates/base.html
+++ b/packages/device-connect-server/device_connect_server/portal/templates/base.html
@@ -21,6 +21,7 @@
               </svg>
             </div>
             <span class="font-semibold text-gray-900">Device Connect</span>
+            <span class="inline-flex items-center px-2 py-0.5 rounded text-xs font-semibold bg-amber-100 text-amber-800 border border-amber-300" title="This is a testing version, not a production version. Do not use with real or sensitive data.">TESTING ONLY — NOT FOR PRODUCTION</span>
           </a>
           {% if user.role == 'admin' %}
           <a href="/admin" class="text-sm {% if nav == 'admin' %}text-indigo-600 font-medium{% else %}text-gray-500 hover:text-gray-700{% endif %}">Dashboard</a>
diff --git a/packages/device-connect-server/device_connect_server/portal/templates/coding_agents/AGENTS.md.j2 b/packages/device-connect-server/device_connect_server/portal/templates/coding_agents/AGENTS.md.j2
index 337928a..da73b36 100644
--- a/packages/device-connect-server/device_connect_server/portal/templates/coding_agents/AGENTS.md.j2
+++ b/packages/device-connect-server/device_connect_server/portal/templates/coding_agents/AGENTS.md.j2
@@ -171,8 +171,9 @@ dc-portalctl devices credentials {{ tenant }}-cam-001 \
     --output-file ./{{ tenant }}-cam-001.creds.json
 ```
 
-> ⚠️ **Lifecycle limitations (as of 0.2.3).** The CLI exposes two lifecycle
-> verbs, but neither is implemented in any bundled backend yet:
+> ⚠️ **Lifecycle limitations (as of 0.2.3).** Two of the three lifecycle
+> verbs are not implemented in any bundled backend yet — `revoke` is the
+> one that works:
 >
 > - `dc-portalctl devices revoke-credentials <id>` → `POST
 >   /api/agent/v1/devices/{id}/credentials:rotate` → HTTP **501
@@ -181,12 +182,60 @@ dc-portalctl devices credentials {{ tenant }}-cam-001 \
 > - `dc-portalctl devices delete <id> --confirm` → `DELETE
 >   /api/agent/v1/devices/{id}` → HTTP **501 `not_implemented`** (no
 >   backend implements `remove_device`).
+> - `dc-portalctl devices revoke <id> --confirm` → `POST
+>   /api/agent/v1/devices/{id}/revoke` → **works on every bundled
+>   backend**. When the backend has `remove_device` the broker account
+>   is removed; when it doesn't, the local cred file is still deleted
+>   (soft success) and a `backend_warning` is surfaced in the response
+>   so the operator knows the account itself wasn't touched. This is
+>   the verb to reach for in normal operation.
 >
-> Until those land, to force a device offline stop the device process
-> directly; to drop the registry record, edit the backing store (etcd /
-> file / whichever backend is configured) by hand.
+> If you actually need the rotate or backend-only-delete behavior and
+> your backend doesn't support it, stop the device process directly
+> and/or edit the backing store (etcd / file / whichever backend is
+> configured) by hand.
 
-HTTP equivalents live in [Appendix A](#appendix-a-http-api-reference).
+```bash
+curl -X POST -H "Authorization: Bearer $DEVICE_CONNECT_PORTAL_TOKEN" \
+  -H "Content-Type: application/json" \
+  -d '{"device_name":"cam-001","device_type":"camera","location":"warehouse1/loading-dock"}' \
+  {{ portal_url }}/api/agent/v1/devices
+```
+
+### Decommission, rotate, or revoke
+
+Three commands, three different intents — pick the verb that matches what you
+actually want to happen. Only `revoke` works on every bundled backend today
+(see the lifecycle limitations callout above for the 501s on the other two).
+
+| Command | Broker account | Local cred file | When to use |
+|---|---|---|---|
+| `dc-portalctl devices revoke-credentials <id>` | re-issued (rotates) — ⚠ HTTP 501 on bundled backends | replaced with new contents | Same device should come back with a new identity (e.g. suspected leak). |
+| `dc-portalctl devices delete <id> --confirm` | removed — ⚠ HTTP 501 on bundled backends | **left on disk** | Rarely the right choice on its own — even if it landed, the cred file would linger and show up in the portal. |
+| `dc-portalctl devices revoke <id> --confirm` | removed when backend supports it; soft success otherwise | **deleted** | Default for "this credential is gone." The device disconnects on next reconnect and the credential disappears from the portal counter and `/devices` list. |
+
+HTTP equivalents (all need `devices:provision` scope):
+
+```bash
+# Rotate
+curl -X POST -H "Authorization: Bearer $DEVICE_CONNECT_PORTAL_TOKEN" \
+  {{ portal_url }}/api/agent/v1/devices/{{ example_device_id }}/credentials:rotate
+
+# Delete (decommission only)
+curl -X DELETE -H "Authorization: Bearer $DEVICE_CONNECT_PORTAL_TOKEN" \
+  {{ portal_url }}/api/agent/v1/devices/{{ example_device_id }}
+
+# Revoke (the full one — preferred today)
+curl -X POST -H "Authorization: Bearer $DEVICE_CONNECT_PORTAL_TOKEN" \
+  {{ portal_url }}/api/agent/v1/devices/{{ example_device_id }}/revoke
+```
+
+`revoke` returns HTTP 502 `backend_revoke_failed` (and keeps the cred file
+in place) when the backend supports `remove_device` but the broker rejected
+the call — retry once the backend is healthy. On a soft-success backend
+(no `remove_device`) it returns 200 with a `backend_warning` field.
+
+Either decommission flow disconnects the device on its next NATS reconnect.
 
 ---
 
@@ -536,8 +585,9 @@ dc-portalctl devices events       <id>
 dc-portalctl devices provision    <name> --device-type X [--location Y] \
     --creds-output-file <path>
 dc-portalctl devices credentials  <id> --output-file <path>
-dc-portalctl devices revoke-credentials <id>      # ⚠ HTTP 501 on bundled backends (see §3)
-dc-portalctl devices delete       <id> --confirm  # ⚠ HTTP 501 on bundled backends (see §3)
+dc-portalctl devices revoke-credentials <id>           # rotate: ⚠ HTTP 501 on bundled backends (see §3)
+dc-portalctl devices delete       <id> --confirm       # decommission only: ⚠ HTTP 501 on bundled backends (see §3)
+dc-portalctl devices revoke       <id> --confirm       # full revoke (account + cred file): works on every bundled backend
 
 # Invocation
 dc-portalctl devices invoke          <id>          <fn> --params '{…}' --reason "…"
diff --git a/packages/device-connect-server/device_connect_server/portal/templates/dashboard.html b/packages/device-connect-server/device_connect_server/portal/templates/dashboard.html
index 368d9a9..4826b0f 100644
--- a/packages/device-connect-server/device_connect_server/portal/templates/dashboard.html
+++ b/packages/device-connect-server/device_connect_server/portal/templates/dashboard.html
@@ -10,15 +10,15 @@ <h1 class="text-2xl font-bold text-gray-900">Dashboard</h1>
 <div class="grid grid-cols-1 md:grid-cols-3 gap-4 mb-8">
   <div class="bg-white rounded-xl border border-gray-200 p-5">
     <div class="text-sm font-medium text-gray-500">Credentials Created</div>
-    <div class="text-3xl font-bold text-gray-900 mt-1">{{ creds_count }}</div>
+    <div class="text-3xl font-bold text-gray-900 mt-1"><span id="creds-count">{{ creds_count }}</span></div>
   </div>
   <div class="bg-white rounded-xl border border-gray-200 p-5">
     <div class="text-sm font-medium text-gray-500">Devices Online</div>
-    <div class="text-3xl font-bold text-green-600 mt-1">{{ online_count }}</div>
+    <div class="text-3xl font-bold text-green-600 mt-1"><span id="online-count">{{ online_count }}</span></div>
   </div>
   <div class="bg-white rounded-xl border border-gray-200 p-5">
     <div class="text-sm font-medium text-gray-500">Devices Registered</div>
-    <div class="text-3xl font-bold text-gray-900 mt-1">{{ registered_count }}</div>
+    <div class="text-3xl font-bold text-gray-900 mt-1"><span id="registered-count">{{ registered_count }}</span></div>
   </div>
 </div>
 
@@ -45,34 +45,244 @@ <h2 class="text-lg font-semibold text-gray-900">Live Devices</h2>
       Auto-refreshing
     </span>
   </div>
+  {# Initial table render only — htmx fires `load` once and the
+     fragment lands here. From then on, /api/devices/live.json drives
+     in-place cell updates (status pill, location, last-seen) and
+     counter refresh, so the table DOM is only re-swapped across an
+     empty<->non-empty transition and scroll/expand state survives. #}
   <div id="live-devices"
        hx-get="/api/devices/live?tenant={{ tenant }}"
-       hx-trigger="load, every 3s"
+       hx-trigger="load"
        hx-swap="innerHTML">
     <div class="px-5 py-8 text-center text-gray-400 text-sm">Loading devices...</div>
   </div>
 </div>
 
 <script>
-window._expandedDevices = window._expandedDevices || new Set();
+// Tracks the last-seen capabilities hash per device so the JSON poll
+// only refreshes an expanded detail panel when the device's schema
+// actually changed.
+window._deviceCapsHash = window._deviceCapsHash || {};
+
+function loadDetailIfNeeded(row, deviceId) {
+  var url = row.getAttribute('data-detail-url');
+  var slot = row.querySelector('.detail-content');
+  if (!url || !slot) return;
+  if (slot.dataset.loaded === 'true' || slot.dataset.loading === 'true') return;
+  slot.dataset.loading = 'true';
+  fetch(url, {credentials: 'same-origin'})
+    .then(function(r) {
+      if (!r.ok) throw new Error('HTTP ' + r.status);
+      return r.text();
+    })
+    .then(function(html) {
+      slot.innerHTML = html;
+      slot.dataset.loaded = 'true';
+      delete slot.dataset.loading;
+    })
+    .catch(function(err) {
+      delete slot.dataset.loading;
+      // Use textContent on a child so an Error message that happens
+      // to contain HTML (or future server-side error text routed
+      // through this path) can't inject markup.
+      slot.innerHTML = '';
+      var p = document.createElement('p');
+      p.className = 'text-xs text-red-500';
+      p.textContent = 'Failed to load details: ' + (err && err.message ? err.message : err);
+      slot.appendChild(p);
+    });
+}
 
 function toggleDetail(deviceId) {
+  // The row's `hidden` class IS the expansion state — we no longer
+  // need a side Set, because the table DOM is stable across polls.
   var row = document.getElementById('detail-' + deviceId);
   var chevron = document.getElementById('chevron-' + deviceId);
   if (!row) return;
   var opening = row.classList.contains('hidden');
   row.classList.toggle('hidden');
   if (chevron) chevron.style.transform = opening ? 'rotate(90deg)' : '';
-  if (opening) window._expandedDevices.add(deviceId);
-  else window._expandedDevices.delete(deviceId);
+  if (opening) loadDetailIfNeeded(row, deviceId);
+}
+
+// --- In-place poll (replaces the old table-wide htmx swap) ----------
+//
+// /api/devices/live.json returns a JSON snapshot of every device's
+// live values + a capabilities hash, plus the three header counts.
+// We update only the cells that change. The row, the chevron, the
+// expanded-state `class="hidden"` toggle, and the slot content all
+// survive untouched, so scroll position and any open invoke/event-log
+// state stay where the operator put them.
+//
+// Scope: a brand new device gets its row pair fetched and appended
+// (insertNewDeviceRow), and a device missing from the JSON has its
+// rows removed (removeDeviceRow) — neither needs a page reload. Only
+// the empty<->non-empty transition re-swaps the whole fragment.
+
+function statusPillHtml(status) {
+  if (status === 'available') {
+    return '<span class="inline-flex items-center px-2 py-0.5 rounded-full text-xs font-medium bg-green-100 text-green-800">' +
+             '<span class="w-1.5 h-1.5 bg-green-500 rounded-full mr-1 pulse-dot"></span>' +
+             'online' +
+           '</span>';
+  }
+  var safe = String(status == null ? 'unknown' : status)
+    .replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
+  return '<span class="inline-flex items-center px-2 py-0.5 rounded-full text-xs font-medium bg-gray-100 text-gray-600">' +
+           '<span class="w-1.5 h-1.5 bg-gray-400 rounded-full mr-1"></span>' +
+           safe +
+         '</span>';
+}
+
+function updateRowFromJson(d) {
+  var row = document.getElementById('summary-' + d.device_id);
+  if (!row) return false;  // caller will fetch + insert via insertNewDeviceRow
+
+  var statusCell = row.querySelector('.cell-status');
+  if (statusCell) statusCell.innerHTML = statusPillHtml(d.status);
+  var locCell = row.querySelector('.cell-location');
+  if (locCell) locCell.textContent = d.location || '—';
+  var seenCell = row.querySelector('.cell-lastseen');
+  if (seenCell) seenCell.textContent = d.last_seen || '—';
+
+  // Capabilities changed AND the detail is already expanded with
+  // content loaded — refresh the detail in place without collapsing
+  // the row. The event log lives inside the slot, so close any open
+  // stream first: the events list itself may have changed, and the
+  // post-refetch slot HTML will have a fresh empty panel anyway.
+  var prevHash = window._deviceCapsHash[d.device_id];
+  if (prevHash && prevHash !== d.capabilities_hash) {
+    var detailRow = document.getElementById('detail-' + d.device_id);
+    if (detailRow && !detailRow.classList.contains('hidden')) {
+      var slot = detailRow.querySelector('.detail-content');
+      if (slot && slot.dataset.loaded === 'true') {
+        if (typeof closeEventLog === 'function') closeEventLog(d.device_id);
+        delete slot.dataset.loaded;
+        delete slot.dataset.loading;
+        loadDetailIfNeeded(detailRow, d.device_id);
+      }
+    }
+  }
+  window._deviceCapsHash[d.device_id] = d.capabilities_hash;
+  return true;
+}
+
+// In-flight map so a slow row-html fetch doesn't get re-issued by a
+// subsequent poll firing before the first response lands. We track
+// "still wanted" too: if the device disappears from a JSON poll
+// between issuing the fetch and the response landing, we drop the
+// response on the floor instead of appending an orphan row.
+//
+// Race we're guarding against:
+//   poll N    : device X present  -> insertNewDeviceRow(X) starts fetch
+//   poll N+1  : device X absent   -> removeDeviceRow(X) is a no-op
+//                                    (no summary row in DOM yet)
+//   fetch lands: append row for X -> orphan row until next poll
+// With the "still wanted" gate, removeDeviceRow clears the pending
+// flag and the late response is discarded.
+window._pendingInserts = window._pendingInserts || {};
+
+function insertNewDeviceRow(deviceId, capsHash, tbody) {
+  if (window._pendingInserts[deviceId]) return;
+  window._pendingInserts[deviceId] = true;
+  fetch('/api/devices/' + encodeURIComponent(deviceId) + '/row-html?tenant={{ tenant }}',
+        { credentials: 'same-origin' })
+    .then(function(r) { return r.ok ? r.text() : null; })
+    .then(function(html) {
+      // Was this insert cancelled by a removeDeviceRow that ran
+      // between fetch start and response landing? If so the flag is
+      // gone -- discard the response.
+      if (!window._pendingInserts[deviceId]) return;
+      delete window._pendingInserts[deviceId];
+      if (!html) return;
+      // Re-check the DOM: a parallel poll may have already inserted
+      // the row, in which case we must not duplicate it.
+      if (document.getElementById('summary-' + deviceId)) return;
+      var liveTbody = document.querySelector('#live-devices tbody');
+      if (!liveTbody) return;
+      var holder = document.createElement('tbody');
+      holder.innerHTML = html;
+      while (holder.firstChild) liveTbody.appendChild(holder.firstChild);
+      window._deviceCapsHash[deviceId] = capsHash;
+    })
+    .catch(function() { delete window._pendingInserts[deviceId]; });
+}
+
+function removeDeviceRow(deviceId) {
+  var summary = document.getElementById('summary-' + deviceId);
+  var detail = document.getElementById('detail-' + deviceId);
+  if (summary && summary.parentElement) summary.parentElement.removeChild(summary);
+  if (detail && detail.parentElement) detail.parentElement.removeChild(detail);
+  delete window._deviceCapsHash[deviceId];
+  // Cancel any in-flight insertNewDeviceRow fetch for this id: the
+  // response landing after we've decided this device is gone would
+  // otherwise re-add an orphan row.
+  delete window._pendingInserts[deviceId];
+  // Tear down any live SSE drawer for this device.
+  if (typeof closeEventLog === 'function') closeEventLog(deviceId);
 }
 
-// Pause auto-refresh while any detail row is expanded
-document.getElementById('live-devices').addEventListener('htmx:beforeRequest', function(evt) {
-  if (window._expandedDevices.size > 0) {
-    evt.preventDefault();
+// One-shot full-fragment swap, used only across the empty<->non-empty
+// transition where the table itself either doesn't exist yet or has
+// to be replaced by the empty-state placeholder. Incremental updates
+// can't bridge that gap because there's no <tbody> to append to /
+// remove from.
+function reloadLiveFragment() {
+  // Route through htmx (not a raw fetch + innerHTML) so the fragment's
+  // hx-swap-oob counter spans are relocated into the header cards
+  // instead of rendering as literal text below the table.
+  if (window.htmx) {
+    htmx.ajax('GET', '/api/devices/live?tenant={{ tenant }}', {
+      target: '#live-devices',
+      swap: 'innerHTML',
+    });
   }
-});
+}
+
+function pollDevices() {
+  fetch('/api/devices/live.json?tenant={{ tenant }}', { credentials: 'same-origin' })
+    .then(function(r) { return r.ok ? r.json() : null; })
+    .then(function(data) {
+      if (!data) return;
+      var oc = document.getElementById('online-count');
+      if (oc) oc.textContent = data.counts.online;
+      var rc = document.getElementById('registered-count');
+      if (rc) rc.textContent = data.counts.registered;
+      var cc = document.getElementById('creds-count');
+      if (cc) cc.textContent = data.counts.creds;
+
+      var tbody = document.querySelector('#live-devices tbody');
+      var jsonHasDevices = data.devices.length > 0;
+
+      // Empty<->non-empty transitions need to swap the whole fragment
+      // (either we need the table scaffolding or the empty-state div).
+      if (jsonHasDevices !== !!tbody) {
+        reloadLiveFragment();
+        return;
+      }
+      if (!tbody) return;  // empty fleet, empty DOM — nothing to do
+
+      var seen = {};
+      data.devices.forEach(function(d) {
+        seen[d.device_id] = true;
+        if (!updateRowFromJson(d)) {
+          insertNewDeviceRow(d.device_id, d.capabilities_hash, tbody);
+        }
+      });
+
+      // Remove rows for devices that disappeared from the JSON.
+      Array.prototype.forEach.call(
+        tbody.querySelectorAll('tr[id^="summary-"]'),
+        function(row) {
+          var id = row.id.substring('summary-'.length);
+          if (!seen[id]) removeDeviceRow(id);
+        },
+      );
+    })
+    .catch(function() { /* transient failures are silently retried on the next tick */ });
+}
+
+setInterval(pollDevices, 10000);
 
 // --- RPC Invoke ---
 window._invokeState = {};
@@ -154,15 +364,23 @@ <h2 class="text-lg font-semibold text-gray-900">Live Devices</h2>
 }
 
 // --- Event Log (SSE) ---
+//
+// The panel is rendered server-side inside each device's detail row
+// (devices/_live_detail.html). openEventLog finds the existing panel,
+// fills in the event name, un-hides it, and binds an EventSource to
+// the entries div. Safe to live in the detail slot because the
+// JSON-poll architecture only re-fetches that slot on capability
+// change, and updateRowFromJson() closes any open stream first.
 window._eventSources = {};
 
 function openEventLog(deviceId, eventName) {
-  // Close existing stream for this device
+  // Close any existing stream for this device first.
   closeEventLog(deviceId);
 
   var panel = document.getElementById('eventlog-' + deviceId);
   var nameEl = document.getElementById('eventlog-name-' + deviceId);
   var entries = document.getElementById('eventlog-entries-' + deviceId);
+  if (!panel || !nameEl || !entries) return;
 
   nameEl.textContent = eventName;
   entries.innerHTML = '<p class="text-gray-400 italic">Listening for events...</p>';
@@ -172,7 +390,6 @@ <h2 class="text-lg font-semibold text-gray-900">Live Devices</h2>
   window._eventSources[deviceId] = es;
 
   es.onmessage = function(e) {
-    // Remove placeholder
     var placeholder = entries.querySelector('p.italic');
     if (placeholder) placeholder.remove();
 
@@ -198,11 +415,7 @@ <h2 class="text-lg font-semibold text-gray-900">Live Devices</h2>
     };
 
     entries.appendChild(row);
-
-    // Cap at 100 entries
     while (entries.children.length > 100) entries.removeChild(entries.firstChild);
-
-    // Auto-scroll
     entries.scrollTop = entries.scrollHeight;
   };
 
diff --git a/packages/device-connect-server/device_connect_server/portal/templates/devices/_device_row.html b/packages/device-connect-server/device_connect_server/portal/templates/devices/_device_row.html
index 4d397d4..275454e 100644
--- a/packages/device-connect-server/device_connect_server/portal/templates/devices/_device_row.html
+++ b/packages/device-connect-server/device_connect_server/portal/templates/devices/_device_row.html
@@ -1,4 +1,12 @@
-<div class="px-5 py-4 border-b border-gray-100 hover:bg-gray-50 flex items-center justify-between bg-green-50" style="transition: background-color 2s ease-out">
+{# Single device credential row, used by both the /devices listing
+   and the create-device htmx response. `highlight` (default false)
+   toggles a brief green flash for newly-created rows. The wrapping
+   id="cred-row-{device_id}" gives the Revoke button a stable
+   hx-target for `hx-swap="delete"` to remove just this row on a
+   successful revoke. #}
+<div id="cred-row-{{ cred.device_id }}"
+     class="cred-row px-5 py-4 border-b border-gray-100 last:border-b-0 hover:bg-gray-50 flex items-center justify-between{% if highlight %} bg-green-50{% endif %}"
+     {% if highlight %}style="transition: background-color 2s ease-out"{% endif %}>
   <div>
     <div class="text-sm font-medium text-gray-900">{{ cred.device_id }}</div>
     <div class="text-xs text-gray-400 mt-0.5">{{ cred.filename }}</div>
@@ -9,5 +17,13 @@
       <svg class="w-3.5 h-3.5 mr-1" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M4 16v1a3 3 0 003 3h10a3 3 0 003-3v-1m-4-4l-4 4m0 0l-4-4m4 4V4"/></svg>
       Download
     </a>
+    <button hx-post="/api/devices/{{ cred.device_id }}/revoke"
+            hx-confirm="Revoke {{ cred.device_id }}? The device will be disconnected from the broker and its credential file deleted. This cannot be undone."
+            hx-target="#cred-row-{{ cred.device_id }}"
+            hx-swap="delete"
+            class="inline-flex items-center px-3 py-1.5 bg-red-50 text-red-700 text-xs font-medium rounded-lg hover:bg-red-100 transition-colors">
+      <svg class="w-3.5 h-3.5 mr-1" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M19 7l-.867 12.142A2 2 0 0116.138 21H7.862a2 2 0 01-1.995-1.858L5 7m5 4v6m4-6v6M1 7h22M9 7V4a1 1 0 011-1h4a1 1 0 011 1v3"/></svg>
+      Revoke
+    </button>
   </div>
 </div>
diff --git a/packages/device-connect-server/device_connect_server/portal/templates/devices/_device_row_pair.html b/packages/device-connect-server/device_connect_server/portal/templates/devices/_device_row_pair.html
new file mode 100644
index 0000000..b3d372d
--- /dev/null
+++ b/packages/device-connect-server/device_connect_server/portal/templates/devices/_device_row_pair.html
@@ -0,0 +1,56 @@
+{# Summary + detail row pair for one device.
+
+   Used in two places:
+     1. `_live_table.html` — server-side render of the full table on
+        initial page load and on the one-shot htmx `load` fragment swap.
+     2. The `/api/devices/{id}/row-html` endpoint — fetched by the
+        dashboard JSON poll when a brand new device appears so we can
+        append a fresh row pair without forcing a page reload.
+
+   Both summary cells (.cell-status / .cell-location / .cell-lastseen)
+   and the row id (#summary-{device_id}) are the same hooks the JSON
+   poll updates in place on subsequent ticks. #}
+<tr id="summary-{{ device.device_id }}"
+    class="hover:bg-gray-50 cursor-pointer"
+    onclick="toggleDetail('{{ device.device_id }}')">
+  <td class="px-5 py-3">
+    <div class="flex items-center gap-2">
+      <svg id="chevron-{{ device.device_id }}" class="w-4 h-4 text-gray-400 transition-transform duration-200" fill="none" stroke="currentColor" viewBox="0 0 24 24">
+        <path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M9 5l7 7-7 7"/>
+      </svg>
+      <span class="text-sm font-medium text-gray-900">{{ device.device_id }}</span>
+    </div>
+  </td>
+  <td class="px-5 py-3">
+    <span class="text-sm text-gray-600">{{ device.device_type }}</span>
+  </td>
+  <td class="px-5 py-3 cell-status">
+    {% if device.status == "available" %}
+    <span class="inline-flex items-center px-2 py-0.5 rounded-full text-xs font-medium bg-green-100 text-green-800">
+      <span class="w-1.5 h-1.5 bg-green-500 rounded-full mr-1 pulse-dot"></span>
+      online
+    </span>
+    {% else %}
+    <span class="inline-flex items-center px-2 py-0.5 rounded-full text-xs font-medium bg-gray-100 text-gray-600">
+      <span class="w-1.5 h-1.5 bg-gray-400 rounded-full mr-1"></span>
+      {{ device.status }}
+    </span>
+    {% endif %}
+  </td>
+  <td class="px-5 py-3 text-sm text-gray-600 cell-location">{{ device.location or "—" }}</td>
+  <td class="px-5 py-3 text-sm text-gray-400 cell-lastseen">{{ device.last_seen or "—" }}</td>
+</tr>
+<tr id="detail-{{ device.device_id }}" class="hidden"
+    data-detail-url="/api/devices/{{ device.device_id }}/live-detail{% if tenant %}?tenant={{ tenant }}{% endif %}">
+  <td colspan="5" class="px-5 py-4 bg-gray-50">
+    {# Per-device detail is lazy-loaded on first expand by
+       toggleDetail() in dashboard.html. We deliberately do not use
+       ``hx-trigger=revealed``: htmx's revealed implementation polls
+       getBoundingClientRect, and display:none elements report a
+       zero-rect at (0,0) which falsely counts as "in viewport",
+       so revealed would fire for every hidden row on page load. #}
+    <div class="detail-content">
+      <p class="text-xs text-gray-400 italic">Loading details&hellip;</p>
+    </div>
+  </td>
+</tr>
diff --git a/packages/device-connect-server/device_connect_server/portal/templates/devices/_live_detail.html b/packages/device-connect-server/device_connect_server/portal/templates/devices/_live_detail.html
new file mode 100644
index 0000000..1f84015
--- /dev/null
+++ b/packages/device-connect-server/device_connect_server/portal/templates/devices/_live_detail.html
@@ -0,0 +1,119 @@
+<div class="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-4">
+
+  {# Capabilities: functions #}
+  {% set caps = device.capabilities or {} %}
+  <div>
+    <h4 class="text-xs font-semibold text-gray-500 uppercase tracking-wider mb-2">Functions (RPCs)</h4>
+    {% if caps.functions %}
+    <div class="space-y-1.5">
+      {% for fn in caps.functions %}
+      <div class="bg-white rounded-lg border border-gray-200 px-3 py-2">
+        <code class="text-xs font-medium text-indigo-700">{{ fn.name }}({% for pname, pspec in (fn.parameters.properties or {}).items() %}{{ pname }}{% if pspec.default is defined %}={{ pspec.default | tojson }}{% endif %}{% if not loop.last %}, {% endif %}{% endfor %})</code>
+        {% if fn.description %}
+        <p class="text-xs text-gray-500 mt-0.5">{{ fn.description }}</p>
+        {% endif %}
+        <button onclick="event.stopPropagation(); openInvoke('{{ device.device_id }}', '{{ fn.name }}', {{ fn.parameters | tojson_pretty | e }})"
+                class="mt-1.5 text-xs text-indigo-600 hover:text-indigo-800 font-medium">
+          Try &rarr;
+        </button>
+      </div>
+      {% endfor %}
+    </div>
+    {% else %}
+    <p class="text-xs text-gray-400 italic">No RPC functions registered</p>
+    {% endif %}
+  </div>
+
+  {# Capabilities: events #}
+  <div>
+    <h4 class="text-xs font-semibold text-gray-500 uppercase tracking-wider mb-2">Events</h4>
+    {% if caps.events %}
+    <div class="space-y-1.5">
+      {% for ev in caps.events %}
+      <div class="bg-white rounded-lg border border-gray-200 px-3 py-2">
+        <code class="text-xs font-medium text-green-700">{{ ev.name }}</code>
+        {% if ev.description %}
+        <p class="text-xs text-gray-500 mt-0.5">{{ ev.description }}</p>
+        {% endif %}
+        <button onclick="event.stopPropagation(); openEventLog('{{ device.device_id }}', '{{ ev.name }}')"
+                class="mt-1.5 text-xs text-green-600 hover:text-green-800 font-medium">
+          Live log &rarr;
+        </button>
+      </div>
+      {% endfor %}
+    </div>
+    {% else %}
+    <p class="text-xs text-gray-400 italic">No events registered</p>
+    {% endif %}
+  </div>
+
+  {# Identity #}
+  {% set ident = device._raw.identity or {} if device._raw else {} %}
+  <div>
+    <h4 class="text-xs font-semibold text-gray-500 uppercase tracking-wider mb-2">Identity</h4>
+    <div class="bg-white rounded-lg border border-gray-200 px-3 py-2">
+      <dl class="space-y-1 text-xs">
+        {% for key in ['manufacturer', 'model', 'firmware_version', 'serial_number', 'arch'] %}
+        {% if ident.get(key) %}
+        <div class="flex justify-between">
+          <dt class="text-gray-500">{{ key | replace('_', ' ') | title }}</dt>
+          <dd class="text-gray-900 font-medium">{{ ident[key] }}</dd>
+        </div>
+        {% endif %}
+        {% endfor %}
+      </dl>
+    </div>
+  </div>
+
+</div>
+
+{# Invoke panel (shown by Try button) #}
+<div id="invoke-{{ device.device_id }}" class="hidden mt-4 bg-white rounded-lg border border-indigo-200 p-4">
+  <div class="flex items-center justify-between mb-3">
+    <h4 class="text-xs font-semibold text-indigo-700 uppercase tracking-wider">
+      Invoke <code id="invoke-fn-{{ device.device_id }}" class="text-sm normal-case"></code>
+    </h4>
+    <button onclick="event.stopPropagation(); closeInvoke('{{ device.device_id }}')"
+            class="text-gray-400 hover:text-gray-600 text-xs">&times; Close</button>
+  </div>
+  <div id="invoke-params-{{ device.device_id }}" class="space-y-2 mb-3"></div>
+  <div class="flex items-center gap-3">
+    <button onclick="event.stopPropagation(); submitInvoke('{{ device.device_id }}')"
+            class="px-3 py-1.5 bg-indigo-600 text-white text-xs font-medium rounded-lg hover:bg-indigo-700 transition-colors">
+      Run
+    </button>
+    <span id="invoke-status-{{ device.device_id }}" class="text-xs text-gray-400"></span>
+  </div>
+  <pre id="invoke-result-{{ device.device_id }}" class="hidden mt-3 bg-gray-900 text-gray-100 rounded-lg p-4 overflow-x-auto text-xs max-h-48 overflow-y-auto"></pre>
+</div>
+
+{# Event log panel (shown by Live log button).
+
+   Lives inside the detail row so the log appears next to the device
+   it's streaming from — important UX at 100s of devices. Safe to
+   keep here because the JSON-poll architecture leaves the detail
+   slot untouched on every 10s tick; the only way the slot gets
+   re-fetched is a capability hash change, at which point the
+   client closes any open stream before the refetch (the events
+   list might have changed too). #}
+<div id="eventlog-{{ device.device_id }}" class="hidden mt-4 bg-white rounded-lg border border-green-200 p-4">
+  <div class="flex items-center justify-between mb-3">
+    <h4 class="text-xs font-semibold text-green-700 uppercase tracking-wider">
+      <span class="inline-flex items-center">
+        <span class="w-2 h-2 bg-green-400 rounded-full pulse-dot mr-1.5"></span>
+        Event log: <code id="eventlog-name-{{ device.device_id }}" class="text-sm normal-case ml-1"></code>
+      </span>
+    </h4>
+    <button onclick="event.stopPropagation(); closeEventLog('{{ device.device_id }}')"
+            class="text-gray-400 hover:text-gray-600 text-xs">&times; Close</button>
+  </div>
+  <div id="eventlog-entries-{{ device.device_id }}" class="space-y-1 max-h-64 overflow-y-auto text-xs">
+    <p class="text-gray-400 italic">Listening for events...</p>
+  </div>
+</div>
+
+{# Collapsible raw JSON #}
+<details class="mt-4">
+  <summary class="text-xs text-gray-400 hover:text-gray-600 cursor-pointer select-none">Raw registration data</summary>
+  <pre class="mt-2 bg-gray-900 text-gray-100 rounded-lg p-4 overflow-x-auto text-xs max-h-80 overflow-y-auto">{{ device._raw | tojson_pretty }}</pre>
+</details>
diff --git a/packages/device-connect-server/device_connect_server/portal/templates/devices/_live_table.html b/packages/device-connect-server/device_connect_server/portal/templates/devices/_live_table.html
index 9eb4901..2d5cd26 100644
--- a/packages/device-connect-server/device_connect_server/portal/templates/devices/_live_table.html
+++ b/packages/device-connect-server/device_connect_server/portal/templates/devices/_live_table.html
@@ -10,151 +10,7 @@
     </tr>
   </thead>
   <tbody class="divide-y divide-gray-100">
-    {% for device in devices %}
-    <tr class="hover:bg-gray-50 cursor-pointer" onclick="toggleDetail('{{ device.device_id }}')">
-      <td class="px-5 py-3">
-        <div class="flex items-center gap-2">
-          <svg id="chevron-{{ device.device_id }}" class="w-4 h-4 text-gray-400 transition-transform duration-200" fill="none" stroke="currentColor" viewBox="0 0 24 24">
-            <path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M9 5l7 7-7 7"/>
-          </svg>
-          <span class="text-sm font-medium text-gray-900">{{ device.device_id }}</span>
-        </div>
-      </td>
-      <td class="px-5 py-3">
-        <span class="text-sm text-gray-600">{{ device.device_type }}</span>
-      </td>
-      <td class="px-5 py-3">
-        {% if device.status == "available" %}
-        <span class="inline-flex items-center px-2 py-0.5 rounded-full text-xs font-medium bg-green-100 text-green-800">
-          <span class="w-1.5 h-1.5 bg-green-500 rounded-full mr-1 pulse-dot"></span>
-          online
-        </span>
-        {% else %}
-        <span class="inline-flex items-center px-2 py-0.5 rounded-full text-xs font-medium bg-gray-100 text-gray-600">
-          <span class="w-1.5 h-1.5 bg-gray-400 rounded-full mr-1"></span>
-          {{ device.status }}
-        </span>
-        {% endif %}
-      </td>
-      <td class="px-5 py-3 text-sm text-gray-600">{{ device.location or "—" }}</td>
-      <td class="px-5 py-3 text-sm text-gray-400">{{ device.last_seen or "—" }}</td>
-    </tr>
-    <tr id="detail-{{ device.device_id }}" class="hidden">
-      <td colspan="5" class="px-5 py-4 bg-gray-50">
-        <div class="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-4">
-
-          {# Capabilities: functions #}
-          {% set caps = device.capabilities or {} %}
-          <div>
-            <h4 class="text-xs font-semibold text-gray-500 uppercase tracking-wider mb-2">Functions (RPCs)</h4>
-            {% if caps.functions %}
-            <div class="space-y-1.5">
-              {% for fn in caps.functions %}
-              <div class="bg-white rounded-lg border border-gray-200 px-3 py-2">
-                <code class="text-xs font-medium text-indigo-700">{{ fn.name }}({% for pname, pspec in (fn.parameters.properties or {}).items() %}{{ pname }}{% if pspec.default is defined %}={{ pspec.default | tojson }}{% endif %}{% if not loop.last %}, {% endif %}{% endfor %})</code>
-                {% if fn.description %}
-                <p class="text-xs text-gray-500 mt-0.5">{{ fn.description }}</p>
-                {% endif %}
-                <button onclick="event.stopPropagation(); openInvoke('{{ device.device_id }}', '{{ fn.name }}', {{ fn.parameters | tojson_pretty | e }})"
-                        class="mt-1.5 text-xs text-indigo-600 hover:text-indigo-800 font-medium">
-                  Try &rarr;
-                </button>
-              </div>
-              {% endfor %}
-            </div>
-            {% else %}
-            <p class="text-xs text-gray-400 italic">No RPC functions registered</p>
-            {% endif %}
-          </div>
-
-          {# Capabilities: events #}
-          <div>
-            <h4 class="text-xs font-semibold text-gray-500 uppercase tracking-wider mb-2">Events</h4>
-            {% if caps.events %}
-            <div class="space-y-1.5">
-              {% for ev in caps.events %}
-              <div class="bg-white rounded-lg border border-gray-200 px-3 py-2">
-                <code class="text-xs font-medium text-green-700">{{ ev.name }}</code>
-                {% if ev.description %}
-                <p class="text-xs text-gray-500 mt-0.5">{{ ev.description }}</p>
-                {% endif %}
-                <button onclick="event.stopPropagation(); openEventLog('{{ device.device_id }}', '{{ ev.name }}')"
-                        class="mt-1.5 text-xs text-green-600 hover:text-green-800 font-medium">
-                  Live log &rarr;
-                </button>
-              </div>
-              {% endfor %}
-            </div>
-            {% else %}
-            <p class="text-xs text-gray-400 italic">No events registered</p>
-            {% endif %}
-          </div>
-
-          {# Identity #}
-          {% set ident = device._raw.identity or {} if device._raw else {} %}
-          <div>
-            <h4 class="text-xs font-semibold text-gray-500 uppercase tracking-wider mb-2">Identity</h4>
-            <div class="bg-white rounded-lg border border-gray-200 px-3 py-2">
-              <dl class="space-y-1 text-xs">
-                {% for key in ['manufacturer', 'model', 'firmware_version', 'serial_number', 'arch'] %}
-                {% if ident.get(key) %}
-                <div class="flex justify-between">
-                  <dt class="text-gray-500">{{ key | replace('_', ' ') | title }}</dt>
-                  <dd class="text-gray-900 font-medium">{{ ident[key] }}</dd>
-                </div>
-                {% endif %}
-                {% endfor %}
-              </dl>
-            </div>
-          </div>
-
-        </div>
-
-        {# Invoke panel (shown by Try button) #}
-        <div id="invoke-{{ device.device_id }}" class="hidden mt-4 bg-white rounded-lg border border-indigo-200 p-4">
-          <div class="flex items-center justify-between mb-3">
-            <h4 class="text-xs font-semibold text-indigo-700 uppercase tracking-wider">
-              Invoke <code id="invoke-fn-{{ device.device_id }}" class="text-sm normal-case"></code>
-            </h4>
-            <button onclick="event.stopPropagation(); closeInvoke('{{ device.device_id }}')"
-                    class="text-gray-400 hover:text-gray-600 text-xs">&times; Close</button>
-          </div>
-          <div id="invoke-params-{{ device.device_id }}" class="space-y-2 mb-3"></div>
-          <div class="flex items-center gap-3">
-            <button onclick="event.stopPropagation(); submitInvoke('{{ device.device_id }}')"
-                    class="px-3 py-1.5 bg-indigo-600 text-white text-xs font-medium rounded-lg hover:bg-indigo-700 transition-colors">
-              Run
-            </button>
-            <span id="invoke-status-{{ device.device_id }}" class="text-xs text-gray-400"></span>
-          </div>
-          <pre id="invoke-result-{{ device.device_id }}" class="hidden mt-3 bg-gray-900 text-gray-100 rounded-lg p-4 overflow-x-auto text-xs max-h-48 overflow-y-auto"></pre>
-        </div>
-
-        {# Event log panel (shown by Live log button) #}
-        <div id="eventlog-{{ device.device_id }}" class="hidden mt-4 bg-white rounded-lg border border-green-200 p-4">
-          <div class="flex items-center justify-between mb-3">
-            <h4 class="text-xs font-semibold text-green-700 uppercase tracking-wider">
-              <span class="inline-flex items-center">
-                <span class="w-2 h-2 bg-green-400 rounded-full pulse-dot mr-1.5"></span>
-                Event log: <code id="eventlog-name-{{ device.device_id }}" class="text-sm normal-case ml-1"></code>
-              </span>
-            </h4>
-            <button onclick="event.stopPropagation(); closeEventLog('{{ device.device_id }}')"
-                    class="text-gray-400 hover:text-gray-600 text-xs">&times; Close</button>
-          </div>
-          <div id="eventlog-entries-{{ device.device_id }}" class="space-y-1 max-h-64 overflow-y-auto text-xs">
-            <p class="text-gray-400 italic">Listening for events...</p>
-          </div>
-        </div>
-
-        {# Collapsible raw JSON #}
-        <details class="mt-4">
-          <summary class="text-xs text-gray-400 hover:text-gray-600 cursor-pointer select-none">Raw registration data</summary>
-          <pre class="mt-2 bg-gray-900 text-gray-100 rounded-lg p-4 overflow-x-auto text-xs max-h-80 overflow-y-auto">{{ device._raw | tojson_pretty }}</pre>
-        </details>
-      </td>
-    </tr>
-    {% endfor %}
+    {% for device in devices %}{% include "devices/_device_row_pair.html" %}{% endfor %}
   </tbody>
 </table>
 
@@ -167,3 +23,10 @@ <h4 class="text-xs font-semibold text-green-700 uppercase tracking-wider">
   <p class="text-xs text-gray-400 mt-1">Devices will appear here as they connect to the network</p>
 </div>
 {% endif %}
+
+{# Out-of-band card refresh: each htmx swap of #live-devices also
+   updates the header counters in place, so they don't freeze at the
+   value captured during the initial dashboard render. #}
+<span id="online-count" hx-swap-oob="innerHTML">{{ online_count }}</span>
+<span id="registered-count" hx-swap-oob="innerHTML">{{ registered_count }}</span>
+<span id="creds-count" hx-swap-oob="innerHTML">{{ creds_count }}</span>
diff --git a/packages/device-connect-server/device_connect_server/portal/templates/devices/list.html b/packages/device-connect-server/device_connect_server/portal/templates/devices/list.html
index 36b6103..eb02c7e 100644
--- a/packages/device-connect-server/device_connect_server/portal/templates/devices/list.html
+++ b/packages/device-connect-server/device_connect_server/portal/templates/devices/list.html
@@ -33,21 +33,7 @@ <h2 class="text-lg font-semibold text-gray-900">Device Credentials</h2>
   </div>
   <div id="device-list">
     {% if credentials %}
-    {% for cred in credentials %}
-    <div class="px-5 py-4 border-b border-gray-100 last:border-b-0 hover:bg-gray-50 flex items-center justify-between">
-      <div>
-        <div class="text-sm font-medium text-gray-900">{{ cred.device_id }}</div>
-        <div class="text-xs text-gray-400 mt-0.5">{{ cred.filename }}</div>
-      </div>
-      <div class="flex items-center gap-3">
-        <a href="/api/devices/{{ cred.device_id }}/creds"
-           class="inline-flex items-center px-3 py-1.5 bg-gray-100 text-gray-700 text-xs font-medium rounded-lg hover:bg-gray-200 transition-colors">
-          <svg class="w-3.5 h-3.5 mr-1" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M4 16v1a3 3 0 003 3h10a3 3 0 003-3v-1m-4-4l-4 4m0 0l-4-4m4 4V4"/></svg>
-          Download
-        </a>
-      </div>
-    </div>
-    {% endfor %}
+    {% for cred in credentials %}{% include "devices/_device_row.html" %}{% endfor %}
     {% else %}
     <div class="px-5 py-8 text-center">
       <p class="text-sm text-gray-500">No device credentials yet</p>
diff --git a/packages/device-connect-server/device_connect_server/portal/templates/login.html b/packages/device-connect-server/device_connect_server/portal/templates/login.html
index 6cb95fc..0bddc5f 100644
--- a/packages/device-connect-server/device_connect_server/portal/templates/login.html
+++ b/packages/device-connect-server/device_connect_server/portal/templates/login.html
@@ -9,6 +9,10 @@
 </head>
 <body class="h-full flex items-center justify-center">
   <div class="w-full max-w-sm">
+    <div class="mb-6 rounded-lg p-3 bg-amber-50 border border-amber-300 text-amber-900 text-sm text-center">
+      <p class="font-semibold">Testing version only</p>
+      <p class="text-xs mt-1">This is not a production version. Do not use with real or sensitive data.</p>
+    </div>
     <div class="text-center mb-8">
       <div class="inline-flex items-center justify-center w-12 h-12 bg-indigo-600 rounded-xl mb-4">
         <svg class="w-7 h-7 text-white" fill="none" stroke="currentColor" viewBox="0 0 24 24">
diff --git a/packages/device-connect-server/device_connect_server/portal/templates/signup.html b/packages/device-connect-server/device_connect_server/portal/templates/signup.html
index b857fae..46f93b7 100644
--- a/packages/device-connect-server/device_connect_server/portal/templates/signup.html
+++ b/packages/device-connect-server/device_connect_server/portal/templates/signup.html
@@ -9,6 +9,10 @@
 </head>
 <body class="h-full flex items-center justify-center">
   <div class="w-full max-w-sm">
+    <div class="mb-6 rounded-lg p-3 bg-amber-50 border border-amber-300 text-amber-900 text-sm text-center">
+      <p class="font-semibold">Testing version only</p>
+      <p class="text-xs mt-1">This is not a production version. Do not use with real or sensitive data.</p>
+    </div>
     <div class="text-center mb-8">
       <div class="inline-flex items-center justify-center w-12 h-12 bg-indigo-600 rounded-xl mb-4">
         <svg class="w-7 h-7 text-white" fill="none" stroke="currentColor" viewBox="0 0 24 24">
diff --git a/packages/device-connect-server/device_connect_server/portal/views/agent_api.py b/packages/device-connect-server/device_connect_server/portal/views/agent_api.py
index 0aae37c..e458a19 100644
--- a/packages/device-connect-server/device_connect_server/portal/views/agent_api.py
+++ b/packages/device-connect-server/device_connect_server/portal/views/agent_api.py
@@ -72,6 +72,7 @@ def setup_routes(app: web.Application):
     r.add_get(PREFIX + "/devices/{device_id}/events", device_events)
     r.add_get(PREFIX + "/devices/{device_id}/credentials", device_credentials_get)
     r.add_post(PREFIX + "/devices/{device_id}/credentials:rotate", device_credentials_rotate)
+    r.add_post(PREFIX + "/devices/{device_id}/revoke", device_revoke)
     r.add_post(PREFIX + "/devices/{device_id}/invoke", device_invoke)
     r.add_post(PREFIX + "/invoke-with-fallback", invoke_with_fallback)
     r.add_get(
@@ -576,6 +577,78 @@ async def device_credentials_rotate(request: web.Request) -> web.Response:
     return _ok({"filename": filename, "content": cred_data}, trace_id=trace)
 
 
+async def device_revoke(request: web.Request) -> web.Response:
+    """Revoke a device: kill the backend account AND delete the local cred file.
+
+    This is the full-revocation counterpart to ``device_delete`` (which
+    only decommissions the backend account) and to ``credentials:rotate``
+    (which only re-issues). After ``revoke`` the device cannot reconnect
+    and disappears from the portal credentials list.
+
+    Requires ``devices:provision`` — same scope as delete, since the
+    operation is at least as destructive.
+    """
+    trace = _trace_id()
+    _, err = _require_scope(request, "devices:provision")
+    if err:
+        return err
+    tenant, err = _resolve_tenant(request)
+    if err:
+        return err
+
+    device_id = request.match_info["device_id"]
+    full_name = _full_device_name(tenant, device_id)
+    filename = f"{full_name}.creds.json"
+
+    # Same shape as the portal handler: a backend without remove_device
+    # is a soft success (file still deleted so the operator's UI matches
+    # intent), but a hard failure from a backend that DOES support
+    # remove_device leaves the file in place so the operator can retry.
+    # Otherwise we'd risk a ghost file pointing at a still-valid account.
+    backend = get_backend()
+    remove = getattr(backend, "remove_device", None)
+    backend_supported = remove is not None
+    backend_error: str | None = None
+    if backend_supported:
+        try:
+            await remove(tenant, full_name)
+            await backend.reload_broker()
+        except Exception as e:
+            logger.exception("revoke: backend remove failed for %s/%s", tenant, full_name)
+            backend_error = str(e)
+    else:
+        backend_error = f"{backend.backend_name()} backend does not support remove_device"
+
+    if backend_supported and backend_error is not None:
+        # Hard backend failure: report it and keep the file. 502 tells
+        # the caller the upstream broker is the problem, not the portal.
+        return _err(status=502, code="backend_revoke_failed",
+                    message=f"Backend revocation failed for {full_name}: "
+                            f"{backend_error}. Credential file left in place; "
+                            f"retry once the backend is healthy.",
+                    trace_id=trace)
+
+    deleted = credentials_svc.delete_credential(filename)
+    if not deleted:
+        # No local file was removed. If the backend removal above
+        # succeeded, the account is gone regardless: audit that
+        # destructive step, then report the partial-success state.
+        message = f"Credential file not found: {filename}"
+        if backend_supported and backend_error is None:
+            _audit(request, "revoke", trace_id=trace, device_id=full_name)
+            message += (
+                ". Backend account was already revoked; only the local "
+                "file is missing.")
+        return _err(status=404, code="not_found",
+                    message=message, trace_id=trace)
+
+    _audit(request, "revoke", trace_id=trace, device_id=full_name)
+    result = {"device_id": full_name, "revoked": True}
+    if backend_error:
+        result["backend_warning"] = backend_error
+    return _ok(result, trace_id=trace)
+
+
 async def device_delete(request: web.Request) -> web.Response:
     """Decommission a device. Requires devices:provision."""
     trace = _trace_id()
diff --git a/packages/device-connect-server/device_connect_server/portal/views/auth.py b/packages/device-connect-server/device_connect_server/portal/views/auth.py
index c2112b1..08ad513 100644
--- a/packages/device-connect-server/device_connect_server/portal/views/auth.py
+++ b/packages/device-connect-server/device_connect_server/portal/views/auth.py
@@ -33,13 +33,26 @@ async def root_page(request: web.Request):
 
 
 def _safe_next(value: str | None) -> str | None:
-    """Accept only relative paths so an attacker can't bounce login to an external site."""
+    """Accept only same-site relative paths that lead to a full page.
+
+    Returns ``value`` unchanged when it is safe, otherwise ``None``.
+    Rejects:
+    - external/protocol-relative URLs (open-redirect protection),
+      including ``/\\host`` and tab-split ``//``, which browsers normalize
+    - CR/LF (header-injection protection)
+    - ``/api/`` and ``/static/`` paths: JSON, HTML fragments and static
+      assets make chrome-less post-login destinations. ``auth_middleware``
+      already avoids capturing these as ``next`` for the dashboard's 10s
+      htmx poll, but a stale ``?next=/api/...`` link could still arrive.
+    """
     if not value:
         return None
     if not value.startswith("/") or value.startswith("//"):
         return None
-    if "\n" in value or "\r" in value:
+    if any(ch in value for ch in "\\\t\r\n"):
         return None
+    if value.startswith(("/api/", "/static/")):
+        return None
     return value
 
 
diff --git a/packages/device-connect-server/device_connect_server/portal/views/dashboard.py b/packages/device-connect-server/device_connect_server/portal/views/dashboard.py
index bf702e4..c5133d1 100644
--- a/packages/device-connect-server/device_connect_server/portal/views/dashboard.py
+++ b/packages/device-connect-server/device_connect_server/portal/views/dashboard.py
@@ -5,7 +5,9 @@
 """User dashboard with live device polling, RPC invocation, and event streaming."""
 
 import asyncio
+import hashlib
 import json
+import logging
 import time
 
 import aiohttp_jinja2
@@ -14,14 +16,30 @@
 from ..services import credentials, registry_client
 from ..services.backend import get_backend
 
+logger = logging.getLogger(__name__)
+
 
 def setup_routes(app: web.Application):
     app.router.add_get("/dashboard", dashboard_page)
     app.router.add_get("/api/devices/live", live_devices_fragment)
+    app.router.add_get("/api/devices/live.json", live_devices_json)
+    app.router.add_get("/api/devices/{device_id}/row-html", device_row_html_fragment)
+    app.router.add_get("/api/devices/{device_id}/live-detail", live_device_detail_fragment)
     app.router.add_post("/api/devices/{device_id}/invoke", invoke_device_rpc)
     app.router.add_get("/api/devices/{device_id}/events/{event_name}/stream", event_stream)
 
 
+def _capabilities_hash(caps) -> str:
+    """Return a short, stable fingerprint of a device's capabilities.
+
+    The dashboard's JSON poll compares fingerprints to decide whether an
+    expanded detail panel needs a refresh; keys are sorted first, so
+    key order alone never changes the result."""
+    digest = hashlib.sha256()
+    digest.update(json.dumps(caps or {}, sort_keys=True, default=str).encode("utf-8"))
+    return digest.hexdigest()[:16]
+
+
 async def dashboard_page(request: web.Request):
     user = request["user"]
     tenant = user["tenant"]
@@ -47,21 +65,153 @@ async def dashboard_page(request: web.Request):
 
 
 async def live_devices_fragment(request: web.Request):
-    """Return the live devices table as an HTML fragment for htmx polling."""
+    """Return the live devices table as an HTML fragment for htmx polling.
+
+    Only summary rows are rendered here; the per-device detail markup is
+    lazy-loaded from ``/api/devices/{device_id}/live-detail`` when a row
+    is expanded. At fleet scale the detail blocks dominate response size,
+    so deferring them keeps each poll cheap regardless of fleet size.
+    """
     tenant = _resolve_tenant(request)
 
+    # etcd get_prefix + JSON-decode scales with fleet size; run it off
+    # the event loop so other portal requests aren't blocked on the poll.
     devices = []
     try:
-        devices = registry_client.list_live_devices(tenant)
+        devices = await asyncio.to_thread(registry_client.list_live_devices, tenant)
     except Exception:
-        pass
+        # Best-effort: still render an empty table, but leave a trace so
+        # a registry outage is distinguishable from an empty fleet.
+        logger.exception("live devices fragment: registry list failed for %s", tenant)
 
+    # Header card counts are refreshed by piggybacking on this poll via
+    # hx-swap-oob in the template. Without that, dashboard_page renders
+    # them once at first load and they freeze — at fleet scale the user
+    # then sees "1401 online" long after the swarm has shut down.
+    online_count = sum(1 for d in devices if d.get("status") == "available")
+    try:
+        creds_count = len(
+            await asyncio.to_thread(credentials.list_credentials, tenant=tenant),
+        )
+    except Exception:
+        creds_count = 0
+
     return aiohttp_jinja2.render_template("devices/_live_table.html", request, {
         "devices": devices,
+        "tenant": tenant,
+        "online_count": online_count,
+        "registered_count": len(devices),
+        "creds_count": creds_count,
         "user": request.get("user", {}),
     })
 
 
+async def live_devices_json(request: web.Request):
+    """JSON snapshot for the dashboard's in-place poll.
+
+    Replaces the table-wide htmx swap. The client merges these values
+    into existing rows (status pill, location, last-seen) without
+    touching DOM structure, so scroll position and expand/event-log
+    state survive. ``capabilities_hash`` lets the client decide whether
+    an already-expanded detail panel needs re-fetching.
+
+    Cost: ``registry_client.list_live_devices`` paginates through the
+    full tenant fleet via the registry RPC; the registry, in turn,
+    does a full ``etcd get_prefix`` + JSON-decode per page (see
+    ``DeviceRegistry.list_devices_page``). At ~1400 devices with the
+    default 100-device page that's ~14 etcd scans per JSON-poll tick,
+    per dashboard. The pagination fix bounds NATS *payload* size but
+    NOT registry CPU; if the portal is being polled by many concurrent
+    operators, raise ``DEVICE_CONNECT_LIST_PAGE_SIZE`` (with the
+    matching ``DC_LIST_DEVICES_MAX_LIMIT`` and NATS ``max_payload``)
+    or lengthen the client poll interval. Selector pushdown / keyset
+    pagination is the documented next iteration.
+    """
+    tenant = _resolve_tenant(request)
+    devices = []
+    try:
+        devices = await asyncio.to_thread(registry_client.list_live_devices, tenant)
+    except Exception:
+        # Best-effort: answer with an empty snapshot, but log the failure.
+        logger.exception("live.json: registry list failed for %s", tenant)
+    try:
+        creds = await asyncio.to_thread(credentials.list_credentials, tenant=tenant)
+        creds_count = len(creds)
+    except Exception:
+        creds_count = 0
+
+    return web.json_response({
+        "devices": [
+            {
+                "device_id": d.get("device_id"),
+                "device_type": d.get("device_type"),
+                "status": d.get("status"),
+                "location": d.get("location"),
+                "last_seen": d.get("last_seen"),
+                "capabilities_hash": _capabilities_hash(d.get("capabilities")),
+            }
+            for d in devices
+        ],
+        "counts": {
+            "online": sum(1 for d in devices if d.get("status") == "available"),
+            "registered": len(devices),
+            "creds": creds_count,
+        },
+    })
+
+
+async def device_row_html_fragment(request: web.Request):
+    """Render one device's summary+detail row pair as an HTML fragment.
+
+    The dashboard JSON poll requests this when it meets a device_id
+    that has no row in the table yet; the JS appends the markup to
+    ``<tbody>`` rather than reloading the page. Because the initial
+    server-side table render goes through the same Jinja partial, an
+    appended row is structurally identical to one present at page
+    load (same id, cell classes, chevron and lazy-detail URL). A
+    device the registry does not return yields an empty 404.
+    """
+    tenant = _resolve_tenant(request)
+    device_id = request.match_info["device_id"]
+
+    raw = await asyncio.to_thread(registry_client.get_device, tenant, device_id)
+    if raw:
+        row = registry_client.format_live_device(raw)
+        return aiohttp_jinja2.render_template(
+            "devices/_device_row_pair.html", request,
+            {"device": row, "tenant": tenant},
+        )
+    # Nothing came back from the registry: empty 404 body.
+    return web.Response(status=404, text="", content_type="text/html")
+
+
+async def live_device_detail_fragment(request: web.Request):
+    """Serve one device's detail fragment (functions, events, raw JSON).
+
+    Fetched lazily when a row is expanded, so the main polling response
+    stays O(summary) instead of O(summary + every-device-detail).
+    """
+    tenant = _resolve_tenant(request)
+    device_id = request.match_info["device_id"]
+    raw = await asyncio.to_thread(registry_client.get_device, tenant, device_id)
+
+    if raw:
+        # Slim view: id, capabilities, and the untouched registry record.
+        detail = {
+            "device_id": raw.get("device_id", device_id),
+            "capabilities": raw.get("capabilities") or {},
+            "_raw": raw,
+        }
+        return aiohttp_jinja2.render_template(
+            "devices/_live_detail.html", request, {"device": detail},
+        )
+    return web.Response(
+        status=404,
+        text='<p class="text-xs text-red-500">Device not found.</p>',
+        content_type="text/html",
+    )
+
+
 def _resolve_tenant(request: web.Request) -> str:
     """Get tenant from query param (admin override) or session."""
     from ..services.backend import validate_name
@@ -76,6 +226,7 @@ def _resolve_tenant(request: web.Request) -> str:
 
 async def invoke_device_rpc(request: web.Request):
     """Invoke an RPC function on a device via the active messaging backend."""
+    t0 = time.monotonic()
     tenant = _resolve_tenant(request)
     device_id = request.match_info["device_id"]
 
@@ -91,7 +242,16 @@ async def invoke_device_rpc(request: web.Request):
         return web.json_response({"error": {"message": "function is required"}}, status=400)
 
     backend = get_backend()
+    t_pre_rpc = time.monotonic()
     result = await backend.rpc_invoke(tenant, device_id, function, params)
+    t_post_rpc = time.monotonic()
+    logger.info(
+        "invoke %s/%s.%s handler=%.1fms (pre-rpc=%.1fms rpc=%.1fms)",
+        tenant, device_id, function,
+        (t_post_rpc - t0) * 1000,
+        (t_pre_rpc - t0) * 1000,
+        (t_post_rpc - t_pre_rpc) * 1000,
+    )
     return web.json_response(result)
 
 
diff --git a/packages/device-connect-server/device_connect_server/portal/views/devices.py b/packages/device-connect-server/device_connect_server/portal/views/devices.py
index 7f5bf1e..a5247b9 100644
--- a/packages/device-connect-server/device_connect_server/portal/views/devices.py
+++ b/packages/device-connect-server/device_connect_server/portal/views/devices.py
@@ -23,6 +23,7 @@ def setup_routes(app: web.Application):
     app.router.add_get("/api/devices/agent-creds", download_agent_creds)
     app.router.add_get("/api/devices/demo-bundle", download_demo_bundle)
     app.router.add_get("/api/devices/{name}/creds", download_credential)
+    app.router.add_post("/api/devices/{name}/revoke", revoke_credential)
     app.router.add_get("/api/devices/bundle", download_bundle)
 
 
@@ -138,7 +139,8 @@ async def create_device(request: web.Request):
             content_type="text/html",
         )
 
-    # Return the new row as HTML fragment
+    # Return the new row as HTML fragment, with `highlight` flag on so
+    # the partial paints the brief green flash on the just-created row.
     cred = {
         "device_id": full_name,
         "filename": f"{full_name}.creds.json",
@@ -146,6 +148,7 @@ async def create_device(request: web.Request):
     return aiohttp_jinja2.render_template("devices/_device_row.html", request, {
         "cred": cred,
         "user": user,
+        "highlight": True,
     })
 
 
@@ -175,6 +178,88 @@ async def download_credential(request: web.Request):
     )
 
 
+async def revoke_credential(request: web.Request):
+    """Revoke a device credential: kill the backend account, then delete the file.
+
+    Returns an empty 200 body so an htmx swap can drop the row from the
+    page. The dashboard's JSON poll picks up the lower count and the
+    registry-entry removal on its next 10s tick.
+
+    Auth: portal session, tenant-scoped. Admins may revoke across
+    tenants. Non-admins can only touch their own tenant's creds.
+
+    Raises 404 (no such cred file), 403 (another tenant's cred), 502
+    (backend revocation failed) or 500 (file could not be removed).
+    """
+    import logging  # function-scope so the handler does not assume a module logger
+    user = request["user"]
+    tenant = user["tenant"]
+    device_name = request.match_info["name"]
+    filename = f"{device_name}.creds.json"
+
+    cred_data = credentials.get_credential_data(filename)
+    if not cred_data:
+        raise web.HTTPNotFound(text=f"Credential file not found: {filename}")
+    cred_tenant = cred_data.get("tenant", "")
+    if cred_tenant != tenant and user.get("role") != "admin":
+        raise web.HTTPForbidden(text="Access denied: credential belongs to another tenant")
+
+    # 1. Revoke the broker account so the device can no longer connect.
+    #    A backend without remove_device is a "soft success" (the file is
+    #    still deleted). A real failure from a backend that supports it
+    #    is not: the file stays so the operator can see it and retry.
+    backend = get_backend()
+    remove = getattr(backend, "remove_device", None)
+    backend_supported = remove is not None
+    backend_error: str | None = None
+    if backend_supported:
+        try:
+            await remove(cred_tenant, device_name)
+            await backend.reload_broker()
+        except Exception as e:
+            logging.getLogger(__name__).exception(
+                "revoke: backend remove failed for %s/%s", cred_tenant, device_name)
+            backend_error = str(e)
+    else:
+        backend_error = f"{backend.backend_name()} backend does not support remove_device"
+
+    # 2. Delete the local credential file, but only when the backend
+    #    revocation succeeded or is unsupported. A hard backend failure
+    #    keeps the file so the operator can retry once it is healthy.
+    if backend_supported and backend_error is not None:
+        # 502 rather than 500: the upstream broker failed, not the portal.
+        raise web.HTTPBadGateway(
+            text=f"Backend revocation failed for {device_name}: {backend_error}. "
+                 f"Credential file left in place; retry once the backend is healthy.",
+        )
+
+    deleted = credentials.delete_credential(filename)
+    if not deleted:
+        # File was there a moment ago (cred_data was non-None) but we
+        # couldn't unlink it. If the backend revoke already succeeded,
+        # the account is gone — say so explicitly rather than implying
+        # the whole operation failed.
+        suffix = (
+            " Backend account was already revoked; only the local file "
+            "remains and must be removed by hand."
+            if backend_supported and backend_error is None
+            else ""
+        )
+        raise web.HTTPInternalServerError(
+            text=f"Failed to remove credential file: {filename}.{suffix}",
+        )
+
+    # The device's etcd registry entry expires on its TTL after the
+    # device disconnects; no explicit cleanup needed here. Empty body:
+    # htmx's `hx-swap="delete"` removes the row on a 2xx response. Backend
+    # warnings (e.g. unsupported backend) ride along in a response header.
+    headers = {}
+    if backend_error:
+        headers["X-Revoke-Warning"] = backend_error
+    return web.Response(status=200, headers=headers, text="")
+
+
 async def download_bundle(request: web.Request):
     """Download a tenant credential bundle as .zip."""
     user = request["user"]
diff --git a/packages/device-connect-server/device_connect_server/portalctl/cli.py b/packages/device-connect-server/device_connect_server/portalctl/cli.py
index fe4ccbb..00906f3 100644
--- a/packages/device-connect-server/device_connect_server/portalctl/cli.py
+++ b/packages/device-connect-server/device_connect_server/portalctl/cli.py
@@ -425,6 +425,27 @@ async def cmd_devices_delete(client: PortalClient, args) -> int:
     return _exit_for_status(status, body)
 
 
+async def cmd_devices_revoke(client: PortalClient, args) -> int:
+    """Fully revoke a device: backend account and credential file both go.
+
+    Unlike ``revoke-credentials`` (rotation only, file kept) and ``delete``
+    (backend account decommissioned, file left on disk), the cred then
+    drops out of the portal's "Credentials Created" count.
+    """
+    if not args.confirm:
+        sys.stderr.write("revoke requires --confirm to proceed\n")
+        return 2
+    query = None
+    if args.tenant:
+        query = {"tenant": args.tenant}
+    path = _device_subpath(args.device_id, "/revoke")
+    status, body = await client.request("POST", path, params=query)
+    _maybe_print_error(status, body)
+    if 200 <= status < 300:
+        _emit(body, args.output)
+    return _exit_for_status(status, body)
+
+
 async def cmd_devices_invoke(client: PortalClient, args) -> int:
     try:
         params_obj = json.loads(args.params) if args.params else {}
@@ -603,12 +624,21 @@ def _build_parser() -> argparse.ArgumentParser:
     d_rev.add_argument("device_id")
     d_rev.set_defaults(func=cmd_devices_revoke_credentials)
 
-    d_del = dsub.add_parser("delete", help="decommission a device")
+    d_del = dsub.add_parser("delete", help="decommission a device (leaves cred file)")
     d_del.add_argument("device_id")
     d_del.add_argument("--confirm", action="store_true",
                        help="required: confirms the destructive action")
     d_del.set_defaults(func=cmd_devices_delete)
 
+    d_revoke = dsub.add_parser(
+        "revoke",
+        help="full revoke: kill backend account AND delete the cred file",
+    )
+    d_revoke.add_argument("device_id")
+    d_revoke.add_argument("--confirm", action="store_true",
+                          help="required: confirms the destructive action")
+    d_revoke.set_defaults(func=cmd_devices_revoke)
+
     d_inv = dsub.add_parser("invoke", help="invoke a device function")
     d_inv.add_argument("device_id")
     d_inv.add_argument("function")
diff --git a/packages/device-connect-server/device_connect_server/registry/client.py b/packages/device-connect-server/device_connect_server/registry/client.py
index 1ba12bd..23f84b6 100644
--- a/packages/device-connect-server/device_connect_server/registry/client.py
+++ b/packages/device-connect-server/device_connect_server/registry/client.py
@@ -29,12 +29,18 @@
 
 import json
 import logging
+import os
 import time
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 from device_connect_edge.messaging import MessagingClient
 from device_connect_edge.messaging.config import MessagingConfig
 
+# Per-page chunk size when ``list_devices`` transparently iterates the
+# fleet. Matches the edge client default (device_connect_edge's
+# registry_client._DEFAULT_LIST_PAGE_SIZE). Unset, empty or <= 0 means 100.
+_DEFAULT_LIST_PAGE_SIZE = max(int(os.getenv("DEVICE_CONNECT_LIST_PAGE_SIZE") or 0), 0) or 100
+
 
 class RegistryClient:
     """Client for querying the device registry.
@@ -195,9 +201,82 @@ async def list_devices(
             # Get only cameras
             cameras = await registry.list_devices(device_type="camera")
         """
-        subject = f"device-connect.{self._tenant}.discovery"
+        # Page through the registry so the reply never exceeds NATS's
+        # max_payload limit at fleet scale. Older servers that ignore
+        # ``limit`` still work — they return everything in one reply with
+        # ``next_offset`` absent and the loop exits immediately.
+        devices: List[Dict[str, Any]] = []
+        offset = 0
+        while True:
+            page, next_offset, _total = await self._list_devices_page(
+                device_type=device_type,
+                location=location,
+                capabilities=capabilities,
+                offset=offset,
+                limit=_DEFAULT_LIST_PAGE_SIZE,
+                timeout=timeout,
+            )
+            devices.extend(page)
+            if next_offset is None:
+                break
+            # Defense-in-depth: bail out if the server returns a
+            # non-advancing cursor. The current registry can't produce
+            # this, but it would otherwise be an unbounded loop.
+            if next_offset <= offset:
+                self._logger.warning(
+                    "Registry returned non-advancing next_offset=%s "
+                    "(current offset=%s); stopping page walk",
+                    next_offset, offset,
+                )
+                break
+            offset = next_offset
+        self._logger.debug("Listed %d devices", len(devices))
+        return devices
+
+    async def list_devices_page(
+        self,
+        *,
+        offset: int = 0,
+        limit: int = _DEFAULT_LIST_PAGE_SIZE,
+        device_type: Optional[str] = None,
+        location: Optional[str] = None,
+        capabilities: Optional[List[str]] = None,
+        timeout: Optional[float] = None,
+    ) -> Tuple[List[Dict[str, Any]], Optional[int], int]:
+        """Fetch one page of devices with pagination metadata.
+
+        Returns ``(devices, next_offset, total_matched)``; ``next_offset``
+        is ``None`` on the final page.
+
+        ACL caveat:
+            When the registry has ACLs enabled, filtering runs *after*
+            slicing. ``len(devices)`` for a page may be smaller than
+            ``limit`` even when more pages follow, and ``total_matched``
+            is the unfiltered total (before the caller's ACL applies).
+            Callers should treat ``total_matched`` as an upper bound and
+            must not infer "full page" from ``len(devices) == limit``.
+        """
+        return await self._list_devices_page(
+            device_type=device_type,
+            location=location,
+            capabilities=capabilities,
+            offset=offset,
+            limit=limit,
+            timeout=timeout,
+        )
 
-        params: Dict[str, Any] = {}
+    async def _list_devices_page(
+        self,
+        *,
+        device_type: Optional[str],
+        location: Optional[str],
+        capabilities: Optional[List[str]],
+        offset: int,
+        limit: int,
+        timeout: Optional[float],
+    ) -> Tuple[List[Dict[str, Any]], Optional[int], int]:
+        subject = f"device-connect.{self._tenant}.discovery"
+        params: Dict[str, Any] = {"offset": int(offset), "limit": int(limit)}
         if device_type:
             params["device_type"] = device_type
         if location:
@@ -206,15 +285,12 @@ async def list_devices(
             params["capabilities"] = capabilities
 
         result = await self._request(
-            subject,
-            "discovery/listDevices",
-            params if params else None,
-            timeout,
+            subject, "discovery/listDevices", params, timeout,
         )
-
         devices = result.get("devices", [])
-        self._logger.debug("Listed %d devices", len(devices))
-        return devices
+        next_offset = result.get("next_offset")
+        total = result.get("total_matched", len(devices))
+        return devices, next_offset, total
 
     async def get_device(
         self,
diff --git a/packages/device-connect-server/device_connect_server/registry/service/main.py b/packages/device-connect-server/device_connect_server/registry/service/main.py
index 9661d6e..88e7e6c 100644
--- a/packages/device-connect-server/device_connect_server/registry/service/main.py
+++ b/packages/device-connect-server/device_connect_server/registry/service/main.py
@@ -53,6 +53,32 @@
 _DEFAULT_TTL = 15                # Fallback TTL when device doesn't report one
 _PULL_REGISTRATION_TIMEOUT = 5   # Timeout for requestRegistration RPC
 
+# Server-side cap for discovery/listDevices page sizes when the caller
+# opts into pagination by passing `limit`. NATS rejects any single
+# publish larger than the broker's max_payload, so the registry clamps
+# the page size to bound the reply size regardless of what `limit` the
+# caller asked for. Empirically a flashlight-auditorium phone record is
+# ~13KB serialized, so 200 records ~= 2.6MB, which fits comfortably under
+# the 8MB max_payload set in security_infra/setup_deployment.sh while
+# keeping per-page round-trip small.
+#
+# Old clients that omit `limit` fall through to the legacy unpaginated
+# reply path: fine at small scale, a loud `max_payload exceeded` at large
+# scale (as before this PR). Silently truncating would be worse — they
+# would act on a partial fleet as if complete; upgrade them to pass `limit`.
+# Floored at 1: a cap <= 0 would reach the registry as "no limit".
+_LIST_DEVICES_MAX_LIMIT = max(
+    1, int(os.getenv("DC_LIST_DEVICES_MAX_LIMIT", "200")),
+)
+
+# Track which over-cap ``limit`` values we've already warned about so a
+# misconfigured client doesn't spam the log on every page. Deduped by the
+# requested value, not by caller — the warning is operator-facing
+# ("someone is tuning a knob that no longer matches the server cap"),
+# not per-request observability. Grows by one entry per distinct over-cap
+# limit seen (in practice 1-2 values). NOTE(review): no eviction is visible.
+_WARNED_LIMIT_CLAMPS: set[int] = set()
+
 
 def _resolve_tenants() -> List[str]:
     """Resolve the list of tenants to handle.
@@ -378,18 +404,161 @@ async def rpc_discovery(data: bytes, reply: Optional[str]):
             if method == "discovery/listDevices":
                 device_type = params.get("device_type")
                 location = params.get("location")
-                devs = await asyncio.to_thread(
-                    registry.list_devices, tenant,
-                    device_type=device_type, location=location,
-                )
+                # Pagination: ``offset`` and ``limit`` are optional.
+                #
+                # If ``limit`` is absent the caller is on the legacy
+                # protocol — return the full filtered fleet in a single
+                # reply. This preserves backward compatibility (small
+                # deployments keep working untouched) and surfaces the
+                # original max_payload failure loudly at fleet scale
+                # rather than silently truncating to a cap the caller
+                # doesn't know about.
+                #
+                # If ``limit`` is present the caller understands the
+                # paged contract: we clamp to ``_LIST_DEVICES_MAX_LIMIT``
+                # to bound reply size, return ``next_offset`` and
+                # ``total_matched``, and expect the caller to loop.
+                #
+                # Review notes (do not re-litigate without reading):
+                # - Silently clamping legacy callers to the cap was
+                #   tried in 2349130 and reverted in 79247e6: a partial
+                #   reply with no ``next_offset`` signal is worse than
+                #   a loud failure because the caller acts on a
+                #   truncated fleet as if it were complete. Operators
+                #   hitting the ceiling must upgrade the client.
+                # - Streaming replies (multi-message paginated stream)
+                #   were considered and rejected in the PR design:
+                #   adds reassembly complexity for every caller while
+                #   buying nothing the page loop doesn't already get.
+                # - ``limit <= 0`` returns -32602 rather than mapping
+                #   to the cap (was a review-round-3 fix); the
+                #   surprise mapping masked client bugs that passed
+                #   unintentional zero/negative values.
+                requested_limit = params.get("limit")
+                paged = requested_limit is not None
+                # ``offset`` is validated up front so a malformed value
+                # (e.g. ``"abc"``) produces a clean JSON-RPC error
+                # rather than a 500 from int().
+                raw_offset = params.get("offset", 0)
+                try:
+                    offset_val = int(raw_offset or 0)
+                except (TypeError, ValueError):
+                    await messaging.publish(
+                        reply,
+                        build_rpc_error(
+                            payload.get("id"), -32602,
+                            f"offset must be an integer, got {raw_offset!r}",
+                        ),
+                    )
+                    return
+                if offset_val < 0:
+                    await messaging.publish(
+                        reply,
+                        build_rpc_error(
+                            payload.get("id"), -32602,
+                            f"offset must be non-negative, got {offset_val}",
+                        ),
+                    )
+                    return
+
+                if paged:
+                    try:
+                        requested_limit_int = int(requested_limit)
+                    except (TypeError, ValueError):
+                        await messaging.publish(
+                            reply,
+                            build_rpc_error(
+                                payload.get("id"), -32602,
+                                f"limit must be an integer, got {requested_limit!r}",
+                            ),
+                        )
+                        return
+                    # Reject ``limit <= 0`` rather than silently mapping
+                    # it to the server cap. Mapping was surprising
+                    # ("limit=0" usually means "no rows" elsewhere) and
+                    # masked client bugs that passed unintentional zero
+                    # / negative values.
+                    if requested_limit_int <= 0:
+                        await messaging.publish(
+                            reply,
+                            build_rpc_error(
+                                payload.get("id"), -32602,
+                                f"limit must be positive, got {requested_limit_int}",
+                            ),
+                        )
+                        return
+                    effective_limit = min(
+                        requested_limit_int, _LIST_DEVICES_MAX_LIMIT,
+                    )
+                    # Surface silent clamps. The wire contract handles
+                    # the over-cap request correctly via ``next_offset``
+                    # — the caller just paginates more aggressively than
+                    # it asked for — but without a log line operators
+                    # tuning ``DEVICE_CONNECT_LIST_PAGE_SIZE`` above the
+                    # server cap have no signal that their knob is being
+                    # ignored. Deduped per process per requested value
+                    # so a steady-state misconfigured client logs once,
+                    # not once per page.
+                    if (
+                        effective_limit < requested_limit_int
+                        and requested_limit_int not in _WARNED_LIMIT_CLAMPS
+                    ):
+                        _WARNED_LIMIT_CLAMPS.add(requested_limit_int)
+                        logger.warning(
+                            "discovery/listDevices: caller requested limit=%d, "
+                            "clamped to server cap _LIST_DEVICES_MAX_LIMIT=%d. "
+                            "Caller will paginate more aggressively than "
+                            "expected. Raise DC_LIST_DEVICES_MAX_LIMIT (and "
+                            "NATS max_payload) if the smaller page is "
+                            "unintended.",
+                            requested_limit_int, _LIST_DEVICES_MAX_LIMIT,
+                        )
+                    page, next_offset, total = await asyncio.to_thread(
+                        registry.list_devices_page, tenant,
+                        device_type=device_type,
+                        location=location,
+                        offset=offset_val,
+                        limit=effective_limit,
+                    )
+                else:
+                    page = await asyncio.to_thread(
+                        registry.list_devices, tenant,
+                        device_type=device_type, location=location,
+                    )
+                    # next_offset / total are unused on the legacy reply
+                    # path (see the ``if paged`` branch below); the
+                    # legacy shape is just ``{"devices": page}``.
+
                 if acl_manager:
                     requester_id = params.get("requester_id", "")
-                    devs = acl_manager.filter_visible_devices(
-                        requester_id, devs, tenant=tenant
+                    # ACL filtering runs after pagination — devices the
+                    # caller is not allowed to see are dropped from the
+                    # page rather than from the unsliced fleet, so
+                    # ``total_matched`` may be larger than what
+                    # eventually reaches the requester and successive
+                    # pages may be shorter than ``limit``. That's
+                    # acceptable: ACL is opt-in and primarily a
+                    # server-side hint, not a strict cardinality
+                    # contract. Callers should not assume
+                    # ``len(devices) == limit`` even mid-fleet.
+                    page = acl_manager.filter_visible_devices(
+                        requester_id, page, tenant=tenant
                     )
+
+                if paged:
+                    response_result = {
+                        "devices": page,
+                        "next_offset": next_offset,
+                        "total_matched": total,
+                    }
+                else:
+                    # Legacy reply shape: just ``devices``. Don't emit
+                    # ``next_offset`` / ``total_matched`` so old clients with
+                    # strict reply parsing aren't broken by new metadata.
+                    response_result = {"devices": page}
                 await messaging.publish(
                     reply,
-                    build_rpc_response(payload.get("id"), {"devices": devs})
+                    build_rpc_response(payload.get("id"), response_result),
                 )
             elif method == "discovery/getDevice":
                 device_id = params.get("device_id")
diff --git a/packages/device-connect-server/device_connect_server/registry/service/registry.py b/packages/device-connect-server/device_connect_server/registry/service/registry.py
index ff759d7..35db715 100644
--- a/packages/device-connect-server/device_connect_server/registry/service/registry.py
+++ b/packages/device-connect-server/device_connect_server/registry/service/registry.py
@@ -20,15 +20,49 @@
 import logging
 import os
 from dataclasses import dataclass, field
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Tuple
 
 import etcd3gw
+from requests.adapters import HTTPAdapter
 
 _logger = logging.getLogger(__name__)
 
 ETCD_HOST = os.getenv("ETCD_HOST", "localhost")
 ETCD_PORT = int(os.getenv("ETCD_PORT", "2379"))
 
+# Size of the urllib3 connection pool the etcd3gw client uses. The
+# default (10) caps concurrent HTTP-to-etcd round-trips and bottlenecks
+# the registry under a registration herd — every lease+put is two
+# sequential HTTP calls, so 1400 phones at startup queue thousands of
+# requests behind 10 sockets. 64 keeps the registry processor-bound on
+# realistic hardware while staying well under etcd's connection ceiling.
+_ETCD_POOL_SIZE = int(os.getenv("DC_ETCD_POOL_SIZE", "64"))
+
+
+def _enlarge_etcd_pool(client: Any, pool_size: int) -> None:
+    """Swap the etcd3gw client's HTTP adapters for ones with a bigger pool.
+
+    The adapter is mounted on the already-built ``client.session`` rather
+    than handed to ``etcd3gw.client(...)`` via ``session=``, so this works
+    on etcd3gw 2.5.x (which has no ``session`` kwarg) as well as 2.6+.
+
+    Should a future etcd3gw stop exposing ``session``, the pool is left
+    untouched and a warning is logged, so the degraded pool (and the
+    registration-storm bottleneck it brings back) is visible to operators.
+    """
+    if not hasattr(client, "session"):
+        _logger.warning(
+            "etcd3gw client has no ``session`` attribute; HTTP pool "
+            "size remains at the urllib3 default (10). This will "
+            "bottleneck the registry under registration herds. "
+            "Inspect etcd3gw internals and update _enlarge_etcd_pool.",
+        )
+        return
+    bigger = HTTPAdapter(pool_connections=pool_size, pool_maxsize=pool_size)
+    # One shared adapter serves both schemes, as before.
+    for scheme in ("http://", "https://"):
+        client.session.mount(scheme, bigger)
+
 
 def _kv_key(kv: dict) -> str:
     """Extract the key string from an etcd3gw KV metadata dict."""
@@ -89,6 +123,7 @@ class DeviceRegistry:
 
     def __post_init__(self) -> None:  # pragma: no cover - thin wrapper
         self.client = etcd3gw.client(host=self.host, port=self.port)
+        _enlarge_etcd_pool(self.client, _ETCD_POOL_SIZE)
 
     def _key(self, tenant: str, device_id: str) -> str:
         return f"/device-connect/{tenant}/devices/{device_id}"
@@ -171,6 +206,64 @@ def list_devices(
             ]
         return devices
 
+    def list_devices_page(
+        self,
+        tenant: str,
+        *,
+        device_type: str | None = None,
+        location: str | None = None,
+        offset: int = 0,
+        limit: int | None = None,
+    ) -> Tuple[List[dict], int | None, int]:
+        """Return one page of registered device payloads plus pagination metadata.
+
+        Slices the filtered fleet by ``offset`` (negative values count as 0)
+        and ``limit``; a ``limit`` of ``None`` or ``<= 0`` returns everything
+        from ``offset`` onward. The etcd load is unchanged — we still scan the
+        tenant prefix — but a bounded ``limit`` keeps the NATS reply bounded.
+
+        Stability: device order follows etcd key order, which is deterministic
+        for a steady-state fleet; concurrent registrations/expirations can
+        shift records across pages, producing transient duplicates or skips
+        in a walk that spans many round-trips. Callers should de-duplicate by
+        ``device_id`` after the walk; that removes duplicates but cannot
+        recover skipped records. (Keyset pagination would avoid both; offset
+        is used here for simplicity.)
+
+        Returns:
+            (devices_page, next_offset, total_matched).
+            ``next_offset`` is None when the page reaches the end of the
+            filtered list. ``total_matched`` is the size after the
+            ``device_type``/``location`` filters and before pagination.
+            ACL filtering, when enabled at the handler layer, runs after
+            this method returns and can further shrink the page.
+
+        Cost model (review notes — do not re-litigate without reading):
+            This call performs a full ``etcd get_prefix`` + JSON decode
+            per invocation. A walk over N devices in pages of P does
+            ``ceil(N / P)`` full etcd scans of ``O(N)`` each, so registry
+            CPU and etcd traffic per walk scale as ``O(N^2 / P)``. The fix
+            in this PR bounds NATS *payload* size (the original outage)
+            but does NOT reduce etcd or registry CPU load. The next
+            iteration is selector pushdown / keyset pagination
+            (documented as out-of-scope in PR #38 description). At the
+            current ~1400-device scale the per-walk cost is acceptable;
+            re-evaluate if walk rate or fleet size grows materially.
+        """
+        all_devices = self.list_devices(
+            tenant, device_type=device_type, location=location,
+        )
+        total = len(all_devices)
+        safe_offset = max(0, int(offset or 0))
+        if limit is None or limit <= 0:
+            page = all_devices[safe_offset:]
+            next_offset: int | None = None
+        else:
+            end = safe_offset + int(limit)
+            page = all_devices[safe_offset:end]
+            next_offset = end if end < total else None
+        return page, next_offset, total
+
     def get_device(self, tenant: str, device_id: str) -> dict | None:
         """Return a single device payload by direct key lookup (O(1)).
 
@@ -254,6 +347,24 @@ def list_devices(
     return _REGISTRY.list_devices(tenant, device_type=device_type, location=location)
 
 
+def list_devices_page(
+    tenant: str,
+    *,
+    device_type: str | None = None,
+    location: str | None = None,
+    offset: int = 0,
+    limit: int | None = None,
+) -> Tuple[List[dict], int | None, int]:
+    """Forward to ``list_devices_page`` on the module-level ``_REGISTRY``."""
+    return _REGISTRY.list_devices_page(
+        tenant,
+        device_type=device_type,
+        location=location,
+        offset=offset,
+        limit=limit,
+    )
+
+
 def get_device(tenant: str, device_id: str) -> dict | None:
     """Return a single device by direct key lookup."""
     return _REGISTRY.get_device(tenant, device_id)
diff --git a/packages/device-connect-server/infra/docker-compose-nats-websocket.yml b/packages/device-connect-server/infra/docker-compose-nats-websocket.yml
new file mode 100644
index 0000000..46dba3c
--- /dev/null
+++ b/packages/device-connect-server/infra/docker-compose-nats-websocket.yml
@@ -0,0 +1,41 @@
+# Copyright (c) 2024-2026, Arm Limited and Contributors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# Override: expose the NATS WebSocket listener to the host for browser-based
+# devices. Combine with the main multi-tenant compose file:
+#
+#   docker compose -f infra/docker-compose-multitenant-nats.yml \
+#                  -f infra/docker-compose-nats-websocket.yml up -d
+#
+# Prerequisite: the generated NATS config must include a `websocket {}` block.
+# Pass --enable-websocket to security_infra/setup_deployment.sh.
+#
+# IMPORTANT:
+#   The port is bound to 127.0.0.1 by default. The listener inside the
+#   container is plain WS (unless --websocket-tls-cert/--websocket-tls-key
+#   were passed to setup). A reverse proxy (Caddy, nginx, etc.) MUST do
+#   TLS termination on the host before the port is exposed to the network --
+#   without TLS, NATS JWTs travel in cleartext.
+#
+# Env vars (read at bring-up time; both default to port 8443):
+#   DC_NATS_WS_PORT     The container-side port. MUST match the port that
+#                       setup_deployment.sh --websocket-port wrote into the
+#                       nats config. If they drift, NATS listens on one
+#                       port and the host maps a different one, and the
+#                       listener is silently unreachable.
+#   DC_NATS_WS_BIND     The host-side bind address + port. Defaults to
+#                       127.0.0.1:8443. Examples:
+#                         DC_NATS_WS_BIND=127.0.0.1:8443  # loopback only
+#                         DC_NATS_WS_BIND=10.0.0.5:8443   # LAN only
+#                       Never set it to 0.0.0.0 without TLS termination.
+#
+# Typical invocation (one-liner -- setup_deployment.sh prints this for you):
+#   DC_NATS_WS_PORT=8443 docker compose \
+#       -f infra/docker-compose-multitenant-nats.yml \
+#       -f infra/docker-compose-nats-websocket.yml up -d
+
+services:
+  nats:
+    ports:
+      - "${DC_NATS_WS_BIND:-127.0.0.1:8443}:${DC_NATS_WS_PORT:-8443}"
diff --git a/packages/device-connect-server/pyproject.toml b/packages/device-connect-server/pyproject.toml
index 84f685d..9195164 100644
--- a/packages/device-connect-server/pyproject.toml
+++ b/packages/device-connect-server/pyproject.toml
@@ -8,7 +8,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "device-connect-server"
-version = "0.2.3"
+version = "0.2.4"
 description = "Device Connect — edge device runtime with Zenoh/NATS messaging, D2D communication, and IoT orchestration"
 readme = "README.md"
 requires-python = ">=3.11"
diff --git a/packages/device-connect-server/security_infra/README.md b/packages/device-connect-server/security_infra/README.md
index 7608c0d..b652b65 100644
--- a/packages/device-connect-server/security_infra/README.md
+++ b/packages/device-connect-server/security_infra/README.md
@@ -25,6 +25,7 @@ Tools for setting up NATS JWT authentication and multi-tenant isolation for Devi
     - [2. Application-level tenant namespacing](#2-application-level-tenant-namespacing)
     - [What about Zenoh?](#what-about-zenoh)
   - [Connecting Devices](#connecting-devices)
+  - [Browser-based devices (WebSocket)](#browser-based-devices-websocket)
   - [Script Reference](#script-reference)
     - [gen\_creds.sh flags](#gen_credssh-flags)
     - [Environment variables](#environment-variables)
@@ -304,6 +305,63 @@ asyncio.run(main())
 
 The device will register in the `alpha` tenant, and only devices within `alpha` will discover it.
 
+## Browser-based devices (WebSocket)
+
+Browsers can't speak NATS over raw TCP — they need WebSocket. `setup_deployment.sh` has an opt-in `--enable-websocket` flag that adds a `websocket {}` block to the generated NATS config. The same operator/account JWT auth applies to WS clients; it's a transport, not a new auth path.
+
+```bash
+./setup_deployment.sh \
+    --nats-host dc.example.com \
+    --enable-websocket \
+    --websocket-port 8443 \
+    --websocket-allowed-origins https://lights.example.com
+```
+
+Then bring NATS up with the WebSocket compose override so the port is exposed on the host. The container-side port is read from `DC_NATS_WS_PORT` (default `8443`) and **must match the `--websocket-port` you passed to `setup_deployment.sh`** — they're written to different files and would otherwise drift silently:
+
+```bash
+DC_NATS_WS_PORT=8443 \
+  docker compose -f infra/docker-compose-multitenant-nats.yml \
+                 -f infra/docker-compose-nats-websocket.yml up -d
+```
+
+`setup_deployment.sh` prints the exact one-liner (with the value you passed) at the end of its run so you can copy-paste it.
+
+The override binds the port to `127.0.0.1:8443` by default. **Plain WS on the listener is intentional**: the assumption is that a reverse proxy (Caddy, nginx, Cloudflare Tunnel, ...) terminates TLS and proxies to loopback. Without TLS in front, NATS JWTs travel in cleartext.
+
+To skip the reverse proxy and have NATS do TLS termination natively, pass `--websocket-tls-cert` and `--websocket-tls-key` to `setup_deployment.sh`. The resulting block uses `tls { ... }` instead of `no_tls: true`, and you can safely expose the port directly (override `DC_NATS_WS_BIND=0.0.0.0:8443` in the env when running compose).
+
+### Reverse-proxy sketch (Caddy)
+
+```caddy
+app.example.com {
+    @nats path /nats /nats/*
+    reverse_proxy @nats 127.0.0.1:8443
+    reverse_proxy 127.0.0.1:8000      # your page / app
+}
+```
+
+The browser device library (`nats.ws` / `@nats-io/nats-core`) connects to `wss://app.example.com/nats` and authenticates with the JWT just like a TCP client would. From the broker's perspective every browser is an ordinary NATS client.
+
+### Subject scoping for browser credentials
+
+Browser JWTs are easier to exfiltrate than server-side ones — anyone who can open dev tools sees them. For shared-credential use cases (a swarm of read-only viewers, a kiosk, audience phones), narrow the credential's pub/sub scope with `nsc` before distributing it:
+
+```bash
+# Example: an audience credential scoped to a single subject subtree.
+# Device IDs must be dot-separated (NATS wildcards match whole tokens):
+#   device-connect.<tenant>.audience.<seat>.<id>.event.<name>
+nsc edit user audience-shared --account DEVICE_CONNECT \
+    --allow-pub 'device-connect.alpha.audience.>' \
+    --allow-pub 'device-connect.alpha.registry' \
+    --allow-pub '_INBOX.>' \
+    --allow-sub 'device-connect.alpha.audience.>' \
+    --allow-sub 'device-connect.alpha.broadcast' \
+    --allow-sub '_INBOX.>'
+```
+
+A leaked credential can then only reach the audience subtree, never cameras, robots, or the orchestrator.
+
 ## Script Reference
 
 | Script | Purpose |
diff --git a/packages/device-connect-server/security_infra/manage_tenants.sh b/packages/device-connect-server/security_infra/manage_tenants.sh
index 23b9d6f..44a0a0b 100755
--- a/packages/device-connect-server/security_infra/manage_tenants.sh
+++ b/packages/device-connect-server/security_infra/manage_tenants.sh
@@ -60,13 +60,28 @@ usage() {
 
 regenerate_nats_config() {
   local output="${SCRIPT_DIR}/nats-jwt-generated.conf"
+  # Preserve any previously-appended server directives (listen, http_port,
+  # max_payload, the websocket {} block, native tls {}, ...) across this
+  # regeneration. `nsc generate config` rewrites the file from scratch, so
+  # without this it silently drops everything below the marker -- which has
+  # taken the browser WebSocket listener (added via setup_deployment.sh
+  # --enable-websocket) offline in production every time a tenant or device
+  # was added. All appended directives live below the marker line.
+  local additions=""
+  if [ -f "${output}" ] && grep -q '^# Device Connect additions' "${output}"; then
+    additions=$(sed -n '/^# Device Connect additions/,$p' "${output}")
+  fi
   nsc generate config --mem-resolver --config-file "${output}" 2>/dev/null
-  cat >> "${output}" <<EOF
+  if [ -n "${additions}" ]; then
+    printf '\n%s\n' "${additions}" >> "${output}"
+  else
+    cat >> "${output}" <<EOF
 
 # Device Connect additions
 listen: 0.0.0.0:4222
 http_port: 8222
 EOF
+  fi
   echo "    NATS config regenerated: ${output}"
 }
 
diff --git a/packages/device-connect-server/security_infra/setup_deployment.sh b/packages/device-connect-server/security_infra/setup_deployment.sh
index c93d058..c0bad5f 100755
--- a/packages/device-connect-server/security_infra/setup_deployment.sh
+++ b/packages/device-connect-server/security_infra/setup_deployment.sh
@@ -22,33 +22,99 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 
 NATS_HOST=""
 NATS_PORT="4222"
+ENABLE_WEBSOCKET=0
+WS_PORT="8443"
+WS_ALLOWED_ORIGINS=""
+WS_TLS_CERT=""
+WS_TLS_KEY=""
 
 usage() {
-  echo "Usage: $0 --nats-host HOST [--nats-port PORT]"
-  echo ""
-  echo "One-time bootstrap for multi-tenant NATS JWT infrastructure."
-  echo "Creates the NATS operator/account and privileged credentials"
-  echo "(registry, facilitator)."
-  echo ""
-  echo "Options:"
-  echo "  --nats-host HOST   Public hostname or IP of the NATS server (required)"
-  echo "  --nats-port PORT   NATS port (default: 4222)"
-  echo "  -h, --help         Show this help"
-  echo ""
-  echo "After running this, use manage_tenants.sh to create tenants."
+  cat <<'USAGE'
+Usage: setup_deployment.sh --nats-host HOST [options]
+
+One-time bootstrap for multi-tenant NATS JWT infrastructure. Creates the
+NATS operator/account and privileged credentials (registry, facilitator).
+
+Required:
+  --nats-host HOST                Public hostname or IP of the NATS server.
+
+Optional:
+  --nats-port PORT                NATS TCP port (default: 4222).
+
+Browser-based devices (WebSocket):
+  --enable-websocket              Add a `websocket {}` block to the generated
+                                  NATS config so browser-based devices can
+                                  connect over WS (nats.ws / @nats-io/nats-core).
+                                  OFF by default; existing deployments are
+                                  unaffected.
+  --websocket-port PORT           WS listen port inside the container (default: 8443).
+  --websocket-allowed-origins LIST
+                                  Comma-separated list of allowed Origin headers.
+                                  Defaults to empty, which keeps nats-server's
+                                  same_origin=true behavior. Set this only when
+                                  a reverse proxy rewrites Host headers (e.g.
+                                  the page is at https://app.example.com and
+                                  the WS endpoint is wss://app.example.com/nats
+                                  proxied to a local NATS).
+  --websocket-tls-cert FILE       Native TLS cert (path inside the NATS
+  --websocket-tls-key FILE        container). When both are set, NATS does
+                                  TLS termination itself; otherwise the
+                                  listener is plain WS and MUST be fronted by
+                                  TLS (Caddy / nginx) before public exposure.
+
+After running this, use manage_tenants.sh to create tenants.
+
+Security notes for --enable-websocket:
+  * The compose port for the WS listener is provided via
+    infra/docker-compose-nats-websocket.yml (loopback-bound by default).
+    Combine it with the main compose file. The container-side WS port is
+    read from DC_NATS_WS_PORT (default 8443) and MUST match
+    --websocket-port -- otherwise NATS listens on one port and compose
+    maps a different one, silently leaving the listener unreachable.
+        DC_NATS_WS_PORT=8443 docker compose \
+            -f infra/docker-compose-multitenant-nats.yml \
+            -f infra/docker-compose-nats-websocket.yml up -d
+    (The "WebSocket listener enabled" message at the end of this script
+    prints the exact invocation including the value you passed.)
+  * Do NOT change the loopback binding without putting TLS in front; without
+    TLS, NATS JWTs travel in cleartext.
+USAGE
   exit 1
 }
 
 # Parse arguments
 while [[ $# -gt 0 ]]; do
   case "$1" in
-    --nats-host)  NATS_HOST="$2"; shift 2 ;;
-    --nats-port)  NATS_PORT="$2"; shift 2 ;;
-    -h|--help)    usage ;;
-    *)            echo "Unknown option: $1"; usage ;;
+    --nats-host)                  NATS_HOST="$2"; shift 2 ;;
+    --nats-port)                  NATS_PORT="$2"; shift 2 ;;
+    --enable-websocket)           ENABLE_WEBSOCKET=1; shift ;;
+    --websocket-port)             WS_PORT="$2"; shift 2 ;;
+    --websocket-allowed-origins)  WS_ALLOWED_ORIGINS="$2"; shift 2 ;;
+    --websocket-tls-cert)         WS_TLS_CERT="$2"; shift 2 ;;
+    --websocket-tls-key)          WS_TLS_KEY="$2"; shift 2 ;;
+    -h|--help)                    usage ;;
+    *)                            echo "Unknown option: $1"; usage ;;
   esac
 done
 
+# TLS pair: both or neither.
+if { [ -n "$WS_TLS_CERT" ] && [ -z "$WS_TLS_KEY" ]; } || \
+   { [ -z "$WS_TLS_CERT" ] && [ -n "$WS_TLS_KEY" ]; }; then
+  echo "Error: --websocket-tls-cert and --websocket-tls-key must be used together."
+  exit 1
+fi
+
+# Port arguments must be numeric -- a typo like `--websocket-port 84as3`
+# would otherwise flow into the generated config and only fail later.
+if ! [[ "$NATS_PORT" =~ ^[0-9]+$ ]]; then
+  echo "Error: --nats-port must be numeric (got: ${NATS_PORT})."
+  exit 1
+fi
+if [ "$ENABLE_WEBSOCKET" -eq 1 ] && ! [[ "$WS_PORT" =~ ^[0-9]+$ ]]; then
+  echo "Error: --websocket-port must be numeric (got: ${WS_PORT})."
+  exit 1
+fi
+
 if [ -z "$NATS_HOST" ]; then
   echo "Error: --nats-host is required"
   echo ""
@@ -107,8 +173,71 @@ cat >> "${OUTPUT_CONF}" <<EOF
 # Device Connect additions
 listen: 0.0.0.0:4222
 http_port: 8222
+# Raised from the 1MB default so the registry can return fleet snapshots
+# for large deployments (~1400 devices at ~6KB/record = ~8MB).
+max_payload: 8MB
 EOF
 
+# Optional: WebSocket listener for browser-based devices.
+# Operator-mode JWT auth applies identically to WS and TCP clients; this
+# block adds a transport, not a new auth path.
+if [ "$ENABLE_WEBSOCKET" -eq 1 ]; then
+  {
+    echo ""
+    echo "# WebSocket listener (added by --enable-websocket)."
+    echo "# Browsers reach NATS via this listener; same JWT auth as TCP."
+    echo "websocket {"
+    echo "  port: ${WS_PORT}"
+    if [ -n "$WS_TLS_CERT" ] && [ -n "$WS_TLS_KEY" ]; then
+      echo "  tls {"
+      echo "    cert_file: \"${WS_TLS_CERT}\""
+      echo "    key_file:  \"${WS_TLS_KEY}\""
+      echo "  }"
+    else
+      echo "  # Plain WS. The compose override binds this to 127.0.0.1 only;"
+      echo "  # a reverse proxy (Caddy/nginx) MUST terminate TLS before this"
+      echo "  # port is exposed to the network."
+      echo "  no_tls: true"
+    fi
+    if [ -n "$WS_ALLOWED_ORIGINS" ]; then
+      # Trim each token and skip empties so "a.com,,b.com" or a trailing
+      # comma doesn't produce a stray "" entry in allowed_origins.
+      origins_json=$(echo "$WS_ALLOWED_ORIGINS" | awk -F, '{
+        out=""; first=1;
+        for (i=1; i<=NF; i++) {
+          tok = $i;
+          gsub(/^[ \t]+|[ \t]+$/, "", tok);
+          if (tok == "") continue;
+          out = out (first ? "" : ", ") "\"" tok "\"";
+          first = 0;
+        }
+        print out
+      }')
+      if [ -n "$origins_json" ]; then
+        echo "  allowed_origins: [${origins_json}]"
+      fi
+    fi
+    echo "  compression: true"
+    echo "}"
+  } >> "${OUTPUT_CONF}"
+  echo ""
+  echo "==> WebSocket listener enabled on port ${WS_PORT}"
+  echo ""
+  echo "    Bring up the compose stack with BOTH files and pass the WS port"
+  echo "    as DC_NATS_WS_PORT so the host->container mapping matches the"
+  echo "    listener port written into the config (they live in different"
+  echo "    files and would otherwise drift silently):"
+  echo ""
+  echo "      DC_NATS_WS_PORT=${WS_PORT} \\"
+  echo "        docker compose \\"
+  echo "          -f infra/docker-compose-multitenant-nats.yml \\"
+  echo "          -f infra/docker-compose-nats-websocket.yml up -d"
+  echo ""
+  echo "    To bind the host port somewhere other than 127.0.0.1, also set"
+  echo "    DC_NATS_WS_BIND=10.0.0.5:${WS_PORT} (LAN only, never 0.0.0.0"
+  echo "    without TLS termination in front)."
+fi
+
 echo ""
 echo "============================================"
 echo "  Deployment infrastructure ready!"
diff --git a/packages/device-connect-server/tests/device_connect_server/test_nats_rpc.py b/packages/device-connect-server/tests/device_connect_server/test_nats_rpc.py
new file mode 100644
index 0000000..3013454
--- /dev/null
+++ b/packages/device-connect-server/tests/device_connect_server/test_nats_rpc.py
@@ -0,0 +1,264 @@
+# Copyright (c) 2024-2026, Arm Limited and Contributors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""Unit tests for the portal NATS RPC helper.
+
+Focused on the cached-client behavior introduced when the portal stopped
+opening a fresh connection per invoke():
+
+- a single cached client is reused across calls
+- transport errors drop the cached client so the next call reconnects
+- non-transport errors (payload bugs, ProtocolError, ...) do NOT churn
+  the cached connection
+"""
+
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import nats.errors
+import pytest
+
+from device_connect_server.portal.services import nats_rpc
+
+
+@pytest.fixture(autouse=True)
+def _reset_module_state():
+    """Reset the module-level cached client + lock between tests.
+
+    The cached client is module state — without this, tests would leak
+    fakes into each other and the lock would bind to a stale event loop.
+    """
+    nats_rpc._invoke_client = None
+    nats_rpc._invoke_client_lock = None
+    yield
+    nats_rpc._invoke_client = None
+    nats_rpc._invoke_client_lock = None
+
+
+def _make_fake_nc(*, request_reply: bytes | None = None, request_exc: Exception | None = None):
+    """Build a fake nats client that returns request_reply or raises request_exc."""
+    if request_reply is None:  # explicit None check: b"" is a legitimate reply payload
+        request_reply = b'{"jsonrpc":"2.0","id":"x","result":{}}'
+    nc = MagicMock()
+    nc.is_closed = False
+    if request_exc is not None:
+        nc.request = AsyncMock(side_effect=request_exc)
+    else:
+        nc.request = AsyncMock(return_value=MagicMock(data=request_reply))
+    nc.close = AsyncMock()
+    return nc
+
+
+class TestInvokeClientReuse:
+
+    @pytest.mark.asyncio
+    async def test_client_is_reused_across_calls(self):
+        """Two invoke() calls share the same underlying NATS client."""
+        fake_nc = _make_fake_nc(
+            request_reply=b'{"jsonrpc":"2.0","id":"1","result":{"ok":true}}',
+        )
+
+        with patch.object(nats_rpc, "connect", AsyncMock(return_value=fake_nc)) as connect_mock:
+            r1 = await nats_rpc.invoke("t", "dev-1", "fn", {})
+            r2 = await nats_rpc.invoke("t", "dev-2", "fn", {})
+
+        # Single connect() call shared across both invokes.
+        assert connect_mock.await_count == 1
+        # Both requests went through the same client.
+        assert fake_nc.request.await_count == 2
+        assert r1["result"]["ok"] is True
+        assert r2["result"]["ok"] is True
+
+    @pytest.mark.asyncio
+    async def test_client_is_reconnected_if_closed(self):
+        """A cached-but-closed client triggers a fresh connect on next call."""
+        first_nc = _make_fake_nc(
+            request_reply=b'{"jsonrpc":"2.0","id":"1","result":{}}',
+        )
+        second_nc = _make_fake_nc(
+            request_reply=b'{"jsonrpc":"2.0","id":"2","result":{}}',
+        )
+
+        with patch.object(
+            nats_rpc, "connect", AsyncMock(side_effect=[first_nc, second_nc]),
+        ) as connect_mock:
+            await nats_rpc.invoke("t", "dev-1", "fn", {})
+            # Simulate the broker closing the connection out from under us.
+            first_nc.is_closed = True
+            await nats_rpc.invoke("t", "dev-2", "fn", {})
+
+        assert connect_mock.await_count == 2
+
+
+class TestInvokeClientDropOnTransportError:
+
+    @pytest.mark.asyncio
+    async def test_transport_error_drops_cached_client(self):
+        """ConnectionClosedError forces a reconnect on the next call."""
+        bad_nc = _make_fake_nc(
+            request_exc=nats.errors.ConnectionClosedError(),
+        )
+        good_nc = _make_fake_nc(
+            request_reply=b'{"jsonrpc":"2.0","id":"2","result":{"ok":true}}',
+        )
+
+        with patch.object(
+            nats_rpc, "connect", AsyncMock(side_effect=[bad_nc, good_nc]),
+        ) as connect_mock:
+            err = await nats_rpc.invoke("t", "dev-1", "fn", {})
+            ok = await nats_rpc.invoke("t", "dev-2", "fn", {})
+
+        assert err["error"]["code"] == -3
+        assert ok["result"]["ok"] is True
+        # First connect for bad_nc, second connect after drop.
+        assert connect_mock.await_count == 2
+        # The bad client was best-effort closed when dropped.
+        bad_nc.close.assert_awaited_once()
+
+    @pytest.mark.asyncio
+    async def test_stale_connection_error_drops_cached_client(self):
+        """StaleConnectionError is also treated as transport-fatal."""
+        bad_nc = _make_fake_nc(
+            request_exc=nats.errors.StaleConnectionError(),
+        )
+        good_nc = _make_fake_nc(
+            request_reply=b'{"jsonrpc":"2.0","id":"2","result":{}}',
+        )
+
+        with patch.object(
+            nats_rpc, "connect", AsyncMock(side_effect=[bad_nc, good_nc]),
+        ) as connect_mock:
+            await nats_rpc.invoke("t", "dev-1", "fn", {})
+            await nats_rpc.invoke("t", "dev-2", "fn", {})
+
+        assert connect_mock.await_count == 2
+
+    @pytest.mark.asyncio
+    async def test_os_error_drops_cached_client(self):
+        """Raw OSError (socket-level) is treated as transport-fatal too."""
+        bad_nc = _make_fake_nc(
+            request_exc=OSError("socket reset"),
+        )
+        good_nc = _make_fake_nc(
+            request_reply=b'{"jsonrpc":"2.0","id":"2","result":{}}',
+        )
+
+        with patch.object(
+            nats_rpc, "connect", AsyncMock(side_effect=[bad_nc, good_nc]),
+        ) as connect_mock:
+            await nats_rpc.invoke("t", "dev-1", "fn", {})
+            await nats_rpc.invoke("t", "dev-2", "fn", {})
+
+        assert connect_mock.await_count == 2
+
+
+class TestInvokeClientKeptOnNonTransportError:
+
+    @pytest.mark.asyncio
+    async def test_protocol_error_keeps_cached_client(self):
+        """ProtocolError is a payload bug, not a connection death.
+
+        Dropping the client on every payload error would churn the
+        connection — exactly the regression we wanted to avoid by
+        narrowing the exception handler. The cached client must survive.
+        """
+        nc = MagicMock()
+        nc.is_closed = False
+        good_reply = MagicMock()
+        good_reply.data = b'{"jsonrpc":"2.0","id":"2","result":{"ok":true}}'
+        nc.request = AsyncMock(
+            side_effect=[nats.errors.ProtocolError(), good_reply],
+        )
+        nc.close = AsyncMock()
+
+        with patch.object(
+            nats_rpc, "connect", AsyncMock(return_value=nc),
+        ) as connect_mock:
+            err = await nats_rpc.invoke("t", "dev-1", "fn", {})
+            ok = await nats_rpc.invoke("t", "dev-1", "fn", {})
+
+        assert err["error"]["code"] == -3
+        assert ok["result"]["ok"] is True
+        # Single connect, no drop: the cached client was reused across
+        # the protocol error.
+        assert connect_mock.await_count == 1
+        nc.close.assert_not_called()
+
+    @pytest.mark.asyncio
+    async def test_no_responders_keeps_cached_client(self):
+        """NoRespondersError already has its own branch and never drops."""
+        nc = MagicMock()
+        nc.is_closed = False
+        good_reply = MagicMock()
+        good_reply.data = b'{"jsonrpc":"2.0","id":"2","result":{"ok":true}}'
+        nc.request = AsyncMock(
+            side_effect=[nats.errors.NoRespondersError(), good_reply],
+        )
+        nc.close = AsyncMock()
+
+        with patch.object(
+            nats_rpc, "connect", AsyncMock(return_value=nc),
+        ) as connect_mock:
+            no_resp = await nats_rpc.invoke("t", "dev-1", "fn", {})
+            ok = await nats_rpc.invoke("t", "dev-1", "fn", {})
+
+        assert no_resp["error"]["code"] == -1
+        assert ok["result"]["ok"] is True
+        assert connect_mock.await_count == 1
+        nc.close.assert_not_called()
+
+    @pytest.mark.asyncio
+    async def test_connection_reconnecting_keeps_cached_client(self):
+        """ConnectionReconnectingError means the client is *already*
+        reconnecting itself. Dropping the cached client in that state
+        would preempt nats-py's own reconnect machinery and amplify
+        broker flaps. The cached client must survive."""
+        nc = MagicMock()
+        nc.is_closed = False
+        good_reply = MagicMock()
+        good_reply.data = b'{"jsonrpc":"2.0","id":"2","result":{"ok":true}}'
+        nc.request = AsyncMock(
+            side_effect=[nats.errors.ConnectionReconnectingError(), good_reply],
+        )
+        nc.close = AsyncMock()
+
+        with patch.object(
+            nats_rpc, "connect", AsyncMock(return_value=nc),
+        ) as connect_mock:
+            reconnecting = await nats_rpc.invoke("t", "dev-1", "fn", {})
+            ok = await nats_rpc.invoke("t", "dev-1", "fn", {})
+
+        assert reconnecting["error"]["code"] == -3
+        assert ok["result"]["ok"] is True
+        # Single connect, no drop. The same cached client is reused
+        # once nats-py finishes its own internal reconnect.
+        assert connect_mock.await_count == 1
+        nc.close.assert_not_called()
+
+
+class TestCloseInvokeClient:
+    """``close_invoke_client`` is wired into ``aiohttp on_cleanup`` so the
+    long-lived socket is released at graceful shutdown rather than
+    leaking until interpreter exit."""
+
+    @pytest.mark.asyncio
+    async def test_close_invoke_client_closes_cached(self):
+        nc = _make_fake_nc(
+            request_reply=b'{"jsonrpc":"2.0","id":"1","result":{}}',
+        )
+        with patch.object(nats_rpc, "connect", AsyncMock(return_value=nc)):
+            await nats_rpc.invoke("t", "dev-1", "fn", {})
+            assert nats_rpc._invoke_client is nc
+            await nats_rpc.close_invoke_client()
+
+        nc.close.assert_awaited_once()
+        assert nats_rpc._invoke_client is None
+
+    @pytest.mark.asyncio
+    async def test_close_invoke_client_idempotent(self):
+        """Calling close twice (or before any invoke) must be a no-op."""
+        # No client cached yet — must not raise.
+        await nats_rpc.close_invoke_client()
+        assert nats_rpc._invoke_client is None
+        # And a second call is still safe.
+        await nats_rpc.close_invoke_client()
diff --git a/packages/device-connect-server/tests/device_connect_server/test_portal_agent_api.py b/packages/device-connect-server/tests/device_connect_server/test_portal_agent_api.py
index 287d299..d2a0d28 100644
--- a/packages/device-connect-server/tests/device_connect_server/test_portal_agent_api.py
+++ b/packages/device-connect-server/tests/device_connect_server/test_portal_agent_api.py
@@ -451,6 +451,174 @@ async def rpc_invoke(self, tenant, full_name, fn, params, timeout):
         assert seen["timeout"] == agent_api.MAX_INVOKE_TIMEOUT_S
 
 
+# ── revoke endpoint failure modes (regression) ────────────────────
+
+
+@pytest.fixture
+def provision_record():
+    return {
+        "token_id": "prov0",
+        "username": "alice",
+        "tenant": "acme",
+        "role": "user",
+        "scopes": ["devices:provision"],
+        "created_at": "2026-05-01T00:00:00+00:00",
+    }
+
+
+@pytest.fixture
+async def provision_client(provision_record):
+    app = _build_app()
+    server = TestServer(app)
+    async with server:
+        async with TestClient(server) as cli:
+            with patch.object(tokens_svc, "verify_token", return_value=provision_record):
+                yield cli
+
+
+class _RevokeBackend:
+    """Minimal stand-in for the credentials backend in revoke tests.
+
+    - ``supports_remove=False`` simulates a backend with no
+      ``remove_device`` attribute (e.g. plain MQTT). The revoke
+      handler must treat this as a soft success and still drop the
+      local file.
+    - ``raise_on_remove`` simulates a hard backend failure (the broker
+      rejected the revoke). The handler must NOT delete the local
+      file in that case, so the operator can retry once the backend
+      is healthy.
+    """
+
+    def __init__(self, *, supports_remove: bool = True,
+                 raise_on_remove: Exception | None = None):
+        self._supports = supports_remove
+        self._raise = raise_on_remove
+        self.removed: list[tuple[str, str]] = []
+        self.reloaded = 0
+        if supports_remove:
+            async def _remove(tenant: str, full_name: str) -> None:
+                if self._raise is not None:
+                    raise self._raise
+                self.removed.append((tenant, full_name))
+            self.remove_device = _remove
+
+    def backend_name(self) -> str:
+        return "test"
+
+    async def reload_broker(self) -> None:
+        self.reloaded += 1
+
+
+class TestDeviceRevoke:
+    async def test_hard_backend_failure_keeps_file(self, provision_client):
+        """If remove_device raises, the local cred file must remain in place.
+
+        The old behavior deleted the file regardless, leaving a
+        still-valid backend account with no local credential file and no
+        operator-visible signal. Now the handler returns 502 and the
+        delete_credential helper is never called.
+        """
+        backend = _RevokeBackend(raise_on_remove=RuntimeError("nats unreachable"))
+        deleted = []
+
+        def _delete(filename):
+            deleted.append(filename)
+            return True
+
+        with patch(
+            "device_connect_server.portal.views.agent_api.get_backend",
+            return_value=backend,
+        ), patch(
+            "device_connect_server.portal.views.agent_api.credentials_svc.delete_credential",
+            side_effect=_delete,
+        ):
+            r = await provision_client.post(
+                "/api/agent/v1/devices/cam-001/revoke", headers=H(),
+            )
+            assert r.status == 502
+            body = await r.json()
+            assert body["error"]["code"] == "backend_revoke_failed"
+            # The critical assertion: file is NOT deleted on backend failure.
+            assert deleted == []
+
+    async def test_unsupported_backend_is_soft_success(self, provision_client):
+        """A backend without remove_device must still delete the file.
+
+        Older / MQTT-style backends never had a remove primitive; the
+        only thing the operator can do is drop the file so the UI
+        reflects intent.
+        """
+        backend = _RevokeBackend(supports_remove=False)
+        deleted = []
+
+        def _delete(filename):
+            deleted.append(filename)
+            return True
+
+        with patch(
+            "device_connect_server.portal.views.agent_api.get_backend",
+            return_value=backend,
+        ), patch(
+            "device_connect_server.portal.views.agent_api.credentials_svc.delete_credential",
+            side_effect=_delete,
+        ):
+            r = await provision_client.post(
+                "/api/agent/v1/devices/cam-001/revoke", headers=H(),
+            )
+            assert r.status == 200
+            body = await r.json()
+            assert body["result"]["revoked"] is True
+            assert body["result"]["device_id"] == "acme-cam-001"
+            # The unsupported-backend warning must still surface so an
+            # operator who expected a real revoke knows it didn't happen.
+            assert "backend_warning" in body["result"]
+            assert deleted == ["acme-cam-001.creds.json"]
+
+    async def test_backend_succeeds_then_file_missing_explains_partial(
+        self, provision_client,
+    ):
+        """File-delete fails after backend revoke succeeded: 404 must
+        say the account is already revoked so the operator doesn't
+        think the whole operation failed."""
+        backend = _RevokeBackend()  # remove_device succeeds
+
+        with patch(
+            "device_connect_server.portal.views.agent_api.get_backend",
+            return_value=backend,
+        ), patch(
+            "device_connect_server.portal.views.agent_api.credentials_svc.delete_credential",
+            return_value=False,
+        ):
+            r = await provision_client.post(
+                "/api/agent/v1/devices/cam-001/revoke", headers=H(),
+            )
+            assert r.status == 404
+            body = await r.json()
+            # Be tolerant about exact wording, but the message must
+            # name the partial-success state explicitly.
+            msg = body["error"]["message"].lower()
+            assert "already revoked" in msg or "account" in msg
+
+    async def test_happy_path(self, provision_client):
+        backend = _RevokeBackend()
+        with patch(
+            "device_connect_server.portal.views.agent_api.get_backend",
+            return_value=backend,
+        ), patch(
+            "device_connect_server.portal.views.agent_api.credentials_svc.delete_credential",
+            return_value=True,
+        ):
+            r = await provision_client.post(
+                "/api/agent/v1/devices/cam-001/revoke", headers=H(),
+            )
+            assert r.status == 200
+            body = await r.json()
+            assert body["result"]["revoked"] is True
+            assert "backend_warning" not in body["result"]
+            assert backend.removed == [("acme", "acme-cam-001")]
+            assert backend.reloaded == 1
+
+
 # ── invoke-with-fallback duplicate device id (regression) ─────────
 
 
diff --git a/packages/device-connect-server/tests/device_connect_server/test_portal_auth_redirect.py b/packages/device-connect-server/tests/device_connect_server/test_portal_auth_redirect.py
new file mode 100644
index 0000000..b6508c9
--- /dev/null
+++ b/packages/device-connect-server/tests/device_connect_server/test_portal_auth_redirect.py
@@ -0,0 +1,151 @@
+# Copyright (c) 2024-2026, Arm Limited and Contributors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""Browser-session auth redirect tests.
+
+After a portal restart the dashboard's 10s htmx poll on
+``/api/devices/live`` fires without a session cookie. The auth
+middleware used to capture that fragment URL as the post-login
+``next`` target, so re-logging in dropped the user onto the bare
+fragment instead of ``/dashboard``. These tests pin the fix: fragment
+endpoints (htmx requests, and anything under ``/api/``) must not become
+post-login destinations.
+"""
+
+from __future__ import annotations
+
+from urllib.parse import parse_qs, urlsplit
+
+from aiohttp import web
+from aiohttp.test_utils import TestClient, TestServer
+
+from device_connect_server.portal.app import auth_middleware
+from device_connect_server.portal.views.auth import _safe_next
+
+
+def _next_param(location: str) -> str | None:
+    """Return the decoded ``next`` query value of a Location header, if any.
+
+    aiohttp normalizes percent-encoding in Location, so callers compare
+    this decoded value instead of the raw header text.
+    """
+    query = urlsplit(location).query
+    candidates = parse_qs(query).get("next", [])
+    return candidates[0] if candidates else None
+
+
+# ── _safe_next ────────────────────────────────────────────────────
+
+
+class TestSafeNext:
+    def test_full_page_path_accepted(self):
+        assert _safe_next("/dashboard") == "/dashboard"
+        assert _safe_next("/cli/approval/xyz") == "/cli/approval/xyz"
+
+    def test_external_url_rejected(self):
+        assert _safe_next("https://evil.example.com/") is None
+
+    def test_protocol_relative_rejected(self):
+        assert _safe_next("//evil.example.com/path") is None
+
+    def test_crlf_rejected(self):
+        assert _safe_next("/dashboard\r\nSet-Cookie: evil=1") is None
+        assert _safe_next("/dashboard\nfoo") is None
+
+    def test_api_path_rejected(self):
+        """``/api/`` returns JSON/HTML fragments, not full pages. Even if
+        a stale or malicious link arrives with ``?next=/api/...``, login
+        must not land the user there."""
+        assert _safe_next("/api/devices/live?tenant=alpha") is None
+        assert _safe_next("/api/devices/cam-001/live-detail") is None
+        assert _safe_next("/api/admin/tenants-table") is None
+
+    def test_static_path_rejected(self):
+        assert _safe_next("/static/css/app.css") is None
+
+    def test_empty_or_none(self):
+        assert _safe_next(None) is None
+        assert _safe_next("") is None
+
+
+# ── auth_middleware redirect target ──────────────────────────────
+
+
+async def _stub(_request):
+    return web.Response(text="ok")
+
+
+def _build_app() -> web.Application:
+    app = web.Application(middlewares=[auth_middleware])
+    # Routes that the auth middleware will gate. We don't actually
+    # reach the handler since the middleware short-circuits with a
+    # redirect when there's no session cookie.
+    app.router.add_get("/dashboard", _stub)
+    app.router.add_get("/api/devices/live", _stub)
+    app.router.add_get("/cli/approval/{token}", _stub)
+    return app
+
+
+class TestAuthMiddlewareRedirect:
+    """The unauthenticated browser-session path."""
+
+    async def test_htmx_poll_does_not_capture_fragment_as_next(self):
+        """The original repro: an htmx poll on a fragment endpoint
+        triggers an HX-Redirect to /login *without* ``?next=`` so the
+        post-login destination falls back to /dashboard."""
+        app = _build_app()
+        async with TestServer(app) as server:
+            async with TestClient(server) as cli:
+                r = await cli.get(
+                    "/api/devices/live?tenant=alpha",
+                    headers={"HX-Request": "true"},
+                    allow_redirects=False,
+                )
+                assert r.status == 200
+                redirect = r.headers.get("HX-Redirect", "")
+                assert redirect == "/login", (
+                    f"htmx fragment poll must redirect to /login with no "
+                    f"``next`` capture; got {redirect!r}"
+                )
+
+    async def test_api_request_does_not_capture_next(self):
+        """Even a non-htmx GET on an /api/ path (e.g. a script or stale
+        link) must not become a post-login destination."""
+        app = _build_app()
+        async with TestServer(app) as server:
+            async with TestClient(server) as cli:
+                r = await cli.get(
+                    "/api/devices/live?tenant=alpha",
+                    allow_redirects=False,
+                )
+                assert r.status == 302
+                assert r.headers["Location"] == "/login"
+
+    async def test_top_level_navigation_captures_next(self):
+        """Top-level HTML navigation to an authenticated page still
+        gets a ``next`` capture so the CLI-approval flow works."""
+        app = _build_app()
+        async with TestServer(app) as server:
+            async with TestClient(server) as cli:
+                r = await cli.get(
+                    "/cli/approval/abc123",
+                    allow_redirects=False,
+                )
+                assert r.status == 302
+                location = r.headers["Location"]
+                assert location.startswith("/login?next=")
+                assert _next_param(location) == "/cli/approval/abc123"
+
+    async def test_top_level_dashboard_captures_next(self):
+        """Sanity check: regular page navigation still captures the
+        page URL as ``next`` so login lands the user back where they
+        were."""
+        app = _build_app()
+        async with TestServer(app) as server:
+            async with TestClient(server) as cli:
+                r = await cli.get("/dashboard", allow_redirects=False)
+                assert r.status == 302
+                location = r.headers["Location"]
+                assert location.startswith("/login?next=")
+                assert _next_param(location) == "/dashboard"
diff --git a/packages/device-connect-server/tests/device_connect_server/test_registry_service.py b/packages/device-connect-server/tests/device_connect_server/test_registry_service.py
index 4a91f0e..dd4df55 100644
--- a/packages/device-connect-server/tests/device_connect_server/test_registry_service.py
+++ b/packages/device-connect-server/tests/device_connect_server/test_registry_service.py
@@ -25,6 +25,7 @@
 
 from device_connect_server.registry.service.registry import (  # noqa: E402
     DeviceRegistry,
+    _enlarge_etcd_pool,
     _kv_key,
     register,
     refresh,
@@ -261,6 +262,139 @@ def test_list_devices_multi_tenant_isolation(self, mock_etcd3gw):
         assert calls[1] == call("/device-connect/tenant-b/devices/")
 
 
+# ---------------------------------------------------------------------------
+# TestListDevicesPage — pagination for large fleets
+# ---------------------------------------------------------------------------
+
+
+class TestListDevicesPage:
+    """Verify list_devices_page slices the filtered fleet and reports metadata."""
+
+    @staticmethod
+    def _mock_fleet(mock_client, n):
+        """Populate mock etcd with n devices ordered by device_id."""
+        mock_client.get_prefix.return_value = [
+            (
+                json.dumps({"device_id": f"dev-{i:04d}"}),
+                {"key": _b64(f"/device-connect/default/devices/dev-{i:04d}")},
+            )
+            for i in range(n)
+        ]
+
+    @patch("device_connect_server.registry.service.registry.etcd3gw")
+    def test_page_returns_slice_and_next_offset(self, mock_etcd3gw):
+        mock_client = _make_mock_etcd_client()
+        self._mock_fleet(mock_client, 350)
+        mock_etcd3gw.client.return_value = mock_client
+
+        reg = DeviceRegistry(host="localhost", port=2379)
+        page, next_offset, total = reg.list_devices_page(
+            "default", offset=0, limit=100,
+        )
+
+        assert len(page) == 100
+        assert page[0]["device_id"] == "dev-0000"
+        assert page[-1]["device_id"] == "dev-0099"
+        assert next_offset == 100
+        assert total == 350
+
+    @patch("device_connect_server.registry.service.registry.etcd3gw")
+    def test_page_final_sets_next_offset_none(self, mock_etcd3gw):
+        mock_client = _make_mock_etcd_client()
+        self._mock_fleet(mock_client, 250)
+        mock_etcd3gw.client.return_value = mock_client
+
+        reg = DeviceRegistry(host="localhost", port=2379)
+        page, next_offset, total = reg.list_devices_page(
+            "default", offset=200, limit=100,
+        )
+
+        assert len(page) == 50
+        assert page[0]["device_id"] == "dev-0200"
+        assert next_offset is None
+        assert total == 250
+
+    @patch("device_connect_server.registry.service.registry.etcd3gw")
+    def test_page_offset_past_end_returns_empty(self, mock_etcd3gw):
+        mock_client = _make_mock_etcd_client()
+        self._mock_fleet(mock_client, 10)
+        mock_etcd3gw.client.return_value = mock_client
+
+        reg = DeviceRegistry(host="localhost", port=2379)
+        page, next_offset, total = reg.list_devices_page(
+            "default", offset=50, limit=10,
+        )
+
+        assert page == []
+        assert next_offset is None
+        assert total == 10
+
+    @patch("device_connect_server.registry.service.registry.etcd3gw")
+    def test_page_limit_none_returns_remaining(self, mock_etcd3gw):
+        mock_client = _make_mock_etcd_client()
+        self._mock_fleet(mock_client, 5)
+        mock_etcd3gw.client.return_value = mock_client
+
+        reg = DeviceRegistry(host="localhost", port=2379)
+        page, next_offset, total = reg.list_devices_page(
+            "default", offset=0, limit=None,
+        )
+
+        assert len(page) == 5
+        assert next_offset is None
+        assert total == 5
+
+    @patch("device_connect_server.registry.service.registry.etcd3gw")
+    def test_page_walks_full_fleet(self, mock_etcd3gw):
+        """Looping with next_offset must reconstruct the full fleet."""
+        mock_client = _make_mock_etcd_client()
+        self._mock_fleet(mock_client, 1400)  # the actual blocker scale
+        mock_etcd3gw.client.return_value = mock_client
+
+        reg = DeviceRegistry(host="localhost", port=2379)
+        gathered = []
+        offset = 0
+        while True:
+            page, next_offset, total = reg.list_devices_page(
+                "default", offset=offset, limit=100,
+            )
+            gathered.extend(page)
+            if next_offset is None:
+                break
+            offset = next_offset
+
+        assert total == 1400
+        assert len(gathered) == 1400
+        # Order must be stable across pages
+        assert [d["device_id"] for d in gathered] == [
+            f"dev-{i:04d}" for i in range(1400)
+        ]
+
+    @patch("device_connect_server.registry.service.registry.etcd3gw")
+    def test_page_respects_filter(self, mock_etcd3gw):
+        """device_type filter applies before pagination."""
+        mock_client = _make_mock_etcd_client()
+        mock_client.get_prefix.return_value = [
+            (json.dumps({"device_id": "cam-001", "identity": {"device_type": "camera"}}),
+             {"key": _b64("/d")}),
+            (json.dumps({"device_id": "arm-001", "identity": {"device_type": "robot"}}),
+             {"key": _b64("/d")}),
+            (json.dumps({"device_id": "cam-002", "identity": {"device_type": "camera"}}),
+             {"key": _b64("/d")}),
+        ]
+        mock_etcd3gw.client.return_value = mock_client
+
+        reg = DeviceRegistry(host="localhost", port=2379)
+        page, next_offset, total = reg.list_devices_page(
+            "default", device_type="camera", offset=0, limit=10,
+        )
+
+        assert total == 2
+        assert len(page) == 2
+        assert all(d["identity"]["device_type"] == "camera" for d in page)
+        assert next_offset is None
+
+
 # ---------------------------------------------------------------------------
 # TestGetDevice (via update_status reading pattern)
 # ---------------------------------------------------------------------------
@@ -559,3 +693,43 @@ def test_returns_raw_on_decode_failure(self):
     def test_returns_empty_when_key_missing(self):
         result = _kv_key({})
         assert result == ""
+
+
+# ---------------------------------------------------------------------------
+# TestEnlargeEtcdPool
+# ---------------------------------------------------------------------------
+
+
+class TestEnlargeEtcdPool:
+    """The fix mounts an oversized ``requests`` HTTPAdapter onto the etcd3gw
+    client's underlying ``requests.Session`` so the registry doesn't
+    bottleneck on the default 10-socket pool under a registration herd.
+    The fallback path (etcd3gw stops exposing ``client.session``) must
+    warn loudly rather than silently regress to the old pool size."""
+
+    def test_mounts_adapter_on_existing_session(self):
+        session = MagicMock()
+        client = MagicMock()
+        client.session = session
+
+        _enlarge_etcd_pool(client, pool_size=64)
+
+        # Both http:// and https:// must be mounted so we don't leave a
+        # surviving small-pool adapter on either scheme.
+        scheme_args = [call.args[0] for call in session.mount.call_args_list]
+        assert "http://" in scheme_args
+        assert "https://" in scheme_args
+
+    def test_logs_warning_when_session_missing(self, caplog):
+        # ``spec=[]`` makes ``hasattr(client, "session")`` return False
+        # without raising — simulating a future etcd3gw refactor that
+        # renames or hides the session attribute.
+        client = MagicMock(spec=[])
+
+        with caplog.at_level("WARNING", logger="device_connect_server.registry.service.registry"):
+            _enlarge_etcd_pool(client, pool_size=64)
+
+        assert any(
+            "session" in rec.message and "urllib3 default" in rec.message
+            for rec in caplog.records
+        ), f"expected pool-fallback warning, got: {[r.message for r in caplog.records]}"
diff --git a/packages/device-connect-server/tests/device_connect_server/test_rpc_handlers.py b/packages/device-connect-server/tests/device_connect_server/test_rpc_handlers.py
index 1eed021..2976dc1 100644
--- a/packages/device-connect-server/tests/device_connect_server/test_rpc_handlers.py
+++ b/packages/device-connect-server/tests/device_connect_server/test_rpc_handlers.py
@@ -121,6 +121,7 @@ def mock_registry():
     with patch("device_connect_server.registry.service.main.registry") as mock_reg:
         mock_reg.register = MagicMock()
         mock_reg.list_devices = MagicMock(return_value=[])
+        mock_reg.list_devices_page = MagicMock(return_value=([], None, 0))
         mock_reg.get_device = MagicMock(return_value=None)
         mock_reg.refresh = MagicMock()
         mock_reg.update_status = MagicMock()
@@ -296,20 +297,49 @@ async def test_register_no_reply_returns_silently(self, messaging, mock_registry
 class TestListDevicesHandler:
 
     @pytest.mark.asyncio
-    async def test_list_devices_success(self, messaging, mock_registry):
+    async def test_list_devices_legacy_no_limit_returns_unbounded(self, messaging, mock_registry):
+        """Without ``limit`` the handler takes the legacy unpaged path."""
         mock_registry.list_devices.return_value = [SAMPLE_DEVICE]
         handler = _make_list_handler(TENANT, messaging)
-        data = _rpc_request("discovery/listDevices", {})
-        await handler(data, "reply-sub")
+        await handler(_rpc_request("discovery/listDevices", {}), "reply-sub")
 
+        # Legacy path: list_devices is called, list_devices_page is not.
         mock_registry.list_devices.assert_called_once_with(
             TENANT, device_type=None, location=None,
         )
+        mock_registry.list_devices_page.assert_not_called()
+
         response = json.loads(messaging.publish.call_args[0][1])
         assert response["result"]["devices"] == [SAMPLE_DEVICE]
+        # Legacy reply shape: no pagination metadata leaks out, so old
+        # clients keep seeing exactly the reply shape they always got.
+        assert "next_offset" not in response["result"]
+        assert "total_matched" not in response["result"]
+
+    @pytest.mark.asyncio
+    async def test_list_devices_with_limit_paginates(self, messaging, mock_registry):
+        """Passing ``limit`` opts the caller into the paged contract."""
+        mock_registry.list_devices_page.return_value = ([SAMPLE_DEVICE], 100, 250)
+        handler = _make_list_handler(TENANT, messaging)
+        await handler(
+            _rpc_request("discovery/listDevices", {"limit": 100, "offset": 0}),
+            "reply-sub",
+        )
+
+        mock_registry.list_devices_page.assert_called_once()
+        call_kwargs = mock_registry.list_devices_page.call_args.kwargs
+        assert call_kwargs["offset"] == 0
+        assert call_kwargs["limit"] == 100
+        mock_registry.list_devices.assert_not_called()
+
+        response = json.loads(messaging.publish.call_args[0][1])
+        assert response["result"]["devices"] == [SAMPLE_DEVICE]
+        assert response["result"]["next_offset"] == 100
+        assert response["result"]["total_matched"] == 250
 
     @pytest.mark.asyncio
     async def test_list_devices_with_filters(self, messaging, mock_registry):
+        """Legacy filter forwarding (no ``limit`` -> unpaged path)."""
         mock_registry.list_devices.return_value = [SAMPLE_DEVICE]
         handler = _make_list_handler(TENANT, messaging)
         data = _rpc_request("discovery/listDevices", {
@@ -329,13 +359,15 @@ async def test_list_devices_empty(self, messaging, mock_registry):
 
         response = json.loads(messaging.publish.call_args[0][1])
         assert response["result"]["devices"] == []
+        # Legacy reply, so no total_matched.
+        assert "total_matched" not in response["result"]
 
     @pytest.mark.asyncio
     async def test_list_devices_with_acl(self, messaging, mock_registry):
+        """Legacy + ACL: filter shrinks the reply, no total_matched leaks."""
         mock_registry.list_devices.return_value = [SAMPLE_DEVICE, SAMPLE_DEVICE_2]
 
         acl_mgr = ACLManager()
-        # Hide camera-001 from robot-001
         acl_mgr.set_acl(DeviceACL(
             device_id="camera-001", tenant=TENANT,
             hidden_from=["robot-001"],
@@ -350,6 +382,231 @@ async def test_list_devices_with_acl(self, messaging, mock_registry):
         assert "camera-001" not in device_ids
         assert "robot-001" in device_ids
 
+    @pytest.mark.asyncio
+    async def test_list_devices_paged_acl_total_is_unfiltered(self, messaging, mock_registry):
+        """Documented caveat: total_matched reflects the pre-ACL match count,
+        and the page shrinks below ``limit`` after ACL drops hidden rows.
+        """
+        # Registry returns a full page of 2; the ACL then hides one.
+        mock_registry.list_devices_page.return_value = (
+            [SAMPLE_DEVICE, SAMPLE_DEVICE_2], None, 2,
+        )
+
+        acl_mgr = ACLManager()
+        acl_mgr.set_acl(DeviceACL(
+            device_id="camera-001", tenant=TENANT,
+            hidden_from=["robot-001"],
+        ))
+
+        handler = _make_list_handler(TENANT, messaging, acl_manager=acl_mgr)
+        await handler(
+            _rpc_request("discovery/listDevices", {
+                "limit": 2, "offset": 0, "requester_id": "robot-001",
+            }),
+            "reply-sub",
+        )
+
+        response = json.loads(messaging.publish.call_args[0][1])
+        # Page is shorter than limit because ACL dropped a row.
+        assert len(response["result"]["devices"]) == 1
+        # total_matched is the pre-ACL count, intentionally larger.
+        assert response["result"]["total_matched"] == 2
+
+    @pytest.mark.asyncio
+    async def test_list_devices_invalid_offset_returns_error(self, messaging, mock_registry):
+        """Malformed offset surfaces a clean JSON-RPC -32602 error."""
+        handler = _make_list_handler(TENANT, messaging)
+        await handler(
+            _rpc_request("discovery/listDevices", {"offset": "abc", "limit": 10}),
+            "reply-sub",
+        )
+
+        response = json.loads(messaging.publish.call_args[0][1])
+        assert response["error"]["code"] == -32602
+        assert "offset" in response["error"]["message"]
+        mock_registry.list_devices_page.assert_not_called()
+        mock_registry.list_devices.assert_not_called()
+
+    @pytest.mark.asyncio
+    async def test_list_devices_negative_offset_returns_error(self, messaging, mock_registry):
+        handler = _make_list_handler(TENANT, messaging)
+        await handler(
+            _rpc_request("discovery/listDevices", {"offset": -5, "limit": 10}),
+            "reply-sub",
+        )
+
+        response = json.loads(messaging.publish.call_args[0][1])
+        assert response["error"]["code"] == -32602
+        assert "non-negative" in response["error"]["message"]
+
+    @pytest.mark.asyncio
+    async def test_list_devices_invalid_limit_returns_error(self, messaging, mock_registry):
+        handler = _make_list_handler(TENANT, messaging)
+        await handler(
+            _rpc_request("discovery/listDevices", {"limit": "lots"}),
+            "reply-sub",
+        )
+
+        response = json.loads(messaging.publish.call_args[0][1])
+        assert response["error"]["code"] == -32602
+        assert "limit" in response["error"]["message"]
+        mock_registry.list_devices_page.assert_not_called()
+
+    @pytest.mark.asyncio
+    async def test_list_devices_zero_limit_returns_error(self, messaging, mock_registry):
+        """``limit=0`` is rejected rather than silently mapped to the cap."""
+        handler = _make_list_handler(TENANT, messaging)
+        await handler(
+            _rpc_request("discovery/listDevices", {"limit": 0}),
+            "reply-sub",
+        )
+
+        response = json.loads(messaging.publish.call_args[0][1])
+        assert response["error"]["code"] == -32602
+        assert "positive" in response["error"]["message"]
+        mock_registry.list_devices_page.assert_not_called()
+        mock_registry.list_devices.assert_not_called()
+
+    @pytest.mark.asyncio
+    async def test_list_devices_negative_limit_returns_error(self, messaging, mock_registry):
+        """Negative ``limit`` is rejected for the same reason as zero."""
+        handler = _make_list_handler(TENANT, messaging)
+        await handler(
+            _rpc_request("discovery/listDevices", {"limit": -3}),
+            "reply-sub",
+        )
+
+        response = json.loads(messaging.publish.call_args[0][1])
+        assert response["error"]["code"] == -32602
+        assert "positive" in response["error"]["message"]
+        mock_registry.list_devices_page.assert_not_called()
+
+    @pytest.mark.asyncio
+    async def test_list_devices_limit_above_cap_is_clamped(self, messaging, mock_registry):
+        """A client asking for more than ``_LIST_DEVICES_MAX_LIMIT`` is
+        silently clamped server-side; the wire response uses
+        ``next_offset`` to signal the page break so the client paginates
+        naturally rather than truncating."""
+        from device_connect_server.registry.service.main import (
+            _LIST_DEVICES_MAX_LIMIT,
+        )
+        # Registry mock stands in for one cap-sized page (a single sample
+        # device suffices here); next_offset signals there is more behind it.
+        mock_registry.list_devices_page.return_value = (
+            [SAMPLE_DEVICE], _LIST_DEVICES_MAX_LIMIT, 5000,
+        )
+        handler = _make_list_handler(TENANT, messaging)
+        await handler(
+            _rpc_request("discovery/listDevices", {"limit": 10000, "offset": 0}),
+            "reply-sub",
+        )
+
+        # Handler must clamp ``limit`` before calling the registry.
+        call_kwargs = mock_registry.list_devices_page.call_args.kwargs
+        assert call_kwargs["limit"] == _LIST_DEVICES_MAX_LIMIT
+
+        response = json.loads(messaging.publish.call_args[0][1])
+        # Client sees a forward-pointing next_offset, not an error reply
+        # or a silent truncation.
+        assert response["result"]["next_offset"] == _LIST_DEVICES_MAX_LIMIT
+        assert response["result"]["total_matched"] == 5000
+
+    @pytest.mark.asyncio
+    async def test_list_devices_clamp_warns_once_per_requested_limit(
+        self, messaging, mock_registry, caplog,
+    ):
+        """An over-cap ``limit`` logs a warning on first sight but stays
+        quiet on repeated requests with the same value, so a misconfigured
+        client doesn't spam the log on every page."""
+        from device_connect_server.registry.service import main as registry_main
+
+        # Reset the dedup set so this test is independent of prior tests.
+        registry_main._WARNED_LIMIT_CLAMPS.clear()
+
+        mock_registry.list_devices_page.return_value = (
+            [SAMPLE_DEVICE], registry_main._LIST_DEVICES_MAX_LIMIT, 5000,
+        )
+        handler = _make_list_handler(TENANT, messaging)
+
+        with caplog.at_level("WARNING", logger="device_registry_service"):
+            # First over-cap request: must warn.
+            await handler(
+                _rpc_request("discovery/listDevices", {"limit": 10000}),
+                "reply-sub",
+            )
+            warnings_after_first = [
+                r for r in caplog.records
+                if r.levelname == "WARNING" and "clamped to server cap" in r.message
+            ]
+            assert len(warnings_after_first) == 1, (
+                f"expected one clamp warning, got "
+                f"{[r.message for r in caplog.records]}"
+            )
+            assert "10000" in warnings_after_first[0].message
+
+            # Second request with same limit: must NOT warn again.
+            caplog.clear()
+            await handler(
+                _rpc_request("discovery/listDevices", {"limit": 10000}),
+                "reply-sub",
+            )
+            warnings_after_second = [
+                r for r in caplog.records if "clamped to server cap" in r.message
+            ]
+            assert warnings_after_second == [], (
+                "repeated over-cap requests with the same limit must not "
+                "re-warn"
+            )
+
+            # Different over-cap limit: warns again (new value).
+            caplog.clear()
+            await handler(
+                _rpc_request("discovery/listDevices", {"limit": 5000}),
+                "reply-sub",
+            )
+            warnings_after_new = [
+                r for r in caplog.records if "clamped to server cap" in r.message
+            ]
+            assert len(warnings_after_new) == 1
+            assert "5000" in warnings_after_new[0].message
+
+    @pytest.mark.asyncio
+    async def test_list_devices_at_or_under_cap_does_not_warn(
+        self, messaging, mock_registry, caplog,
+    ):
+        """A limit at or below the cap is the intended path — no clamp,
+        no warning. Pins that the warning is gated strictly on
+        ``effective < requested``."""
+        from device_connect_server.registry.service import main as registry_main
+
+        registry_main._WARNED_LIMIT_CLAMPS.clear()
+        mock_registry.list_devices_page.return_value = (
+            [SAMPLE_DEVICE], None, 1,
+        )
+        handler = _make_list_handler(TENANT, messaging)
+
+        with caplog.at_level("WARNING", logger="device_registry_service"):
+            # Exactly at cap.
+            await handler(
+                _rpc_request("discovery/listDevices", {
+                    "limit": registry_main._LIST_DEVICES_MAX_LIMIT,
+                }),
+                "reply-sub",
+            )
+            # Well under cap.
+            await handler(
+                _rpc_request("discovery/listDevices", {"limit": 10}),
+                "reply-sub",
+            )
+
+        clamp_warnings = [
+            r for r in caplog.records if "clamped to server cap" in r.message
+        ]
+        assert clamp_warnings == [], (
+            f"under-cap requests must not warn; got: "
+            f"{[r.message for r in clamp_warnings]}"
+        )
+
     @pytest.mark.asyncio
     async def test_list_devices_registry_error(self, messaging, mock_registry):
         mock_registry.list_devices.side_effect = RuntimeError("etcd down")
@@ -367,6 +624,7 @@ async def test_discovery_no_reply_returns_silently(self, messaging, mock_registr
 
         messaging.publish.assert_not_called()
         mock_registry.list_devices.assert_not_called()
+        mock_registry.list_devices_page.assert_not_called()
 
 
 # ---------------------------------------------------------------------------