From 747936a14a3cfcf759e60ce2f9c4556f7e5600ae Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 09:04:01 +0000 Subject: [PATCH 01/29] feat(clickhouse): Add clickhouse-connect (HTTP) driver behind runtime config Introduce an HTTP-based ClickHouse client backed by `clickhouse-connect` as a drop-in alternative to the native `clickhouse-driver` protocol. - New `ClickhouseConnectPool` (snuba/clickhouse/connect.py) exposes the same `execute`/`execute_robust`/`close` interface as `ClickhousePool` and returns the same `ClickhouseResult`, so callers and the `NativeDriverReader` need no changes. - It relies on clickhouse-connect's built-in urllib3 connection pool (sized to `max_pool_size`) instead of maintaining our own LifoQueue, with sessions disabled so the shared client can serve concurrent queries. - `ClickhousePool` transparently dispatches each query to the HTTP pool when the `use_clickhouse_connect_driver` runtime config is enabled and an HTTP port is known. The default comes from the new `USE_CLICKHOUSE_CONNECT_DRIVER` setting (off by default), so the migration can be rolled out and rolled back without a deploy. - The cluster's HTTP port is threaded through `ConnectionCache` so the main query/read path can select the driver at runtime. - Error and concurrency-limit retry semantics mirror the native pool. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_011MC5N7AYMhJDBkJ4P7engC --- pyproject.toml | 3 + snuba/clickhouse/connect.py | 339 +++++++++++++++++++++++++++++++ snuba/clickhouse/native.py | 79 +++++++ snuba/clusters/cluster.py | 9 + snuba/settings/__init__.py | 5 + tests/clickhouse/test_connect.py | 110 ++++++++++ 6 files changed, 545 insertions(+) create mode 100644 snuba/clickhouse/connect.py create mode 100644 tests/clickhouse/test_connect.py diff --git a/pyproject.toml b/pyproject.toml index 8b98a282ac5..e900a904597 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,6 +4,7 @@ version = "26.7.0.dev0" dependencies = [ "blinker>=1.9", "click>=8.1.7", + "clickhouse-connect>=0.8.0", "clickhouse-driver>=0.2.10", "confluent-kafka>=2.7.0", "datadog>=0.49.1", @@ -125,6 +126,8 @@ exclude = ["^rust_snuba/", "^tests/datasets/", "^tests/query/"] [[tool.mypy.overrides]] module = [ "_strptime", + "clickhouse_connect", + "clickhouse_connect.*", "clickhouse_driver", "clickhouse_driver.errors", "confluent_kafka", diff --git a/snuba/clickhouse/connect.py b/snuba/clickhouse/connect.py new file mode 100644 index 00000000000..64b7ce3b1bc --- /dev/null +++ b/snuba/clickhouse/connect.py @@ -0,0 +1,339 @@ +from __future__ import annotations + +import logging +import time +from threading import Lock +from typing import Any, Mapping, Optional, Sequence + +import clickhouse_connect +import sentry_sdk +from clickhouse_connect import common as clickhouse_connect_common +from clickhouse_connect.driver.client import Client +from clickhouse_connect.driver.exceptions import DatabaseError, OperationalError +from clickhouse_connect.driver.httputil import get_pool_manager + +from snuba import environment, settings, state +from snuba.clickhouse.errors import ClickhouseError +from snuba.clickhouse.native import ClickhouseProfile, ClickhouseResult, Params +from snuba.utils.metrics.wrapper import MetricsWrapper + +logger = logging.getLogger("snuba.clickhouse.connect") + +metrics = MetricsWrapper(environment.metrics, "clickhouse.connect") + +# Error code returned by ClickHouse when the maximum number of simultaneous +# queries has been exceeded. Kept as a local constant to avoid depending on +# clickhouse_driver from the HTTP code path. +TOO_MANY_SIMULTANEOUS_QUERIES = 202 + +# clickhouse-connect raises a ProgrammingError by default when it is asked to +# send a setting it considers unknown or readonly. The native driver simply +# forwards whatever settings it is given to the server, so to preserve parity +# we tell clickhouse-connect to drop unrecognized settings instead of failing. +clickhouse_connect_common.set_setting("invalid_setting_action", "drop") + + +class ClickhouseConnectPool(object): + """ + HTTP based ClickHouse client backed by ``clickhouse-connect``. + + It exposes the same ``execute`` / ``execute_robust`` interface as + :class:`snuba.clickhouse.native.ClickhousePool` so it can be used as a + drop-in replacement behind a runtime config flag. + + Unlike the native pool, this class does not maintain its own queue of + connections: ``clickhouse-connect`` manages an HTTP connection pool (via + ``urllib3``) for us. A single :class:`Client` is created lazily and reused + across threads, with the underlying pool sized to ``max_pool_size``. + """ + + def __init__( + self, + host: str, + http_port: int, + user: str, + password: str, + database: str, + secure: bool = False, + ca_certs: Optional[str] = None, + verify: Optional[bool] = False, + connect_timeout: int = 1, + send_receive_timeout: Optional[int] = 35, + max_pool_size: int = settings.CLICKHOUSE_MAX_POOL_SIZE, + client_settings: Mapping[str, Any] = {}, + ) -> None: + self.host = host + self.http_port = http_port + self.user = user + self.password = password + self.database = database + self.secure = secure + self.ca_certs = ca_certs + self.verify = verify + self.connect_timeout = connect_timeout + self.send_receive_timeout = send_receive_timeout + self.max_pool_size = max_pool_size + self.client_settings = client_settings + + self.__client: Optional[Client] = None + self.__lock = Lock() + + def _get_client(self) -> Client: + # The client (and its handshake with the server) is created lazily so + # that simply constructing a pool does not open a connection. + if self.__client is None: + with self.__lock: + if self.__client is None: + pool_mgr = get_pool_manager( + ca_cert=self.ca_certs, + verify=bool(self.verify), + maxsize=self.max_pool_size, + # All requests go to a single host, so a single pool is + # enough. Keep a small margin for safety. + num_pools=2, + ) + self.__client = clickhouse_connect.get_client( + host=self.host, + port=self.http_port, + username=self.user, + password=self.password, + database=self.database, + interface="https" if self.secure else "http", + secure=self.secure, + verify=bool(self.verify), + ca_cert=self.ca_certs, + connect_timeout=self.connect_timeout, + send_receive_timeout=( + self.send_receive_timeout + if self.send_receive_timeout is not None + else 300 + ), + settings=dict(self.client_settings), + pool_mgr=pool_mgr, + # The native driver applies no implicit row limit; match + # that behavior here. + query_limit=0, + # Sessions serialize queries on the server. We share a + # single client across threads, so sessions must be + # disabled to allow concurrent queries. + autogenerate_session_id=False, + # We do our own retrying in execute()/execute_robust(). + query_retries=0, + ) + return self.__client + + def _build_query_settings( + self, + settings: Optional[Mapping[str, Any]], + query_id: Optional[str], + capture_trace: bool, + ) -> Optional[Mapping[str, Any]]: + query_settings = dict(settings) if settings else {} + if query_id is not None: + query_settings["query_id"] = query_id + if capture_trace: + query_settings["send_logs_level"] = "trace" + return query_settings or None + + def _execute_once( + self, + query: str, + params: Params, + with_column_types: bool, + query_id: Optional[str], + settings: Optional[Mapping[str, Any]], + columnar: bool, + capture_trace: bool, + ) -> ClickhouseResult: + client = self._get_client() + query_settings = self._build_query_settings(settings, query_id, capture_trace) + + with sentry_sdk.start_span(description=query, op="db.clickhouse") as span: + span.set_data(sentry_sdk.consts.SPANDATA.DB_SYSTEM, "clickhouse") + span.set_data("query_id", query_id) + span.set_data("settings", query_settings) + query_result = client.query( + query, + parameters=params if params else None, + settings=query_settings, + column_oriented=columnar, + ) + + summary = query_result.summary or {} + + def _int(key: str) -> int: + value = summary.get(key) + try: + return int(value) if value is not None else 0 + except (TypeError, ValueError): + return 0 + + elapsed_ns = summary.get("elapsed_ns") + try: + elapsed = float(elapsed_ns) / 1e9 if elapsed_ns is not None else 0.0 + except (TypeError, ValueError): + elapsed = 0.0 + + profile_data = ClickhouseProfile( + blocks=0, + bytes=_int("read_bytes"), + elapsed=elapsed, + progress_bytes=_int("read_bytes"), + rows=_int("read_rows"), + ) + + results: Sequence[Any] = query_result.result_set + + if with_column_types: + meta = [ + (name, column_type.name) + for name, column_type in zip(query_result.column_names, query_result.column_types) + ] + return ClickhouseResult( + results=results, + meta=meta, + profile=profile_data, + trace_output="", + ) + + return ClickhouseResult( + results=results, + profile=profile_data, + trace_output="", + ) + + def execute( + self, + query: str, + params: Params = None, + with_column_types: bool = False, + query_id: Optional[str] = None, + settings: Optional[Mapping[str, Any]] = None, + types_check: bool = False, + columnar: bool = False, + capture_trace: bool = False, + retryable: bool = True, + ) -> ClickhouseResult: + """ + Execute a clickhouse query with a single quick retry in case of + connection failure. Mirrors ``ClickhousePool.execute``. + """ + attempts_remaining = 3 if retryable else 1 + + while attempts_remaining > 0: + attempts_remaining -= 1 + try: + return self._execute_once( + query, + params, + with_column_types, + query_id, + settings, + columnar, + capture_trace, + ) + except OperationalError as e: + metrics.increment( + "connection_error", + tags={ + "host": self.host, + "port": str(self.http_port), + "user": self.user, + "database": self.database, + }, + ) + if attempts_remaining <= 0: + raise ClickhouseError(str(e), code=getattr(e, "code", None) or -1) from e + # Short sleep to give the load balancer a chance to mark a bad + # host as down. + time.sleep(0.1) + except DatabaseError as e: + code = getattr(e, "code", None) + if code == TOO_MANY_SIMULTANEOUS_QUERIES: + if attempts_remaining <= 0: + raise ClickhouseError(str(e), code=code) from e + + sleep_interval_seconds = state.get_config( + "simultaneous_queries_sleep_seconds", None + ) + if not sleep_interval_seconds: + raise ClickhouseError(str(e), code=code) from e + + attempts_remaining = min(attempts_remaining, 1) # only retry once + time.sleep(sleep_interval_seconds) + continue + + raise ClickhouseError(str(e), code=code or -1) from e + + return ClickhouseResult() + + def execute_robust( + self, + query: str, + params: Params = None, + with_column_types: bool = False, + query_id: Optional[str] = None, + settings: Optional[Mapping[str, Any]] = None, + types_check: bool = False, + columnar: bool = False, + capture_trace: bool = False, + retryable: bool = True, + ) -> ClickhouseResult: + """ + Execute a clickhouse query with a bit more tenacity, making more retry + attempts and waiting a second between retries. Mirrors + ``ClickhousePool.execute_robust``. + """ + total_attempts = 3 if retryable else 1 + attempts_remaining = total_attempts + + while True: + try: + return self.execute( + query, + params=params, + with_column_types=with_column_types, + query_id=query_id, + settings=settings, + types_check=types_check, + columnar=columnar, + capture_trace=capture_trace, + ) + except OperationalError as e: + logger.warning( + "ClickHouse query execution failed: %s (%d tries left)", + str(e), + attempts_remaining, + ) + attempts_remaining -= 1 + if attempts_remaining <= 0: + raise ClickhouseError(str(e), code=getattr(e, "code", None) or -1) from e + time.sleep(1) + continue + except ClickhouseError as e: + logger.warning( + "ClickHouse query execution failed: %s (%d tries left)", + str(e), + attempts_remaining, + ) + if e.code == TOO_MANY_SIMULTANEOUS_QUERIES: + attempts_remaining -= 1 + if attempts_remaining <= 0: + raise e + sleep_interval_seconds = state.get_config( + "simultaneous_queries_sleep_seconds", 1 + ) + assert sleep_interval_seconds is not None + # Linear backoff. Adds one second at each iteration. + time.sleep( + float((total_attempts - attempts_remaining) * sleep_interval_seconds) + ) + continue + else: + # Quit immediately for other types of server errors. + raise e + + def close(self) -> None: + if self.__client is not None: + self.__client.close() + self.__client = None diff --git a/snuba/clickhouse/native.py b/snuba/clickhouse/native.py index a737a5e30d4..b8630783653 100644 --- a/snuba/clickhouse/native.py +++ b/snuba/clickhouse/native.py @@ -9,6 +9,7 @@ from datetime import date, datetime from io import StringIO from typing import ( + TYPE_CHECKING, Any, Dict, Generator, @@ -33,6 +34,9 @@ from snuba.utils.metrics.gauge import ThreadSafeGauge from snuba.utils.metrics.wrapper import MetricsWrapper +if TYPE_CHECKING: + from snuba.clickhouse.connect import ClickhouseConnectPool + ignore_logger("clickhouse_driver.connection") logger = logging.getLogger("snuba.clickhouse") @@ -88,6 +92,7 @@ def __init__( max_pool_size: int = settings.CLICKHOUSE_MAX_POOL_SIZE, pool_get_timeout_seconds: float = settings.CLICKHOUSE_POOL_GET_TIMEOUT_SECONDS, client_settings: Mapping[str, Any] = {}, + http_port: Optional[int] = None, ) -> None: self.host = host self.port = port @@ -101,14 +106,58 @@ def __init__( self.send_receive_timeout = send_receive_timeout self.pool_get_timeout_seconds = pool_get_timeout_seconds self.client_settings = client_settings + self.http_port = http_port + self.max_pool_size = max_pool_size self.pool: queue.LifoQueue[Optional[Client]] = queue.LifoQueue(max_pool_size) self.__gauge = ThreadSafeGauge(metrics, "connections") + # When the runtime config flag is enabled (and an HTTP port is known) + # queries are transparently routed to clickhouse-connect over HTTP + # instead of the native protocol. The connect pool is created lazily. + self.__connect_pool: Optional["ClickhouseConnectPool"] = None + # Fill the queue up so that doing get() on it will block properly for _ in range(max_pool_size): self.pool.put(None) + def _use_clickhouse_connect(self) -> bool: + """ + Whether queries should be routed through clickhouse-connect (HTTP) + instead of the native protocol. Controlled by a runtime config flag so + the migration can be rolled out (and rolled back) without a deploy. + + Only possible when the HTTP port is known for this connection. + """ + # ``getattr`` keeps this safe for test doubles that subclass + # ClickhousePool without calling ``__init__``. + if getattr(self, "http_port", None) is None: + return False + default = 1 if settings.USE_CLICKHOUSE_CONNECT_DRIVER else 0 + return bool(state.get_int_config("use_clickhouse_connect_driver", default)) + + def _get_connect_pool(self) -> "ClickhouseConnectPool": + if self.__connect_pool is None: + # Imported lazily to avoid a circular import at module load time. + from snuba.clickhouse.connect import ClickhouseConnectPool + + assert self.http_port is not None + self.__connect_pool = ClickhouseConnectPool( + host=self.host, + http_port=self.http_port, + user=self.user, + password=self.password, + database=self.database, + secure=self.secure, + ca_certs=self.ca_certs, + verify=self.verify, + connect_timeout=self.connect_timeout, + send_receive_timeout=self.send_receive_timeout, + max_pool_size=self.max_pool_size, + client_settings=self.client_settings, + ) + return self.__connect_pool + # This will actually return an int if an INSERT query is run, but we never capture the # output of INSERT queries so I left this as a Sequence. def execute( @@ -131,6 +180,19 @@ def execute( return relatively quickly with an error in case of more persistent failures. """ + if self._use_clickhouse_connect(): + return self._get_connect_pool().execute( + query, + params=params, + with_column_types=with_column_types, + query_id=query_id, + settings=settings, + types_check=types_check, + columnar=columnar, + capture_trace=capture_trace, + retryable=retryable, + ) + try: conn = self.pool.get(block=True, timeout=self.pool_get_timeout_seconds) except queue.Empty: @@ -280,6 +342,19 @@ def execute_robust( query successfully or else quit altogether. Note that each retry in this loop will be doubled by the retry in execute() """ + if self._use_clickhouse_connect(): + return self._get_connect_pool().execute_robust( + query, + params=params, + with_column_types=with_column_types, + query_id=query_id, + settings=settings, + types_check=types_check, + columnar=columnar, + capture_trace=capture_trace, + retryable=retryable, + ) + total_attempts = 3 if retryable else 1 attempts_remaining = total_attempts @@ -351,6 +426,10 @@ def _create_conn(self) -> Client: ) def close(self) -> None: + if self.__connect_pool is not None: + self.__connect_pool.close() + self.__connect_pool = None + try: while True: conn = self.pool.get(block=False) diff --git a/snuba/clusters/cluster.py b/snuba/clusters/cluster.py index 60fd982a62d..6ac7578c846 100644 --- a/snuba/clusters/cluster.py +++ b/snuba/clusters/cluster.py @@ -175,6 +175,7 @@ def get_batch_writer( bool, Optional[str], Optional[bool], + Optional[int], ] @@ -193,6 +194,7 @@ def get_node_connection( secure: bool, ca_certs: Optional[str], verify: Optional[bool], + http_port: Optional[int] = None, ) -> ClickhousePool: with self.__lock: settings, timeout = client_settings.value @@ -205,6 +207,7 @@ def get_node_connection( secure, ca_certs, verify, + http_port, ) if cache_key not in self.__cache: self.__cache[cache_key] = ClickhousePool( @@ -218,6 +221,7 @@ def get_node_connection( secure=secure, ca_certs=ca_certs, verify=verify, + http_port=http_port, ) return self.__cache[cache_key] @@ -327,6 +331,11 @@ def get_node_connection( self.__secure, self.__ca_certs, self.__verify, + # The HTTP port is uniform across the cluster, so it is safe to use + # the cluster's configured http_port for every node. This is what + # enables the clickhouse-connect (HTTP) code path to be selected at + # runtime; over the native protocol it is simply ignored. + self.__http_port, ) def get_deleter(self) -> Reader: diff --git a/snuba/settings/__init__.py b/snuba/settings/__init__.py index a1aebf43709..4ac7fe2434d 100644 --- a/snuba/settings/__init__.py +++ b/snuba/settings/__init__.py @@ -99,6 +99,11 @@ # pool.get() when ClickHouse is hung but not dropping connections. CLICKHOUSE_POOL_GET_TIMEOUT_SECONDS = 5 +# Default for routing ClickHouse queries through clickhouse-connect (over the +# HTTP protocol) instead of the native clickhouse-driver protocol. This default +# can be overridden at runtime with the `use_clickhouse_connect_driver` config. +USE_CLICKHOUSE_CONNECT_DRIVER = False + CLUSTERS: Sequence[Mapping[str, Any]] = [ { "host": os.environ.get("CLICKHOUSE_HOST", "127.0.0.1"), diff --git a/tests/clickhouse/test_connect.py b/tests/clickhouse/test_connect.py new file mode 100644 index 00000000000..99a0c48e511 --- /dev/null +++ b/tests/clickhouse/test_connect.py @@ -0,0 +1,110 @@ +from typing import Any +from unittest import mock + +import pytest + +from snuba import state +from snuba.clickhouse.connect import ( + TOO_MANY_SIMULTANEOUS_QUERIES, + ClickhouseConnectPool, +) +from snuba.clickhouse.errors import ClickhouseError + + +class FakeColumnType: + def __init__(self, name: str) -> None: + self.name = name + + +class FakeQueryResult: + def __init__( + self, + result_set: Any, + column_names: Any = (), + column_types: Any = (), + summary: Any = None, + ) -> None: + self.result_set = result_set + self.column_names = column_names + self.column_types = column_types + self.summary = summary or {} + + +def _make_pool(client: mock.Mock) -> ClickhouseConnectPool: + pool = ClickhouseConnectPool( + host="host", + http_port=8123, + user="test", + password="test", + database="test", + ) + # Avoid creating a real client / connection. + pool._get_client = lambda: client # type: ignore[method-assign] + return pool + + +def test_execute_maps_result_and_profile() -> None: + client = mock.Mock() + client.query.return_value = FakeQueryResult( + result_set=[[1, "a"], [2, "b"]], + column_names=("id", "name"), + column_types=(FakeColumnType("UInt64"), FakeColumnType("String")), + summary={"read_rows": "2", "read_bytes": "128", "elapsed_ns": "1500000000"}, + ) + + pool = _make_pool(client) + result = pool.execute("SELECT id, name FROM t", with_column_types=True) + + assert result.results == [[1, "a"], [2, "b"]] + assert result.meta == [("id", "UInt64"), ("name", "String")] + assert result.profile is not None + assert result.profile["rows"] == 2 + assert result.profile["bytes"] == 128 + assert result.profile["elapsed"] == 1.5 + + +def test_execute_passes_query_id_and_settings() -> None: + client = mock.Mock() + client.query.return_value = FakeQueryResult(result_set=[]) + + pool = _make_pool(client) + pool.execute( + "SELECT 1", + query_id="my-query-id", + settings={"max_threads": 4}, + ) + + _, kwargs = client.query.call_args + assert kwargs["settings"]["query_id"] == "my-query-id" + assert kwargs["settings"]["max_threads"] == 4 + + +@pytest.mark.redis_db +def test_too_many_simultaneous_queries_retry() -> None: + from clickhouse_connect.driver.exceptions import DatabaseError + + state.set_config("simultaneous_queries_sleep_seconds", 0.01) + + client = mock.Mock() + client.query.side_effect = DatabaseError("too many", code=TOO_MANY_SIMULTANEOUS_QUERIES) + + pool = _make_pool(client) + with pytest.raises(ClickhouseError): + pool.execute("SELECT 1") + + # Initial attempt + one retry once the concurrency limit is hit. + assert client.query.call_count == 2 + + +@pytest.mark.redis_db +def test_operational_error_retries() -> None: + from clickhouse_connect.driver.exceptions import OperationalError + + client = mock.Mock() + client.query.side_effect = OperationalError("connection refused") + + pool = _make_pool(client) + with pytest.raises(ClickhouseError): + pool.execute("SELECT 1", retryable=True) + + assert client.query.call_count == 3 From 925c465b81d6c5c06c01a881a0cef9d7bbc2c7c4 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 09:14:42 +0000 Subject: [PATCH 02/29] refactor(clickhouse): Delegate connect-pool retries to clickhouse-connect Drop the hand-rolled retry logic from ClickhouseConnectPool and rely solely on clickhouse-connect's built-in retry behavior (stale keep-alive sockets, transport errors, and HTTP 429/503/504). This also abandons the native pool's TOO_MANY_SIMULTANEOUS_QUERIES backoff for the HTTP path: clickhouse-connect does not retry that error (ClickHouse returns it as an HTTP 500 with an exception-code header, not a 429/503/504), so it is now surfaced directly to the caller as a ClickhouseError that preserves the code. execute_robust becomes a thin alias for execute, and the `retryable` argument is kept only for interface parity with ClickhousePool. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_011MC5N7AYMhJDBkJ4P7engC --- snuba/clickhouse/connect.py | 159 ++++++++++--------------------- tests/clickhouse/test_connect.py | 32 ++++--- 2 files changed, 66 insertions(+), 125 deletions(-) diff --git a/snuba/clickhouse/connect.py b/snuba/clickhouse/connect.py index 64b7ce3b1bc..b8ff36a93ca 100644 --- a/snuba/clickhouse/connect.py +++ b/snuba/clickhouse/connect.py @@ -1,7 +1,6 @@ from __future__ import annotations import logging -import time from threading import Lock from typing import Any, Mapping, Optional, Sequence @@ -12,7 +11,7 @@ from clickhouse_connect.driver.exceptions import DatabaseError, OperationalError from clickhouse_connect.driver.httputil import get_pool_manager -from snuba import environment, settings, state +from snuba import environment, settings from snuba.clickhouse.errors import ClickhouseError from snuba.clickhouse.native import ClickhouseProfile, ClickhouseResult, Params from snuba.utils.metrics.wrapper import MetricsWrapper @@ -21,11 +20,6 @@ metrics = MetricsWrapper(environment.metrics, "clickhouse.connect") -# Error code returned by ClickHouse when the maximum number of simultaneous -# queries has been exceeded. Kept as a local constant to avoid depending on -# clickhouse_driver from the HTTP code path. -TOO_MANY_SIMULTANEOUS_QUERIES = 202 - # clickhouse-connect raises a ProgrammingError by default when it is asked to # send a setting it considers unknown or readonly. The native driver simply # forwards whatever settings it is given to the server, so to preserve parity @@ -117,8 +111,6 @@ def _get_client(self) -> Client: # single client across threads, so sessions must be # disabled to allow concurrent queries. autogenerate_session_id=False, - # We do our own retrying in execute()/execute_robust(). - query_retries=0, ) return self.__client @@ -215,57 +207,42 @@ def execute( retryable: bool = True, ) -> ClickhouseResult: """ - Execute a clickhouse query with a single quick retry in case of - connection failure. Mirrors ``ClickhousePool.execute``. + Execute a clickhouse query. + + Unlike :class:`snuba.clickhouse.native.ClickhousePool`, this method + does not implement any retry logic of its own. Retries (stale + keep-alive sockets, transport errors and HTTP 429/503/504 responses) + are handled internally by clickhouse-connect. Notably this means the + native pool's ``TOO_MANY_SIMULTANEOUS_QUERIES`` backoff is *not* + replicated: clickhouse-connect does not retry that error, so it is + surfaced directly to the caller. + + The ``retryable`` argument is accepted for interface parity with the + native pool but has no effect here. """ - attempts_remaining = 3 if retryable else 1 - - while attempts_remaining > 0: - attempts_remaining -= 1 - try: - return self._execute_once( - query, - params, - with_column_types, - query_id, - settings, - columnar, - capture_trace, - ) - except OperationalError as e: - metrics.increment( - "connection_error", - tags={ - "host": self.host, - "port": str(self.http_port), - "user": self.user, - "database": self.database, - }, - ) - if attempts_remaining <= 0: - raise ClickhouseError(str(e), code=getattr(e, "code", None) or -1) from e - # Short sleep to give the load balancer a chance to mark a bad - # host as down. - time.sleep(0.1) - except DatabaseError as e: - code = getattr(e, "code", None) - if code == TOO_MANY_SIMULTANEOUS_QUERIES: - if attempts_remaining <= 0: - raise ClickhouseError(str(e), code=code) from e - - sleep_interval_seconds = state.get_config( - "simultaneous_queries_sleep_seconds", None - ) - if not sleep_interval_seconds: - raise ClickhouseError(str(e), code=code) from e - - attempts_remaining = min(attempts_remaining, 1) # only retry once - time.sleep(sleep_interval_seconds) - continue - - raise ClickhouseError(str(e), code=code or -1) from e - - return ClickhouseResult() + try: + return self._execute_once( + query, + params, + with_column_types, + query_id, + settings, + columnar, + capture_trace, + ) + except OperationalError as e: + metrics.increment( + "connection_error", + tags={ + "host": self.host, + "port": str(self.http_port), + "user": self.user, + "database": self.database, + }, + ) + raise ClickhouseError(str(e), code=getattr(e, "code", None) or -1) from e + except DatabaseError as e: + raise ClickhouseError(str(e), code=getattr(e, "code", None) or -1) from e def execute_robust( self, @@ -280,58 +257,20 @@ def execute_robust( retryable: bool = True, ) -> ClickhouseResult: """ - Execute a clickhouse query with a bit more tenacity, making more retry - attempts and waiting a second between retries. Mirrors - ``ClickhousePool.execute_robust``. + Mirrors :meth:`ClickhousePool.execute_robust`. Since retries are + delegated to clickhouse-connect, this is equivalent to :meth:`execute`. """ - total_attempts = 3 if retryable else 1 - attempts_remaining = total_attempts - - while True: - try: - return self.execute( - query, - params=params, - with_column_types=with_column_types, - query_id=query_id, - settings=settings, - types_check=types_check, - columnar=columnar, - capture_trace=capture_trace, - ) - except OperationalError as e: - logger.warning( - "ClickHouse query execution failed: %s (%d tries left)", - str(e), - attempts_remaining, - ) - attempts_remaining -= 1 - if attempts_remaining <= 0: - raise ClickhouseError(str(e), code=getattr(e, "code", None) or -1) from e - time.sleep(1) - continue - except ClickhouseError as e: - logger.warning( - "ClickHouse query execution failed: %s (%d tries left)", - str(e), - attempts_remaining, - ) - if e.code == TOO_MANY_SIMULTANEOUS_QUERIES: - attempts_remaining -= 1 - if attempts_remaining <= 0: - raise e - sleep_interval_seconds = state.get_config( - "simultaneous_queries_sleep_seconds", 1 - ) - assert sleep_interval_seconds is not None - # Linear backoff. Adds one second at each iteration. - time.sleep( - float((total_attempts - attempts_remaining) * sleep_interval_seconds) - ) - continue - else: - # Quit immediately for other types of server errors. - raise e + return self.execute( + query, + params=params, + with_column_types=with_column_types, + query_id=query_id, + settings=settings, + types_check=types_check, + columnar=columnar, + capture_trace=capture_trace, + retryable=retryable, + ) def close(self) -> None: if self.__client is not None: diff --git a/tests/clickhouse/test_connect.py b/tests/clickhouse/test_connect.py index 99a0c48e511..bcedf516273 100644 --- a/tests/clickhouse/test_connect.py +++ b/tests/clickhouse/test_connect.py @@ -3,13 +3,13 @@ import pytest -from snuba import state -from snuba.clickhouse.connect import ( - TOO_MANY_SIMULTANEOUS_QUERIES, - ClickhouseConnectPool, -) +from snuba.clickhouse.connect import ClickhouseConnectPool from snuba.clickhouse.errors import ClickhouseError +# Error code returned by ClickHouse when the maximum number of simultaneous +# queries has been exceeded. +TOO_MANY_SIMULTANEOUS_QUERIES = 202 + class FakeColumnType: def __init__(self, name: str) -> None: @@ -79,25 +79,27 @@ def test_execute_passes_query_id_and_settings() -> None: assert kwargs["settings"]["max_threads"] == 4 -@pytest.mark.redis_db -def test_too_many_simultaneous_queries_retry() -> None: +def test_too_many_simultaneous_queries_not_retried() -> None: + # We delegate all retries to clickhouse-connect, which does not retry the + # TOO_MANY_SIMULTANEOUS_QUERIES error. It should be surfaced directly, + # mapped to a ClickhouseError that preserves the error code. from clickhouse_connect.driver.exceptions import DatabaseError - state.set_config("simultaneous_queries_sleep_seconds", 0.01) - client = mock.Mock() client.query.side_effect = DatabaseError("too many", code=TOO_MANY_SIMULTANEOUS_QUERIES) pool = _make_pool(client) - with pytest.raises(ClickhouseError): + with pytest.raises(ClickhouseError) as exc_info: pool.execute("SELECT 1") - # Initial attempt + one retry once the concurrency limit is hit. - assert client.query.call_count == 2 + assert exc_info.value.code == TOO_MANY_SIMULTANEOUS_QUERIES + # No retry on top of clickhouse-connect's own handling. + assert client.query.call_count == 1 -@pytest.mark.redis_db -def test_operational_error_retries() -> None: +def test_operational_error_mapped_without_extra_retries() -> None: + # Connection-level retries are clickhouse-connect's responsibility; we only + # map the surfaced error onto ClickhouseError without retrying again. from clickhouse_connect.driver.exceptions import OperationalError client = mock.Mock() @@ -107,4 +109,4 @@ def test_operational_error_retries() -> None: with pytest.raises(ClickhouseError): pool.execute("SELECT 1", retryable=True) - assert client.query.call_count == 3 + assert client.query.call_count == 1 From 15c08f253b7c756ab6895bfa19ed4a53d88303f6 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 09:19:00 +0000 Subject: [PATCH 03/29] fix(clickhouse): Match native exception handling and cap timeout at 30s - Wrap the full clickhouse-connect error family in the connect pool: catch the base ClickHouseError (DatabaseError, ProgrammingError, DataError, ...) in addition to OperationalError, and surface everything as a snuba ClickhouseError preserving the server code, mirroring how ClickhousePool wraps the clickhouse_driver errors.Error family. OperationalError still emits the connection_error metric like the native NetworkError path. - Cap the connect/send-receive timeouts at 30s. Settings profiles such as MIGRATE carry very large timeouts that are inappropriate for the HTTP query path, so they are clamped down (and an unset timeout defaults to 30s). Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_011MC5N7AYMhJDBkJ4P7engC --- snuba/clickhouse/connect.py | 28 +++++++++++--- tests/clickhouse/test_connect.py | 64 ++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 5 deletions(-) diff --git a/snuba/clickhouse/connect.py b/snuba/clickhouse/connect.py index b8ff36a93ca..481304444c7 100644 --- a/snuba/clickhouse/connect.py +++ b/snuba/clickhouse/connect.py @@ -8,7 +8,7 @@ import sentry_sdk from clickhouse_connect import common as clickhouse_connect_common from clickhouse_connect.driver.client import Client -from clickhouse_connect.driver.exceptions import DatabaseError, OperationalError +from clickhouse_connect.driver.exceptions import ClickHouseError, OperationalError from clickhouse_connect.driver.httputil import get_pool_manager from snuba import environment, settings @@ -20,6 +20,11 @@ metrics = MetricsWrapper(environment.metrics, "clickhouse.connect") +# The native pool caps its send/receive timeout at 35s; for the HTTP path we +# cap it at 30s. Any larger (or unset) timeout coming from a client settings +# profile is clamped down to this value. +MAX_SEND_RECEIVE_TIMEOUT_SECONDS = 30 + # clickhouse-connect raises a ProgrammingError by default when it is asked to # send a setting it considers unknown or readonly. The native driver simply # forwards whatever settings it is given to the server, so to preserve parity @@ -96,11 +101,16 @@ def _get_client(self) -> Client: secure=self.secure, verify=bool(self.verify), ca_cert=self.ca_certs, - connect_timeout=self.connect_timeout, - send_receive_timeout=( + connect_timeout=min(self.connect_timeout, MAX_SEND_RECEIVE_TIMEOUT_SECONDS), + # Cap the read timeout at 30s regardless of what the + # client settings profile asks for (some profiles, e.g. + # MIGRATE, use very large timeouts that are not + # appropriate for the HTTP query path). + send_receive_timeout=min( self.send_receive_timeout if self.send_receive_timeout is not None - else 300 + else MAX_SEND_RECEIVE_TIMEOUT_SECONDS, + MAX_SEND_RECEIVE_TIMEOUT_SECONDS, ), settings=dict(self.client_settings), pool_mgr=pool_mgr, @@ -231,6 +241,9 @@ def execute( capture_trace, ) except OperationalError as e: + # Connection/transport level failures. Mirrors the native pool's + # handling of NetworkError/SocketTimeoutError by emitting the + # connection_error metric before surfacing the error. metrics.increment( "connection_error", tags={ @@ -241,7 +254,12 @@ def execute( }, ) raise ClickhouseError(str(e), code=getattr(e, "code", None) or -1) from e - except DatabaseError as e: + except ClickHouseError as e: + # ClickHouseError is the base class for every clickhouse-connect + # error (DatabaseError, ProgrammingError, DataError, ...). The + # native pool likewise wraps the whole clickhouse_driver errors.Error + # family into ClickhouseError, preserving the server error code when + # there is one. raise ClickhouseError(str(e), code=getattr(e, "code", None) or -1) from e def execute_robust( diff --git a/tests/clickhouse/test_connect.py b/tests/clickhouse/test_connect.py index bcedf516273..6db18dab223 100644 --- a/tests/clickhouse/test_connect.py +++ b/tests/clickhouse/test_connect.py @@ -110,3 +110,67 @@ def test_operational_error_mapped_without_extra_retries() -> None: pool.execute("SELECT 1", retryable=True) assert client.query.call_count == 1 + + +def test_generic_clickhouse_error_wrapped() -> None: + # Any clickhouse-connect error (here a ProgrammingError) must be wrapped in + # a snuba ClickhouseError, matching how the native pool wraps the whole + # clickhouse_driver errors.Error family. + from clickhouse_connect.driver.exceptions import ProgrammingError + + client = mock.Mock() + client.query.side_effect = ProgrammingError("bad query") + + pool = _make_pool(client) + with pytest.raises(ClickhouseError): + pool.execute("SELECT 1") + + assert client.query.call_count == 1 + + +def test_send_receive_timeout_capped_at_30s() -> None: + import clickhouse_connect + + pool = ClickhouseConnectPool( + host="host", + http_port=8123, + user="test", + password="test", + database="test", + connect_timeout=60, + # A large timeout (e.g. coming from the MIGRATE settings profile) must + # be clamped down to 30s on the HTTP path. + send_receive_timeout=300, + ) + + with ( + mock.patch.object(clickhouse_connect, "get_client") as get_client, + mock.patch("snuba.clickhouse.connect.get_pool_manager"), + ): + pool._get_client() + + _, kwargs = get_client.call_args + assert kwargs["send_receive_timeout"] == 30 + assert kwargs["connect_timeout"] == 30 + + +def test_send_receive_timeout_default_when_unset() -> None: + import clickhouse_connect + + pool = ClickhouseConnectPool( + host="host", + http_port=8123, + user="test", + password="test", + database="test", + send_receive_timeout=None, + ) + + with ( + mock.patch.object(clickhouse_connect, "get_client") as get_client, + mock.patch("snuba.clickhouse.connect.get_pool_manager"), + ): + pool._get_client() + + _, kwargs = get_client.call_args + assert kwargs["send_receive_timeout"] == 30 From 2cfaab9c06622cbc4db4c124b650b870b2434063 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 09:22:31 +0000 Subject: [PATCH 04/29] feat(clickhouse): Add runtime override for connect pool size The clickhouse-connect HTTP pool defaults to CLICKHOUSE_MAX_POOL_SIZE, the same setting used by the native pool. Add a `clickhouse_connect_pool_size` runtime config that overrides the urllib3 pool's maxsize when set. The value is read once when the (cached) client is first created. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_011MC5N7AYMhJDBkJ4P7engC --- snuba/clickhouse/connect.py | 11 +++++++-- tests/clickhouse/test_connect.py | 41 ++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/snuba/clickhouse/connect.py b/snuba/clickhouse/connect.py index 481304444c7..5bde276aefa 100644 --- a/snuba/clickhouse/connect.py +++ b/snuba/clickhouse/connect.py @@ -11,7 +11,7 @@ from clickhouse_connect.driver.exceptions import ClickHouseError, OperationalError from clickhouse_connect.driver.httputil import get_pool_manager -from snuba import environment, settings +from snuba import environment, settings, state from snuba.clickhouse.errors import ClickhouseError from snuba.clickhouse.native import ClickhouseProfile, ClickhouseResult, Params from snuba.utils.metrics.wrapper import MetricsWrapper @@ -83,10 +83,17 @@ def _get_client(self) -> Client: if self.__client is None: with self.__lock: if self.__client is None: + # Default to the configured CLICKHOUSE_MAX_POOL_SIZE, but + # allow it to be overridden at runtime. The value is read + # once when the (cached) client is first created. + pool_size = ( + state.get_int_config("clickhouse_connect_pool_size", self.max_pool_size) + or self.max_pool_size + ) pool_mgr = get_pool_manager( ca_cert=self.ca_certs, verify=bool(self.verify), - maxsize=self.max_pool_size, + maxsize=pool_size, # All requests go to a single host, so a single pool is # enough. Keep a small margin for safety. num_pools=2, diff --git a/tests/clickhouse/test_connect.py b/tests/clickhouse/test_connect.py index 6db18dab223..5fef9f73572 100644 --- a/tests/clickhouse/test_connect.py +++ b/tests/clickhouse/test_connect.py @@ -174,3 +174,44 @@ def test_send_receive_timeout_default_when_unset() -> None: _, kwargs = get_client.call_args assert kwargs["send_receive_timeout"] == 30 + + +def test_pool_size_defaults_to_setting() -> None: + import clickhouse_connect + + from snuba import settings + + pool = ClickhouseConnectPool( + host="host", http_port=8123, user="test", password="test", database="test" + ) + + with ( + mock.patch.object(clickhouse_connect, "get_client"), + mock.patch("snuba.clickhouse.connect.get_pool_manager") as get_pool_manager, + ): + pool._get_client() + + _, kwargs = get_pool_manager.call_args + assert kwargs["maxsize"] == settings.CLICKHOUSE_MAX_POOL_SIZE + + +@pytest.mark.redis_db +def test_pool_size_runtime_override() -> None: + import clickhouse_connect + + from snuba import state + + state.set_config("clickhouse_connect_pool_size", 42) + + pool = ClickhouseConnectPool( + host="host", http_port=8123, user="test", password="test", database="test" + ) + + with ( + mock.patch.object(clickhouse_connect, "get_client"), + mock.patch("snuba.clickhouse.connect.get_pool_manager") as get_pool_manager, + ): + pool._get_client() + + _, kwargs = get_pool_manager.call_args + assert kwargs["maxsize"] == 42 From 0635e19ea91e4558907e273cb436d3fdefc3d8dd Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 09:34:51 +0000 Subject: [PATCH 05/29] refactor(clickhouse): Move driver selection out of the native pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Keep snuba/clickhouse/native.py purely the native driver — it no longer knows about clickhouse-connect. The choice of which driver to use is made one level higher, in the connection cache. - native.py is reverted to its original state (no http_port, no dispatch). - ClickhouseConnectPool now subclasses ClickhousePool and overrides execute/execute_robust/close, so it is a true drop-in with no type changes rippling through the callers (the dependency direction is connect -> native, which is correct). - ConnectionCache.get_node_connection reads the use_clickhouse_connect_driver runtime config (via the new cluster.use_clickhouse_connect_driver helper) and builds either a native or an HTTP pool, keyed by driver in the cache. - get_reader/get_deleter no longer cache their reader so the runtime toggle takes effect on the read path without a restart; the connections themselves remain cached by the connection cache. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_011MC5N7AYMhJDBkJ4P7engC --- snuba/clickhouse/connect.py | 26 +++++--- snuba/clickhouse/native.py | 79 ---------------------- snuba/clusters/cluster.py | 109 ++++++++++++++++++++----------- tests/clickhouse/test_connect.py | 7 +- 4 files changed, 94 insertions(+), 127 deletions(-) diff --git a/snuba/clickhouse/connect.py b/snuba/clickhouse/connect.py index 5bde276aefa..644ae32c6db 100644 --- a/snuba/clickhouse/connect.py +++ b/snuba/clickhouse/connect.py @@ -13,7 +13,12 @@ from snuba import environment, settings, state from snuba.clickhouse.errors import ClickhouseError -from snuba.clickhouse.native import ClickhouseProfile, ClickhouseResult, Params +from snuba.clickhouse.native import ( + ClickhousePool, + ClickhouseProfile, + ClickhouseResult, + Params, +) from snuba.utils.metrics.wrapper import MetricsWrapper logger = logging.getLogger("snuba.clickhouse.connect") @@ -32,13 +37,15 @@ clickhouse_connect_common.set_setting("invalid_setting_action", "drop") -class ClickhouseConnectPool(object): +class ClickhouseConnectPool(ClickhousePool): """ HTTP based ClickHouse client backed by ``clickhouse-connect``. - It exposes the same ``execute`` / ``execute_robust`` interface as - :class:`snuba.clickhouse.native.ClickhousePool` so it can be used as a - drop-in replacement behind a runtime config flag. + It subclasses :class:`snuba.clickhouse.native.ClickhousePool` and overrides + the ``execute`` / ``execute_robust`` / ``close`` interface so it is a true + drop-in replacement. The decision of which pool to instantiate is made by + the connection cache (see :mod:`snuba.clusters.cluster`), one level above + the individual drivers. Unlike the native pool, this class does not maintain its own queue of connections: ``clickhouse-connect`` manages an HTTP connection pool (via @@ -61,8 +68,11 @@ def __init__( max_pool_size: int = settings.CLICKHOUSE_MAX_POOL_SIZE, client_settings: Mapping[str, Any] = {}, ) -> None: + # Intentionally does not call ClickhousePool.__init__: the native queue + # of connections is not used here. ``port`` mirrors the base class + # attribute (it holds the HTTP port for this driver). self.host = host - self.http_port = http_port + self.port = http_port self.user = user self.password = password self.database = database @@ -100,7 +110,7 @@ def _get_client(self) -> Client: ) self.__client = clickhouse_connect.get_client( host=self.host, - port=self.http_port, + port=self.port, username=self.user, password=self.password, database=self.database, @@ -255,7 +265,7 @@ def execute( "connection_error", tags={ "host": self.host, - "port": str(self.http_port), + "port": str(self.port), "user": self.user, "database": self.database, }, diff --git a/snuba/clickhouse/native.py b/snuba/clickhouse/native.py index b8630783653..a737a5e30d4 100644 --- a/snuba/clickhouse/native.py +++ b/snuba/clickhouse/native.py @@ -9,7 +9,6 @@ from datetime import date, datetime from io import StringIO from typing import ( - TYPE_CHECKING, Any, Dict, Generator, @@ -34,9 +33,6 @@ from snuba.utils.metrics.gauge import ThreadSafeGauge from snuba.utils.metrics.wrapper import MetricsWrapper -if TYPE_CHECKING: - from snuba.clickhouse.connect import ClickhouseConnectPool - ignore_logger("clickhouse_driver.connection") logger = logging.getLogger("snuba.clickhouse") @@ -92,7 +88,6 @@ def __init__( max_pool_size: int = settings.CLICKHOUSE_MAX_POOL_SIZE, pool_get_timeout_seconds: float = settings.CLICKHOUSE_POOL_GET_TIMEOUT_SECONDS, client_settings: Mapping[str, Any] = {}, - http_port: Optional[int] = None, ) -> None: self.host = host self.port = port @@ -106,58 +101,14 @@ def __init__( self.send_receive_timeout = send_receive_timeout self.pool_get_timeout_seconds = pool_get_timeout_seconds self.client_settings = client_settings - self.http_port = http_port - self.max_pool_size = max_pool_size self.pool: queue.LifoQueue[Optional[Client]] = queue.LifoQueue(max_pool_size) self.__gauge = ThreadSafeGauge(metrics, "connections") - # When the runtime config flag is enabled (and an HTTP port is known) - # queries are transparently routed to clickhouse-connect over HTTP - # instead of the native protocol. The connect pool is created lazily. - self.__connect_pool: Optional["ClickhouseConnectPool"] = None - # Fill the queue up so that doing get() on it will block properly for _ in range(max_pool_size): self.pool.put(None) - def _use_clickhouse_connect(self) -> bool: - """ - Whether queries should be routed through clickhouse-connect (HTTP) - instead of the native protocol. Controlled by a runtime config flag so - the migration can be rolled out (and rolled back) without a deploy. - - Only possible when the HTTP port is known for this connection. - """ - # ``getattr`` keeps this safe for test doubles that subclass - # ClickhousePool without calling ``__init__``. - if getattr(self, "http_port", None) is None: - return False - default = 1 if settings.USE_CLICKHOUSE_CONNECT_DRIVER else 0 - return bool(state.get_int_config("use_clickhouse_connect_driver", default)) - - def _get_connect_pool(self) -> "ClickhouseConnectPool": - if self.__connect_pool is None: - # Imported lazily to avoid a circular import at module load time. - from snuba.clickhouse.connect import ClickhouseConnectPool - - assert self.http_port is not None - self.__connect_pool = ClickhouseConnectPool( - host=self.host, - http_port=self.http_port, - user=self.user, - password=self.password, - database=self.database, - secure=self.secure, - ca_certs=self.ca_certs, - verify=self.verify, - connect_timeout=self.connect_timeout, - send_receive_timeout=self.send_receive_timeout, - max_pool_size=self.max_pool_size, - client_settings=self.client_settings, - ) - return self.__connect_pool - # This will actually return an int if an INSERT query is run, but we never capture the # output of INSERT queries so I left this as a Sequence. def execute( @@ -180,19 +131,6 @@ def execute( return relatively quickly with an error in case of more persistent failures. """ - if self._use_clickhouse_connect(): - return self._get_connect_pool().execute( - query, - params=params, - with_column_types=with_column_types, - query_id=query_id, - settings=settings, - types_check=types_check, - columnar=columnar, - capture_trace=capture_trace, - retryable=retryable, - ) - try: conn = self.pool.get(block=True, timeout=self.pool_get_timeout_seconds) except queue.Empty: @@ -342,19 +280,6 @@ def execute_robust( query successfully or else quit altogether. Note that each retry in this loop will be doubled by the retry in execute() """ - if self._use_clickhouse_connect(): - return self._get_connect_pool().execute_robust( - query, - params=params, - with_column_types=with_column_types, - query_id=query_id, - settings=settings, - types_check=types_check, - columnar=columnar, - capture_trace=capture_trace, - retryable=retryable, - ) - total_attempts = 3 if retryable else 1 attempts_remaining = total_attempts @@ -426,10 +351,6 @@ def _create_conn(self) -> Client: ) def close(self) -> None: - if self.__connect_pool is not None: - self.__connect_pool.close() - self.__connect_pool = None - try: while True: conn = self.pool.get(block=False) diff --git a/snuba/clusters/cluster.py b/snuba/clusters/cluster.py index 6ac7578c846..d4dca0c4d68 100644 --- a/snuba/clusters/cluster.py +++ b/snuba/clusters/cluster.py @@ -18,7 +18,7 @@ import structlog -from snuba import settings +from snuba import settings, state from snuba.clickhouse.http import HTTPBatchWriter, InsertStatement, JSONRow from snuba.clickhouse.native import ClickhousePool, NativeDriverReader from snuba.clusters.storage_sets import ( @@ -176,9 +176,24 @@ def get_batch_writer( Optional[str], Optional[bool], Optional[int], + bool, ] +def use_clickhouse_connect_driver() -> bool: + """ + Whether new connections should use the clickhouse-connect (HTTP) driver + instead of the native protocol. This is the single place where the driver + is selected; the individual pools have no knowledge of one another. + + Controlled by a runtime config flag (defaulting to the + ``USE_CLICKHOUSE_CONNECT_DRIVER`` setting) so the migration can be rolled + out and rolled back without a deploy. + """ + default = 1 if settings.USE_CLICKHOUSE_CONNECT_DRIVER else 0 + return bool(state.get_int_config("use_clickhouse_connect_driver", default)) + + class ConnectionCache: def __init__(self) -> None: self.__cache: MutableMapping[CacheKey, ClickhousePool] = {} @@ -196,8 +211,10 @@ def get_node_connection( verify: Optional[bool], http_port: Optional[int] = None, ) -> ClickhousePool: + # The HTTP driver can only be used when the HTTP port is known. + use_connect = http_port is not None and use_clickhouse_connect_driver() with self.__lock: - settings, timeout = client_settings.value + client_settings_dict, timeout = client_settings.value cache_key = ( node, client_settings, @@ -208,21 +225,40 @@ def get_node_connection( ca_certs, verify, http_port, + use_connect, ) if cache_key not in self.__cache: - self.__cache[cache_key] = ClickhousePool( - node.host_name, - node.port, - user, - password, - database, - client_settings=settings, - send_receive_timeout=timeout, - secure=secure, - ca_certs=ca_certs, - verify=verify, - http_port=http_port, - ) + if use_connect: + assert http_port is not None + # Imported here so the native code path never imports the + # clickhouse-connect driver. + from snuba.clickhouse.connect import ClickhouseConnectPool + + self.__cache[cache_key] = ClickhouseConnectPool( + host=node.host_name, + http_port=http_port, + user=user, + password=password, + database=database, + client_settings=client_settings_dict, + send_receive_timeout=timeout, + secure=secure, + ca_certs=ca_certs, + verify=verify, + ) + else: + self.__cache[cache_key] = ClickhousePool( + node.host_name, + node.port, + user, + password, + database, + client_settings=client_settings_dict, + send_receive_timeout=timeout, + secure=secure, + ca_certs=ca_certs, + verify=verify, + ) return self.__cache[cache_key] @@ -287,8 +323,6 @@ def __init__( self.__single_node = single_node self.__cluster_name = cluster_name self.__distributed_cluster_name = distributed_cluster_name - self.__reader: Optional[Reader] = None - self.__deleter: Optional[Reader] = None self.__connection_cache = connection_cache self.__cache_partition_id = cache_partition_id self.__query_settings_prefix = query_settings_prefix @@ -332,32 +366,33 @@ def get_node_connection( self.__ca_certs, self.__verify, # The HTTP port is uniform across the cluster, so it is safe to use - # the cluster's configured http_port for every node. This is what - # enables the clickhouse-connect (HTTP) code path to be selected at - # runtime; over the native protocol it is simply ignored. + # the cluster's configured http_port for every node. The connection + # cache uses it to build a clickhouse-connect (HTTP) pool when the + # driver is enabled at runtime; otherwise it is unused. self.__http_port, ) def get_deleter(self) -> Reader: - if not self.__deleter: - # we need the connection to the storage nodes, not - # the distributed nodes - local_node = self.get_local_nodes()[0] - self.__deleter = NativeDriverReader( - cache_partition_id=f"{self.__cache_partition_id}_deletes", - client=self.get_node_connection(ClickhouseClientSettings.DELETE, local_node), - query_settings_prefix=self.__query_settings_prefix, - ) - return self.__deleter + # Not cached: the underlying connection (and therefore the driver) is + # resolved on each call so that the clickhouse-connect runtime toggle + # takes effect without a restart. The connection itself is still cached + # by the connection cache. + # we need the connection to the storage nodes, not + # the distributed nodes + local_node = self.get_local_nodes()[0] + return NativeDriverReader( + cache_partition_id=f"{self.__cache_partition_id}_deletes", + client=self.get_node_connection(ClickhouseClientSettings.DELETE, local_node), + query_settings_prefix=self.__query_settings_prefix, + ) def get_reader(self) -> Reader: - if not self.__reader: - self.__reader = NativeDriverReader( - cache_partition_id=self.__cache_partition_id, - client=self.get_query_connection(ClickhouseClientSettings.QUERY), - query_settings_prefix=self.__query_settings_prefix, - ) - return self.__reader + # Not cached: see get_deleter. + return NativeDriverReader( + cache_partition_id=self.__cache_partition_id, + client=self.get_query_connection(ClickhouseClientSettings.QUERY), + query_settings_prefix=self.__query_settings_prefix, + ) def get_batch_writer( self, diff --git a/tests/clickhouse/test_connect.py b/tests/clickhouse/test_connect.py index 5fef9f73572..f55c3acda1d 100644 --- a/tests/clickhouse/test_connect.py +++ b/tests/clickhouse/test_connect.py @@ -89,10 +89,11 @@ def test_too_many_simultaneous_queries_not_retried() -> None: client.query.side_effect = DatabaseError("too many", code=TOO_MANY_SIMULTANEOUS_QUERIES) pool = _make_pool(client) - with pytest.raises(ClickhouseError) as exc_info: + try: pool.execute("SELECT 1") - - assert exc_info.value.code == TOO_MANY_SIMULTANEOUS_QUERIES + raise AssertionError("expected a ClickhouseError to be raised") + except ClickhouseError as error: + assert error.code == TOO_MANY_SIMULTANEOUS_QUERIES # No retry on top of clickhouse-connect's own handling. assert client.query.call_count == 1 From 740897e1b3f53838b0301b39941e926f3a9b27c2 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 09:49:03 +0000 Subject: [PATCH 06/29] refactor(clickhouse): Introduce a driver-selecting pool Replace the per-call, cache-key-driven driver choice with a single DriverSelectingPool that owns both the native and the clickhouse-connect pool and routes each query to one of them based on the use_clickhouse_connect_driver runtime config. - Both underlying pools are now created once per connection target and cached together inside the selecting pool. Flipping the runtime config no longer creates a pool on the fly; it only changes which already-built pool a query is routed to. - DriverSelectingPool subclasses ClickhousePool (overriding execute/execute_robust/close), so the connection cache, the reader and all direct callers keep their existing ClickhousePool typing. The reader needs no driver knowledge: it just wraps the selecting connection. - get_reader/get_deleter cache their reader again, since dynamic driver switching now lives in the pool. - The use_clickhouse_connect_driver helper moves to snuba.clickhouse.connect alongside the pools that use it. native.py remains byte-for-byte identical to master. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_011MC5N7AYMhJDBkJ4P7engC --- dump.rdb | Bin 0 -> 89 bytes snuba/clickhouse/connect.py | 99 ++++++++++++++++++++++++++++ snuba/clusters/cluster.py | 107 ++++++++++++++----------------- tests/clickhouse/test_connect.py | 74 ++++++++++++++++++++- 4 files changed, 221 insertions(+), 59 deletions(-) create mode 100644 dump.rdb diff --git a/dump.rdb b/dump.rdb new file mode 100644 index 0000000000000000000000000000000000000000..99858d1c55f1c93f2ae0f8cdf5f72ec9847a10a4 GIT binary patch literal 89 zcmWG?b@2=~FfcUu#aWb^l3A=5xUrsf_}ND^T9#gUkwrkj*loO*!a|EG4I#Sw9fP6GhI_al!0 literal 0 HcmV?d00001 diff --git a/snuba/clickhouse/connect.py b/snuba/clickhouse/connect.py index 644ae32c6db..4383542d2e0 100644 --- a/snuba/clickhouse/connect.py +++ b/snuba/clickhouse/connect.py @@ -37,6 +37,19 @@ clickhouse_connect_common.set_setting("invalid_setting_action", "drop") +def use_clickhouse_connect_driver() -> bool: + """ + Whether queries should be routed through the clickhouse-connect (HTTP) + driver instead of the native protocol. + + Controlled by a runtime config flag (defaulting to the + ``USE_CLICKHOUSE_CONNECT_DRIVER`` setting) so the migration can be rolled + out and rolled back without a deploy. + """ + default = 1 if settings.USE_CLICKHOUSE_CONNECT_DRIVER else 0 + return bool(state.get_int_config("use_clickhouse_connect_driver", default)) + + class ClickhouseConnectPool(ClickhousePool): """ HTTP based ClickHouse client backed by ``clickhouse-connect``. @@ -311,3 +324,89 @@ def close(self) -> None: if self.__client is not None: self.__client.close() self.__client = None + + +class DriverSelectingPool(ClickhousePool): + """ + A :class:`ClickhousePool` that forwards each query to either the native or + the clickhouse-connect (HTTP) pool, chosen at query time from the + ``use_clickhouse_connect_driver`` runtime config. + + Both underlying pools are created once (up front) and reused. Flipping the + runtime config never creates a pool on the fly; it only changes which of + the two already-created pools a given query is routed to. + """ + + def __init__( + self, + native_pool: ClickhousePool, + connect_pool: ClickhouseConnectPool, + ) -> None: + # Intentionally does not call ClickhousePool.__init__: this pool owns no + # connections of its own, it only forwards to the two real pools. + self.__native_pool = native_pool + self.__connect_pool = connect_pool + # Expose the attributes callers read directly off a pool (host/port/...) + # from the native pool. + self.host = native_pool.host + self.port = native_pool.port + self.user = native_pool.user + self.password = native_pool.password + self.database = native_pool.database + + def _selected_pool(self) -> ClickhousePool: + if use_clickhouse_connect_driver(): + return self.__connect_pool + return self.__native_pool + + def execute( + self, + query: str, + params: Params = None, + with_column_types: bool = False, + query_id: Optional[str] = None, + settings: Optional[Mapping[str, Any]] = None, + types_check: bool = False, + columnar: bool = False, + capture_trace: bool = False, + retryable: bool = True, + ) -> ClickhouseResult: + return self._selected_pool().execute( + query, + params=params, + with_column_types=with_column_types, + query_id=query_id, + settings=settings, + types_check=types_check, + columnar=columnar, + capture_trace=capture_trace, + retryable=retryable, + ) + + def execute_robust( + self, + query: str, + params: Params = None, + with_column_types: bool = False, + query_id: Optional[str] = None, + settings: Optional[Mapping[str, Any]] = None, + types_check: bool = False, + columnar: bool = False, + capture_trace: bool = False, + retryable: bool = True, + ) -> ClickhouseResult: + return self._selected_pool().execute_robust( + query, + params=params, + with_column_types=with_column_types, + query_id=query_id, + settings=settings, + types_check=types_check, + columnar=columnar, + capture_trace=capture_trace, + retryable=retryable, + ) + + def close(self) -> None: + self.__native_pool.close() + self.__connect_pool.close() diff --git a/snuba/clusters/cluster.py b/snuba/clusters/cluster.py index d4dca0c4d68..50bb1ac3dd0 100644 --- a/snuba/clusters/cluster.py +++ b/snuba/clusters/cluster.py @@ -18,7 +18,7 @@ import structlog -from snuba import settings, state +from snuba import settings from snuba.clickhouse.http import HTTPBatchWriter, InsertStatement, JSONRow from snuba.clickhouse.native import ClickhousePool, NativeDriverReader from snuba.clusters.storage_sets import ( @@ -176,24 +176,9 @@ def get_batch_writer( Optional[str], Optional[bool], Optional[int], - bool, ] -def use_clickhouse_connect_driver() -> bool: - """ - Whether new connections should use the clickhouse-connect (HTTP) driver - instead of the native protocol. This is the single place where the driver - is selected; the individual pools have no knowledge of one another. - - Controlled by a runtime config flag (defaulting to the - ``USE_CLICKHOUSE_CONNECT_DRIVER`` setting) so the migration can be rolled - out and rolled back without a deploy. - """ - default = 1 if settings.USE_CLICKHOUSE_CONNECT_DRIVER else 0 - return bool(state.get_int_config("use_clickhouse_connect_driver", default)) - - class ConnectionCache: def __init__(self) -> None: self.__cache: MutableMapping[CacheKey, ClickhousePool] = {} @@ -211,8 +196,6 @@ def get_node_connection( verify: Optional[bool], http_port: Optional[int] = None, ) -> ClickhousePool: - # The HTTP driver can only be used when the HTTP port is known. - use_connect = http_port is not None and use_clickhouse_connect_driver() with self.__lock: client_settings_dict, timeout = client_settings.value cache_key = ( @@ -225,16 +208,35 @@ def get_node_connection( ca_certs, verify, http_port, - use_connect, ) if cache_key not in self.__cache: - if use_connect: - assert http_port is not None - # Imported here so the native code path never imports the - # clickhouse-connect driver. - from snuba.clickhouse.connect import ClickhouseConnectPool + native_pool = ClickhousePool( + node.host_name, + node.port, + user, + password, + database, + client_settings=client_settings_dict, + send_receive_timeout=timeout, + secure=secure, + ca_certs=ca_certs, + verify=verify, + ) + if http_port is None: + # No HTTP port: the connect driver can't be used, so there + # is nothing to select between. + self.__cache[cache_key] = native_pool + else: + # Build both pools once and let the selector decide, at + # query time, which one to route to. Imported here so that + # constructing the cache does not import clickhouse-connect + # until a connection is actually requested. + from snuba.clickhouse.connect import ( + ClickhouseConnectPool, + DriverSelectingPool, + ) - self.__cache[cache_key] = ClickhouseConnectPool( + connect_pool = ClickhouseConnectPool( host=node.host_name, http_port=http_port, user=user, @@ -246,19 +248,7 @@ def get_node_connection( ca_certs=ca_certs, verify=verify, ) - else: - self.__cache[cache_key] = ClickhousePool( - node.host_name, - node.port, - user, - password, - database, - client_settings=client_settings_dict, - send_receive_timeout=timeout, - secure=secure, - ca_certs=ca_certs, - verify=verify, - ) + self.__cache[cache_key] = DriverSelectingPool(native_pool, connect_pool) return self.__cache[cache_key] @@ -323,6 +313,8 @@ def __init__( self.__single_node = single_node self.__cluster_name = cluster_name self.__distributed_cluster_name = distributed_cluster_name + self.__reader: Optional[Reader] = None + self.__deleter: Optional[Reader] = None self.__connection_cache = connection_cache self.__cache_partition_id = cache_partition_id self.__query_settings_prefix = query_settings_prefix @@ -367,32 +359,31 @@ def get_node_connection( self.__verify, # The HTTP port is uniform across the cluster, so it is safe to use # the cluster's configured http_port for every node. The connection - # cache uses it to build a clickhouse-connect (HTTP) pool when the - # driver is enabled at runtime; otherwise it is unused. + # cache uses it to build the clickhouse-connect (HTTP) pool that + # backs the driver-selecting connection. self.__http_port, ) def get_deleter(self) -> Reader: - # Not cached: the underlying connection (and therefore the driver) is - # resolved on each call so that the clickhouse-connect runtime toggle - # takes effect without a restart. The connection itself is still cached - # by the connection cache. - # we need the connection to the storage nodes, not - # the distributed nodes - local_node = self.get_local_nodes()[0] - return NativeDriverReader( - cache_partition_id=f"{self.__cache_partition_id}_deletes", - client=self.get_node_connection(ClickhouseClientSettings.DELETE, local_node), - query_settings_prefix=self.__query_settings_prefix, - ) + if not self.__deleter: + # we need the connection to the storage nodes, not + # the distributed nodes + local_node = self.get_local_nodes()[0] + self.__deleter = NativeDriverReader( + cache_partition_id=f"{self.__cache_partition_id}_deletes", + client=self.get_node_connection(ClickhouseClientSettings.DELETE, local_node), + query_settings_prefix=self.__query_settings_prefix, + ) + return self.__deleter def get_reader(self) -> Reader: - # Not cached: see get_deleter. - return NativeDriverReader( - cache_partition_id=self.__cache_partition_id, - client=self.get_query_connection(ClickhouseClientSettings.QUERY), - query_settings_prefix=self.__query_settings_prefix, - ) + if not self.__reader: + self.__reader = NativeDriverReader( + cache_partition_id=self.__cache_partition_id, + client=self.get_query_connection(ClickhouseClientSettings.QUERY), + query_settings_prefix=self.__query_settings_prefix, + ) + return self.__reader def get_batch_writer( self, diff --git a/tests/clickhouse/test_connect.py b/tests/clickhouse/test_connect.py index f55c3acda1d..939c45bdf0a 100644 --- a/tests/clickhouse/test_connect.py +++ b/tests/clickhouse/test_connect.py @@ -3,7 +3,10 @@ import pytest -from snuba.clickhouse.connect import ClickhouseConnectPool +from snuba.clickhouse.connect import ( + ClickhouseConnectPool, + DriverSelectingPool, +) from snuba.clickhouse.errors import ClickhouseError # Error code returned by ClickHouse when the maximum number of simultaneous @@ -216,3 +219,72 @@ def test_pool_size_runtime_override() -> None: _, kwargs = get_pool_manager.call_args assert kwargs["maxsize"] == 42 + + +def _make_selecting_pool() -> tuple[DriverSelectingPool, mock.Mock, mock.Mock]: + native_pool = mock.Mock() + native_pool.host = "host" + native_pool.port = 9000 + native_pool.user = "test" + native_pool.password = "test" + native_pool.database = "test" + connect_pool = mock.Mock() + selecting = DriverSelectingPool(native_pool, connect_pool) + return selecting, native_pool, connect_pool + + +@pytest.mark.redis_db +def test_selecting_pool_routes_to_native_by_default() -> None: + selecting, native_pool, connect_pool = _make_selecting_pool() + + selecting.execute("SELECT 1") + selecting.execute_robust("SELECT 2") + + assert native_pool.execute.call_count == 1 + assert native_pool.execute_robust.call_count == 1 + assert connect_pool.execute.call_count == 0 + assert connect_pool.execute_robust.call_count == 0 + + +@pytest.mark.redis_db +def test_selecting_pool_routes_to_connect_when_enabled() -> None: + from snuba import state + + state.set_config("use_clickhouse_connect_driver", 1) + + selecting, native_pool, connect_pool = _make_selecting_pool() + + selecting.execute("SELECT 1") + selecting.execute_robust("SELECT 2") + + assert connect_pool.execute.call_count == 1 + assert connect_pool.execute_robust.call_count == 1 + assert native_pool.execute.call_count == 0 + assert native_pool.execute_robust.call_count == 0 + + +@pytest.mark.redis_db +def test_selecting_pool_switches_at_runtime_without_recreating() -> None: + from snuba import state + + selecting, native_pool, connect_pool = _make_selecting_pool() + + selecting.execute("native") + state.set_config("use_clickhouse_connect_driver", 1) + selecting.execute("connect") + state.set_config("use_clickhouse_connect_driver", 0) + selecting.execute("native again") + + # The same two underlying pools are reused throughout; only the routing + # changes. + assert native_pool.execute.call_count == 2 + assert connect_pool.execute.call_count == 1 + + +def test_selecting_pool_close_closes_both() -> None: + selecting, native_pool, connect_pool = _make_selecting_pool() + + selecting.close() + + assert native_pool.close.call_count == 1 + assert connect_pool.close.call_count == 1 From 1e8affa1c8c4fe629a413522e8db4c9a478d5ee9 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 09:54:51 +0000 Subject: [PATCH 07/29] refactor(clickhouse): Always return a DriverSelectingPool from the cache The connection cache no longer splits between returning a bare ClickhousePool and a DriverSelectingPool. It always builds a DriverSelectingPool and lets that single object decide where each query goes. The selecting pool's connect pool is now optional: when the HTTP port is unknown the pool is built with connect_pool=None and every query is routed to the native pool. This keeps all routing logic in one place instead of splitting it between the cache and the pool. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_011MC5N7AYMhJDBkJ4P7engC --- snuba/clickhouse/connect.py | 11 ++++++++--- snuba/clusters/cluster.py | 34 +++++++++++++++++--------------- tests/clickhouse/test_connect.py | 23 +++++++++++++++++++++ 3 files changed, 49 insertions(+), 19 deletions(-) diff --git a/snuba/clickhouse/connect.py b/snuba/clickhouse/connect.py index 4383542d2e0..e0a08e15f13 100644 --- a/snuba/clickhouse/connect.py +++ b/snuba/clickhouse/connect.py @@ -335,12 +335,16 @@ class DriverSelectingPool(ClickhousePool): Both underlying pools are created once (up front) and reused. Flipping the runtime config never creates a pool on the fly; it only changes which of the two already-created pools a given query is routed to. + + ``connect_pool`` may be ``None`` when the HTTP port is not known for the + target. In that case every query is routed to the native pool, so the + cluster can always hand out a DriverSelectingPool without special casing. """ def __init__( self, native_pool: ClickhousePool, - connect_pool: ClickhouseConnectPool, + connect_pool: Optional[ClickhouseConnectPool], ) -> None: # Intentionally does not call ClickhousePool.__init__: this pool owns no # connections of its own, it only forwards to the two real pools. @@ -355,7 +359,7 @@ def __init__( self.database = native_pool.database def _selected_pool(self) -> ClickhousePool: - if use_clickhouse_connect_driver(): + if self.__connect_pool is not None and use_clickhouse_connect_driver(): return self.__connect_pool return self.__native_pool @@ -409,4 +413,5 @@ def execute_robust( def close(self) -> None: self.__native_pool.close() - self.__connect_pool.close() + if self.__connect_pool is not None: + self.__connect_pool.close() diff --git a/snuba/clusters/cluster.py b/snuba/clusters/cluster.py index 50bb1ac3dd0..7107f545552 100644 --- a/snuba/clusters/cluster.py +++ b/snuba/clusters/cluster.py @@ -210,6 +210,13 @@ def get_node_connection( http_port, ) if cache_key not in self.__cache: + # Imported here so that importing this module does not import + # clickhouse-connect until a connection is actually requested. + from snuba.clickhouse.connect import ( + ClickhouseConnectPool, + DriverSelectingPool, + ) + native_pool = ClickhousePool( node.host_name, node.port, @@ -222,21 +229,11 @@ def get_node_connection( ca_certs=ca_certs, verify=verify, ) - if http_port is None: - # No HTTP port: the connect driver can't be used, so there - # is nothing to select between. - self.__cache[cache_key] = native_pool - else: - # Build both pools once and let the selector decide, at - # query time, which one to route to. Imported here so that - # constructing the cache does not import clickhouse-connect - # until a connection is actually requested. - from snuba.clickhouse.connect import ( - ClickhouseConnectPool, - DriverSelectingPool, - ) - - connect_pool = ClickhouseConnectPool( + # The connect pool can only be built when the HTTP port is + # known; otherwise the selecting pool simply always routes to + # the native pool. + connect_pool = ( + ClickhouseConnectPool( host=node.host_name, http_port=http_port, user=user, @@ -248,7 +245,12 @@ def get_node_connection( ca_certs=ca_certs, verify=verify, ) - self.__cache[cache_key] = DriverSelectingPool(native_pool, connect_pool) + if http_port is not None + else None + ) + # Always hand out a DriverSelectingPool and let it decide, at + # query time, which driver each query goes to. + self.__cache[cache_key] = DriverSelectingPool(native_pool, connect_pool) return self.__cache[cache_key] diff --git a/tests/clickhouse/test_connect.py b/tests/clickhouse/test_connect.py index 939c45bdf0a..c80caf2ec23 100644 --- a/tests/clickhouse/test_connect.py +++ b/tests/clickhouse/test_connect.py @@ -288,3 +288,26 @@ def test_selecting_pool_close_closes_both() -> None: assert native_pool.close.call_count == 1 assert connect_pool.close.call_count == 1 + + +@pytest.mark.redis_db +def test_selecting_pool_without_connect_pool_always_native() -> None: + from snuba import state + + # No connect pool available (e.g. HTTP port unknown): queries must always + # go to native, even when the runtime flag is on. + state.set_config("use_clickhouse_connect_driver", 1) + + native_pool = mock.Mock() + native_pool.host = "host" + native_pool.port = 9000 + native_pool.user = "test" + native_pool.password = "test" + native_pool.database = "test" + selecting = DriverSelectingPool(native_pool, None) + + selecting.execute("SELECT 1") + selecting.close() # must not raise despite the missing connect pool + + assert native_pool.execute.call_count == 1 + assert native_pool.close.call_count == 1 From 2bf68557f2efc9985de0e9fefbacce99da7cb5c1 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 10:01:40 +0000 Subject: [PATCH 08/29] refactor(clickhouse): Always use the default ClickHouse HTTP port The clickhouse-connect driver no longer takes a configurable HTTP port; it always connects on the default ClickHouse HTTP port (8123). This removes the http_port plumbing from ConnectionCache.get_node_connection and the cluster, and lets the connect pool always be built, so DriverSelectingPool's connect pool is no longer optional. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_011MC5N7AYMhJDBkJ4P7engC --- snuba/clickhouse/connect.py | 20 ++++++++--------- snuba/clusters/cluster.py | 38 ++++++++++---------------------- tests/clickhouse/test_connect.py | 34 ++-------------------------- 3 files changed, 23 insertions(+), 69 deletions(-) diff --git a/snuba/clickhouse/connect.py b/snuba/clickhouse/connect.py index e0a08e15f13..11bd7bf33c9 100644 --- a/snuba/clickhouse/connect.py +++ b/snuba/clickhouse/connect.py @@ -30,6 +30,10 @@ # profile is clamped down to this value. MAX_SEND_RECEIVE_TIMEOUT_SECONDS = 30 +# The clickhouse-connect driver always talks to the default ClickHouse HTTP +# port. The HTTP port is intentionally not configurable here. +DEFAULT_CLICKHOUSE_HTTP_PORT = 8123 + # clickhouse-connect raises a ProgrammingError by default when it is asked to # send a setting it considers unknown or readonly. The native driver simply # forwards whatever settings it is given to the server, so to preserve parity @@ -69,7 +73,6 @@ class ClickhouseConnectPool(ClickhousePool): def __init__( self, host: str, - http_port: int, user: str, password: str, database: str, @@ -83,9 +86,9 @@ def __init__( ) -> None: # Intentionally does not call ClickhousePool.__init__: the native queue # of connections is not used here. ``port`` mirrors the base class - # attribute (it holds the HTTP port for this driver). + # attribute (it holds the default HTTP port for this driver). self.host = host - self.port = http_port + self.port = DEFAULT_CLICKHOUSE_HTTP_PORT self.user = user self.password = password self.database = database @@ -335,16 +338,12 @@ class DriverSelectingPool(ClickhousePool): Both underlying pools are created once (up front) and reused. Flipping the runtime config never creates a pool on the fly; it only changes which of the two already-created pools a given query is routed to. - - ``connect_pool`` may be ``None`` when the HTTP port is not known for the - target. In that case every query is routed to the native pool, so the - cluster can always hand out a DriverSelectingPool without special casing. """ def __init__( self, native_pool: ClickhousePool, - connect_pool: Optional[ClickhouseConnectPool], + connect_pool: ClickhouseConnectPool, ) -> None: # Intentionally does not call ClickhousePool.__init__: this pool owns no # connections of its own, it only forwards to the two real pools. @@ -359,7 +358,7 @@ def __init__( self.database = native_pool.database def _selected_pool(self) -> ClickhousePool: - if self.__connect_pool is not None and use_clickhouse_connect_driver(): + if use_clickhouse_connect_driver(): return self.__connect_pool return self.__native_pool @@ -413,5 +412,4 @@ def execute_robust( def close(self) -> None: self.__native_pool.close() - if self.__connect_pool is not None: - self.__connect_pool.close() + self.__connect_pool.close() diff --git a/snuba/clusters/cluster.py b/snuba/clusters/cluster.py index 7107f545552..7beb1a8f7ca 100644 --- a/snuba/clusters/cluster.py +++ b/snuba/clusters/cluster.py @@ -175,7 +175,6 @@ def get_batch_writer( bool, Optional[str], Optional[bool], - Optional[int], ] @@ -194,7 +193,6 @@ def get_node_connection( secure: bool, ca_certs: Optional[str], verify: Optional[bool], - http_port: Optional[int] = None, ) -> ClickhousePool: with self.__lock: client_settings_dict, timeout = client_settings.value @@ -207,7 +205,6 @@ def get_node_connection( secure, ca_certs, verify, - http_port, ) if cache_key not in self.__cache: # Imported here so that importing this module does not import @@ -229,24 +226,18 @@ def get_node_connection( ca_certs=ca_certs, verify=verify, ) - # The connect pool can only be built when the HTTP port is - # known; otherwise the selecting pool simply always routes to - # the native pool. - connect_pool = ( - ClickhouseConnectPool( - host=node.host_name, - http_port=http_port, - user=user, - password=password, - database=database, - client_settings=client_settings_dict, - send_receive_timeout=timeout, - secure=secure, - ca_certs=ca_certs, - verify=verify, - ) - if http_port is not None - else None + # The connect pool always talks to the default ClickHouse HTTP + # port (it is not configurable). + connect_pool = ClickhouseConnectPool( + host=node.host_name, + user=user, + password=password, + database=database, + client_settings=client_settings_dict, + send_receive_timeout=timeout, + secure=secure, + ca_certs=ca_certs, + verify=verify, ) # Always hand out a DriverSelectingPool and let it decide, at # query time, which driver each query goes to. @@ -359,11 +350,6 @@ def get_node_connection( self.__secure, self.__ca_certs, self.__verify, - # The HTTP port is uniform across the cluster, so it is safe to use - # the cluster's configured http_port for every node. The connection - # cache uses it to build the clickhouse-connect (HTTP) pool that - # backs the driver-selecting connection. - self.__http_port, ) def get_deleter(self) -> Reader: diff --git a/tests/clickhouse/test_connect.py b/tests/clickhouse/test_connect.py index c80caf2ec23..ef43aa54173 100644 --- a/tests/clickhouse/test_connect.py +++ b/tests/clickhouse/test_connect.py @@ -36,7 +36,6 @@ def __init__( def _make_pool(client: mock.Mock) -> ClickhouseConnectPool: pool = ClickhouseConnectPool( host="host", - http_port=8123, user="test", password="test", database="test", @@ -137,7 +136,6 @@ def test_send_receive_timeout_capped_at_30s() -> None: pool = ClickhouseConnectPool( host="host", - http_port=8123, user="test", password="test", database="test", @@ -163,7 +161,6 @@ def test_send_receive_timeout_default_when_unset() -> None: pool = ClickhouseConnectPool( host="host", - http_port=8123, user="test", password="test", database="test", @@ -185,9 +182,7 @@ def test_pool_size_defaults_to_setting() -> None: from snuba import settings - pool = ClickhouseConnectPool( - host="host", http_port=8123, user="test", password="test", database="test" - ) + pool = ClickhouseConnectPool(host="host", user="test", password="test", database="test") with ( mock.patch.object(clickhouse_connect, "get_client"), @@ -207,9 +202,7 @@ def test_pool_size_runtime_override() -> None: state.set_config("clickhouse_connect_pool_size", 42) - pool = ClickhouseConnectPool( - host="host", http_port=8123, user="test", password="test", database="test" - ) + pool = ClickhouseConnectPool(host="host", user="test", password="test", database="test") with ( mock.patch.object(clickhouse_connect, "get_client"), @@ -288,26 +281,3 @@ def test_selecting_pool_close_closes_both() -> None: assert native_pool.close.call_count == 1 assert connect_pool.close.call_count == 1 - - -@pytest.mark.redis_db -def test_selecting_pool_without_connect_pool_always_native() -> None: - from snuba import state - - # No connect pool available (e.g. HTTP port unknown): queries must always - # go to native, even when the runtime flag is on. - state.set_config("use_clickhouse_connect_driver", 1) - - native_pool = mock.Mock() - native_pool.host = "host" - native_pool.port = 9000 - native_pool.user = "test" - native_pool.password = "test" - native_pool.database = "test" - selecting = DriverSelectingPool(native_pool, None) - - selecting.execute("SELECT 1") - selecting.close() # must not raise despite the missing connect pool - - assert native_pool.execute.call_count == 1 - assert native_pool.close.call_count == 1 From 0500c2f5f2c55c173d6a0176b95e703115f3d874 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 10:15:14 +0000 Subject: [PATCH 09/29] refactor(clickhouse): Select reader (native vs HTTP) in get_reader Replace the DriverSelectingPool with a dedicated HTTPDriverReader that sits alongside NativeDriverReader, and move the driver choice up to ClickhouseCluster.get_reader. - HTTPDriverReader (connect.py) subclasses NativeDriverReader and wraps a ClickhouseConnectPool. Result handling is identical (both drivers return a ClickhouseResult); it exists as its own type so the driver is explicit. - get_reader picks NativeDriverReader or HTTPDriverReader based on the use_clickhouse_connect_driver runtime config; both readers are built at most once and cached, so flipping the config only changes which is returned. - The HTTP driver is now scoped to the read path only. Direct callers (migrations, replacer, optimize, ...) keep using the native pool returned by get_node_connection. - ConnectionCache caches native and HTTP pools side by side (driver in the cache key) via get_node_connection / get_http_node_connection. - use_clickhouse_connect_driver moves to cluster.py so the native path never imports clickhouse-connect just to read the flag. native.py remains byte-for-byte identical to master. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_011MC5N7AYMhJDBkJ4P7engC --- dump.rdb | Bin 89 -> 89 bytes snuba/clickhouse/connect.py | 102 +++-------------------------- snuba/clusters/cluster.py | 108 ++++++++++++++++++++++++++----- tests/clickhouse/test_connect.py | 80 +++-------------------- tests/clusters/test_cluster.py | 37 +++++++++++ 5 files changed, 147 insertions(+), 180 deletions(-) diff --git a/dump.rdb b/dump.rdb index 99858d1c55f1c93f2ae0f8cdf5f72ec9847a10a4..76da9e1f351f10aa5866cd7e2a936c2f8081592f 100644 GIT binary patch delta 45 zcma!yoM5Q&>4 bool: - """ - Whether queries should be routed through the clickhouse-connect (HTTP) - driver instead of the native protocol. - - Controlled by a runtime config flag (defaulting to the - ``USE_CLICKHOUSE_CONNECT_DRIVER`` setting) so the migration can be rolled - out and rolled back without a deploy. - """ - default = 1 if settings.USE_CLICKHOUSE_CONNECT_DRIVER else 0 - return bool(state.get_int_config("use_clickhouse_connect_driver", default)) - - class ClickhouseConnectPool(ClickhousePool): """ HTTP based ClickHouse client backed by ``clickhouse-connect``. @@ -329,87 +317,13 @@ def close(self) -> None: self.__client = None -class DriverSelectingPool(ClickhousePool): +class HTTPDriverReader(NativeDriverReader): """ - A :class:`ClickhousePool` that forwards each query to either the native or - the clickhouse-connect (HTTP) pool, chosen at query time from the - ``use_clickhouse_connect_driver`` runtime config. + Reader that executes queries over HTTP via :class:`ClickhouseConnectPool`. - Both underlying pools are created once (up front) and reused. Flipping the - runtime config never creates a pool on the fly; it only changes which of - the two already-created pools a given query is routed to. + Both drivers return the same :class:`ClickhouseResult`, so the result + handling is identical to :class:`NativeDriverReader`. This class exists as + its own type so the driver in use is explicit and so the cluster can pick + between the two readers based on the ``use_clickhouse_connect_driver`` + runtime config. """ - - def __init__( - self, - native_pool: ClickhousePool, - connect_pool: ClickhouseConnectPool, - ) -> None: - # Intentionally does not call ClickhousePool.__init__: this pool owns no - # connections of its own, it only forwards to the two real pools. - self.__native_pool = native_pool - self.__connect_pool = connect_pool - # Expose the attributes callers read directly off a pool (host/port/...) - # from the native pool. - self.host = native_pool.host - self.port = native_pool.port - self.user = native_pool.user - self.password = native_pool.password - self.database = native_pool.database - - def _selected_pool(self) -> ClickhousePool: - if use_clickhouse_connect_driver(): - return self.__connect_pool - return self.__native_pool - - def execute( - self, - query: str, - params: Params = None, - with_column_types: bool = False, - query_id: Optional[str] = None, - settings: Optional[Mapping[str, Any]] = None, - types_check: bool = False, - columnar: bool = False, - capture_trace: bool = False, - retryable: bool = True, - ) -> ClickhouseResult: - return self._selected_pool().execute( - query, - params=params, - with_column_types=with_column_types, - query_id=query_id, - settings=settings, - types_check=types_check, - columnar=columnar, - capture_trace=capture_trace, - retryable=retryable, - ) - - def execute_robust( - self, - query: str, - params: Params = None, - with_column_types: bool = False, - query_id: Optional[str] = None, - settings: Optional[Mapping[str, Any]] = None, - types_check: bool = False, - columnar: bool = False, - capture_trace: bool = False, - retryable: bool = True, - ) -> ClickhouseResult: - return self._selected_pool().execute_robust( - query, - params=params, - with_column_types=with_column_types, - query_id=query_id, - settings=settings, - types_check=types_check, - columnar=columnar, - capture_trace=capture_trace, - retryable=retryable, - ) - - def close(self) -> None: - self.__native_pool.close() - self.__connect_pool.close() diff --git a/snuba/clusters/cluster.py b/snuba/clusters/cluster.py index 7beb1a8f7ca..2a1d463e62c 100644 --- a/snuba/clusters/cluster.py +++ b/snuba/clusters/cluster.py @@ -18,7 +18,7 @@ import structlog -from snuba import settings +from snuba import settings, state from snuba.clickhouse.http import HTTPBatchWriter, InsertStatement, JSONRow from snuba.clickhouse.native import ClickhousePool, NativeDriverReader from snuba.clusters.storage_sets import ( @@ -166,6 +166,21 @@ def get_batch_writer( ClickhouseWriterOptions = Optional[Mapping[str, Any]] +def use_clickhouse_connect_driver() -> bool: + """ + Whether the read path should use the clickhouse-connect (HTTP) driver + instead of the native protocol. + + Controlled by a runtime config flag (defaulting to the + ``USE_CLICKHOUSE_CONNECT_DRIVER`` setting) so the migration can be rolled + out and rolled back without a deploy. + """ + default = 1 if settings.USE_CLICKHOUSE_CONNECT_DRIVER else 0 + return bool(state.get_int_config("use_clickhouse_connect_driver", default)) + + +# The driver discriminator is part of the cache key so that the native and the +# HTTP pool for the same node can be cached side by side. CacheKey = Tuple[ ClickhouseNode, ClickhouseClientSettings, @@ -175,6 +190,7 @@ def get_batch_writer( bool, Optional[str], Optional[bool], + str, ] @@ -194,6 +210,7 @@ def get_node_connection( ca_certs: Optional[str], verify: Optional[bool], ) -> ClickhousePool: + """Return a cached native (clickhouse-driver) connection pool.""" with self.__lock: client_settings_dict, timeout = client_settings.value cache_key = ( @@ -205,16 +222,10 @@ def get_node_connection( secure, ca_certs, verify, + "native", ) if cache_key not in self.__cache: - # Imported here so that importing this module does not import - # clickhouse-connect until a connection is actually requested. - from snuba.clickhouse.connect import ( - ClickhouseConnectPool, - DriverSelectingPool, - ) - - native_pool = ClickhousePool( + self.__cache[cache_key] = ClickhousePool( node.host_name, node.port, user, @@ -226,9 +237,40 @@ def get_node_connection( ca_certs=ca_certs, verify=verify, ) - # The connect pool always talks to the default ClickHouse HTTP - # port (it is not configurable). - connect_pool = ClickhouseConnectPool( + + return self.__cache[cache_key] + + def get_http_node_connection( + self, + client_settings: ClickhouseClientSettings, + node: ClickhouseNode, + user: str, + password: str, + database: str, + secure: bool, + ca_certs: Optional[str], + verify: Optional[bool], + ) -> ClickhousePool: + """Return a cached clickhouse-connect (HTTP) connection pool.""" + # Imported here so that importing this module does not import + # clickhouse-connect until an HTTP connection is actually requested. + from snuba.clickhouse.connect import ClickhouseConnectPool + + with self.__lock: + client_settings_dict, timeout = client_settings.value + cache_key = ( + node, + client_settings, + user, + password, + database, + secure, + ca_certs, + verify, + "http", + ) + if cache_key not in self.__cache: + self.__cache[cache_key] = ClickhouseConnectPool( host=node.host_name, user=user, password=password, @@ -239,9 +281,6 @@ def get_node_connection( ca_certs=ca_certs, verify=verify, ) - # Always hand out a DriverSelectingPool and let it decide, at - # query time, which driver each query goes to. - self.__cache[cache_key] = DriverSelectingPool(native_pool, connect_pool) return self.__cache[cache_key] @@ -307,6 +346,7 @@ def __init__( self.__cluster_name = cluster_name self.__distributed_cluster_name = distributed_cluster_name self.__reader: Optional[Reader] = None + self.__http_reader: Optional[Reader] = None self.__deleter: Optional[Reader] = None self.__connection_cache = connection_cache self.__cache_partition_id = cache_partition_id @@ -352,6 +392,24 @@ def get_node_connection( self.__verify, ) + def __get_http_query_connection( + self, + client_settings: ClickhouseClientSettings, + ) -> ClickhousePool: + """ + Get a clickhouse-connect (HTTP) connection to the query node. + """ + return self.__connection_cache.get_http_node_connection( + client_settings, + self.__query_node, + self.__user, + self.__password, + self.__database, + self.__secure, + self.__ca_certs, + self.__verify, + ) + def get_deleter(self) -> Reader: if not self.__deleter: # we need the connection to the storage nodes, not @@ -365,6 +423,26 @@ def get_deleter(self) -> Reader: return self.__deleter def get_reader(self) -> Reader: + """ + Return a reader for the query node, choosing the HTTP + (clickhouse-connect) or the native driver based on the + ``use_clickhouse_connect_driver`` runtime config. Both readers are + built at most once and reused; flipping the config only changes which + of the two is returned. + """ + if use_clickhouse_connect_driver(): + if not self.__http_reader: + # Imported here so the native code path never imports + # clickhouse-connect. + from snuba.clickhouse.connect import HTTPDriverReader + + self.__http_reader = HTTPDriverReader( + cache_partition_id=self.__cache_partition_id, + client=self.__get_http_query_connection(ClickhouseClientSettings.QUERY), + query_settings_prefix=self.__query_settings_prefix, + ) + return self.__http_reader + if not self.__reader: self.__reader = NativeDriverReader( cache_partition_id=self.__cache_partition_id, diff --git a/tests/clickhouse/test_connect.py b/tests/clickhouse/test_connect.py index ef43aa54173..c999810ca1b 100644 --- a/tests/clickhouse/test_connect.py +++ b/tests/clickhouse/test_connect.py @@ -3,11 +3,9 @@ import pytest -from snuba.clickhouse.connect import ( - ClickhouseConnectPool, - DriverSelectingPool, -) +from snuba.clickhouse.connect import ClickhouseConnectPool, HTTPDriverReader from snuba.clickhouse.errors import ClickhouseError +from snuba.clickhouse.native import NativeDriverReader # Error code returned by ClickHouse when the maximum number of simultaneous # queries has been exceeded. @@ -214,70 +212,10 @@ def test_pool_size_runtime_override() -> None: assert kwargs["maxsize"] == 42 -def _make_selecting_pool() -> tuple[DriverSelectingPool, mock.Mock, mock.Mock]: - native_pool = mock.Mock() - native_pool.host = "host" - native_pool.port = 9000 - native_pool.user = "test" - native_pool.password = "test" - native_pool.database = "test" - connect_pool = mock.Mock() - selecting = DriverSelectingPool(native_pool, connect_pool) - return selecting, native_pool, connect_pool - - -@pytest.mark.redis_db -def test_selecting_pool_routes_to_native_by_default() -> None: - selecting, native_pool, connect_pool = _make_selecting_pool() - - selecting.execute("SELECT 1") - selecting.execute_robust("SELECT 2") - - assert native_pool.execute.call_count == 1 - assert native_pool.execute_robust.call_count == 1 - assert connect_pool.execute.call_count == 0 - assert connect_pool.execute_robust.call_count == 0 - - -@pytest.mark.redis_db -def test_selecting_pool_routes_to_connect_when_enabled() -> None: - from snuba import state - - state.set_config("use_clickhouse_connect_driver", 1) - - selecting, native_pool, connect_pool = _make_selecting_pool() - - selecting.execute("SELECT 1") - selecting.execute_robust("SELECT 2") - - assert connect_pool.execute.call_count == 1 - assert connect_pool.execute_robust.call_count == 1 - assert native_pool.execute.call_count == 0 - assert native_pool.execute_robust.call_count == 0 - - -@pytest.mark.redis_db -def test_selecting_pool_switches_at_runtime_without_recreating() -> None: - from snuba import state - - selecting, native_pool, connect_pool = _make_selecting_pool() - - selecting.execute("native") - state.set_config("use_clickhouse_connect_driver", 1) - selecting.execute("connect") - state.set_config("use_clickhouse_connect_driver", 0) - selecting.execute("native again") - - # The same two underlying pools are reused throughout; only the routing - # changes. - assert native_pool.execute.call_count == 2 - assert connect_pool.execute.call_count == 1 - - -def test_selecting_pool_close_closes_both() -> None: - selecting, native_pool, connect_pool = _make_selecting_pool() - - selecting.close() - - assert native_pool.close.call_count == 1 - assert connect_pool.close.call_count == 1 +def test_http_driver_reader_is_a_native_driver_reader() -> None: + # HTTPDriverReader reuses the native reader's result handling; it just + # exists as its own type so the driver in use is explicit. + pool = _make_pool(mock.Mock()) + reader = HTTPDriverReader(cache_partition_id=None, client=pool, query_settings_prefix=None) + assert isinstance(reader, NativeDriverReader) + assert isinstance(reader, HTTPDriverReader) diff --git a/tests/clusters/test_cluster.py b/tests/clusters/test_cluster.py index e8ef94dbdb6..861fd275437 100644 --- a/tests/clusters/test_cluster.py +++ b/tests/clusters/test_cluster.py @@ -260,6 +260,43 @@ def test_cache_connections() -> None: ) != cluster_3.get_query_connection(cluster.ClickhouseClientSettings.QUERY) +@pytest.mark.redis_db +@pytest.mark.clickhouse_db +def test_get_reader_selects_driver() -> None: + from snuba import state + from snuba.clickhouse.connect import HTTPDriverReader + from snuba.clickhouse.native import NativeDriverReader + + test_cluster = cluster.ClickhouseCluster( + "127.0.0.1", + 8000, + "default", + "", + "default", + 8001, + False, + None, + False, + {"events"}, + True, + ) + + # Default: native driver. + state.set_config("use_clickhouse_connect_driver", 0) + native_reader = test_cluster.get_reader() + assert isinstance(native_reader, NativeDriverReader) + assert not isinstance(native_reader, HTTPDriverReader) + + # Flip on at runtime: HTTP driver. + state.set_config("use_clickhouse_connect_driver", 1) + http_reader = test_cluster.get_reader() + assert isinstance(http_reader, HTTPDriverReader) + + # Flip back: the same cached native reader is returned (built at most once). + state.set_config("use_clickhouse_connect_driver", 0) + assert test_cluster.get_reader() is native_reader + + @patch("snuba.settings.SLICED_CLUSTERS", SLICED_CLUSTERS_CONFIG) @pytest.mark.clickhouse_db def test_sliced_cluster() -> None: From e124ee5b964266e40a2803f6ecbffb0c440f6070 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 10:16:23 +0000 Subject: [PATCH 10/29] style(tests): Fix pre-existing E712 comparisons in test_cluster Replace `== True` with `is True` for the is_single_node assertions flagged by ruff (E712). Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_011MC5N7AYMhJDBkJ4P7engC --- tests/clusters/test_cluster.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/clusters/test_cluster.py b/tests/clusters/test_cluster.py index 861fd275437..ba0b3bcda3c 100644 --- a/tests/clusters/test_cluster.py +++ b/tests/clusters/test_cluster.py @@ -304,14 +304,14 @@ def test_sliced_cluster() -> None: res_cluster = cluster.get_cluster(StorageSetKey.GENERIC_METRICS_DISTRIBUTIONS, 1) - assert res_cluster.is_single_node() == True + assert res_cluster.is_single_node() is True assert res_cluster.get_database() == "slice_1_default" assert res_cluster.get_host() == "host_slice" assert res_cluster.get_port() == 9001 res_cluster_default = cluster.get_cluster(StorageSetKey.GENERIC_METRICS_DISTRIBUTIONS, 0) - assert res_cluster_default.is_single_node() == True + assert res_cluster_default.is_single_node() is True assert res_cluster_default.get_database() == "default" assert res_cluster_default.get_host() == "host_slice" assert res_cluster_default.get_port() == 9000 From 5bf8da9df311ebf2cd6bd28b8cbdb5db0fd8c961 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 10:25:59 +0000 Subject: [PATCH 11/29] refactor(clickhouse): Apply driver selection to all connections, not just reads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the driver choice into ClickhouseCluster.get_node_connection so every caller picks up the runtime config, not only the read path. When use_clickhouse_connect_driver is enabled, get_node_connection (and therefore get_query_connection, migrations, replacer, optimize, ...) returns the clickhouse-connect HTTP pool; otherwise the native pool. get_reader/get_deleter now wrap whatever pool get_node_connection returns in the matching reader type (HTTPDriverReader or NativeDriverReader) via a small __build_reader helper. Readers are no longer cached — they are cheap to build and the config is read on each call, so the driver switches at runtime. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_011MC5N7AYMhJDBkJ4P7engC --- snuba/clusters/cluster.py | 103 ++++++++++++++++----------------- tests/clusters/test_cluster.py | 4 +- 2 files changed, 53 insertions(+), 54 deletions(-) diff --git a/snuba/clusters/cluster.py b/snuba/clusters/cluster.py index 2a1d463e62c..d468f2834e6 100644 --- a/snuba/clusters/cluster.py +++ b/snuba/clusters/cluster.py @@ -345,9 +345,6 @@ def __init__( self.__single_node = single_node self.__cluster_name = cluster_name self.__distributed_cluster_name = distributed_cluster_name - self.__reader: Optional[Reader] = None - self.__http_reader: Optional[Reader] = None - self.__deleter: Optional[Reader] = None self.__connection_cache = connection_cache self.__cache_partition_id = cache_partition_id self.__query_settings_prefix = query_settings_prefix @@ -379,7 +376,24 @@ def get_node_connection( Get a Clickhouse connection using the client settings provided. Reuse any connection to the same node with the same settings otherwise establish a new connection. + + This is the single place where the driver is selected: when the + ``use_clickhouse_connect_driver`` runtime config is enabled the + clickhouse-connect (HTTP) pool is returned, otherwise the native pool. + The choice applies to every caller (reads, migrations, replacer, + optimize, ...), not just the read path. """ + if use_clickhouse_connect_driver(): + return self.__connection_cache.get_http_node_connection( + client_settings, + node, + self.__user, + self.__password, + self.__database, + self.__secure, + self.__ca_certs, + self.__verify, + ) return self.__connection_cache.get_node_connection( client_settings, @@ -392,64 +406,49 @@ def get_node_connection( self.__verify, ) - def __get_http_query_connection( - self, - client_settings: ClickhouseClientSettings, - ) -> ClickhousePool: + def __build_reader(self, cache_partition_id: Optional[str], client: ClickhousePool) -> Reader: """ - Get a clickhouse-connect (HTTP) connection to the query node. + Wrap a connection in the reader matching the driver currently selected + by the runtime config. Both reader classes adapt the same + ClickhouseResult, so they differ only in which pool they wrap. """ - return self.__connection_cache.get_http_node_connection( - client_settings, - self.__query_node, - self.__user, - self.__password, - self.__database, - self.__secure, - self.__ca_certs, - self.__verify, - ) + if use_clickhouse_connect_driver(): + # Imported here so the native code path never imports + # clickhouse-connect. + from snuba.clickhouse.connect import HTTPDriverReader - def get_deleter(self) -> Reader: - if not self.__deleter: - # we need the connection to the storage nodes, not - # the distributed nodes - local_node = self.get_local_nodes()[0] - self.__deleter = NativeDriverReader( - cache_partition_id=f"{self.__cache_partition_id}_deletes", - client=self.get_node_connection(ClickhouseClientSettings.DELETE, local_node), + return HTTPDriverReader( + cache_partition_id=cache_partition_id, + client=client, query_settings_prefix=self.__query_settings_prefix, ) - return self.__deleter + + return NativeDriverReader( + cache_partition_id=cache_partition_id, + client=client, + query_settings_prefix=self.__query_settings_prefix, + ) + + def get_deleter(self) -> Reader: + # we need the connection to the storage nodes, not + # the distributed nodes + local_node = self.get_local_nodes()[0] + return self.__build_reader( + cache_partition_id=f"{self.__cache_partition_id}_deletes", + client=self.get_node_connection(ClickhouseClientSettings.DELETE, local_node), + ) def get_reader(self) -> Reader: """ - Return a reader for the query node, choosing the HTTP - (clickhouse-connect) or the native driver based on the - ``use_clickhouse_connect_driver`` runtime config. Both readers are - built at most once and reused; flipping the config only changes which - of the two is returned. + Return a reader for the query node, using the HTTP (clickhouse-connect) + or the native driver based on the ``use_clickhouse_connect_driver`` + runtime config. The config is read on each call, so the driver can be + switched at runtime. """ - if use_clickhouse_connect_driver(): - if not self.__http_reader: - # Imported here so the native code path never imports - # clickhouse-connect. - from snuba.clickhouse.connect import HTTPDriverReader - - self.__http_reader = HTTPDriverReader( - cache_partition_id=self.__cache_partition_id, - client=self.__get_http_query_connection(ClickhouseClientSettings.QUERY), - query_settings_prefix=self.__query_settings_prefix, - ) - return self.__http_reader - - if not self.__reader: - self.__reader = NativeDriverReader( - cache_partition_id=self.__cache_partition_id, - client=self.get_query_connection(ClickhouseClientSettings.QUERY), - query_settings_prefix=self.__query_settings_prefix, - ) - return self.__reader + return self.__build_reader( + cache_partition_id=self.__cache_partition_id, + client=self.get_query_connection(ClickhouseClientSettings.QUERY), + ) def get_batch_writer( self, diff --git a/tests/clusters/test_cluster.py b/tests/clusters/test_cluster.py index ba0b3bcda3c..030e090bc34 100644 --- a/tests/clusters/test_cluster.py +++ b/tests/clusters/test_cluster.py @@ -292,9 +292,9 @@ def test_get_reader_selects_driver() -> None: http_reader = test_cluster.get_reader() assert isinstance(http_reader, HTTPDriverReader) - # Flip back: the same cached native reader is returned (built at most once). + # Flip back: native driver again. state.set_config("use_clickhouse_connect_driver", 0) - assert test_cluster.get_reader() is native_reader + assert not isinstance(test_cluster.get_reader(), HTTPDriverReader) @patch("snuba.settings.SLICED_CLUSTERS", SLICED_CLUSTERS_CONFIG) From a87be3186354e355ca3a79d6be721bbff7ccbfe5 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 10:34:44 +0000 Subject: [PATCH 12/29] fix(clickhouse): Align connect timeouts with native; 30s read timeout Stop capping the clickhouse-connect timeout at 30s for every connection. The connect pool now honors the per-profile send_receive_timeout exactly like the native driver (falling back to a 300s default when a profile sets none). Express the read-query timeout policy in the shared QUERY client settings profile: it now uses a 30s timeout, which applies to both the native and the HTTP driver. Migrations (MIGRATE), OPTIMIZE and other long-running operations keep their existing default/longer timeouts. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_011MC5N7AYMhJDBkJ4P7engC --- snuba/clickhouse/connect.py | 24 ++++++++++++------------ snuba/clusters/cluster.py | 4 +++- tests/clickhouse/test_connect.py | 21 ++++++++++++++------- 3 files changed, 29 insertions(+), 20 deletions(-) diff --git a/snuba/clickhouse/connect.py b/snuba/clickhouse/connect.py index 0d2ed4632c9..eab566e6c8a 100644 --- a/snuba/clickhouse/connect.py +++ b/snuba/clickhouse/connect.py @@ -26,10 +26,11 @@ metrics = MetricsWrapper(environment.metrics, "clickhouse.connect") -# The native pool caps its send/receive timeout at 35s; for the HTTP path we -# cap it at 30s. Any larger (or unset) timeout coming from a client settings -# profile is clamped down to this value. -MAX_SEND_RECEIVE_TIMEOUT_SECONDS = 30 +# Fallback send/receive timeout (seconds) used when a client settings profile +# does not specify one. Matches clickhouse-connect's own default. Per-profile +# timeouts (e.g. 30s for reads, longer for migrations) are honored as-is, the +# same way the native driver uses them. +DEFAULT_SEND_RECEIVE_TIMEOUT_SECONDS = 300 # The clickhouse-connect driver always talks to the default ClickHouse HTTP # port. The HTTP port is intentionally not configurable here. @@ -122,16 +123,15 @@ def _get_client(self) -> Client: secure=self.secure, verify=bool(self.verify), ca_cert=self.ca_certs, - connect_timeout=min(self.connect_timeout, MAX_SEND_RECEIVE_TIMEOUT_SECONDS), - # Cap the read timeout at 30s regardless of what the - # client settings profile asks for (some profiles, e.g. - # MIGRATE, use very large timeouts that are not - # appropriate for the HTTP query path). - send_receive_timeout=min( + connect_timeout=self.connect_timeout, + # Honor the per-profile timeout as-is, like the native + # driver does (reads get 30s, migrations/DDL keep their + # longer timeouts). Fall back to the default when a + # profile does not set one. + send_receive_timeout=( self.send_receive_timeout if self.send_receive_timeout is not None - else MAX_SEND_RECEIVE_TIMEOUT_SECONDS, - MAX_SEND_RECEIVE_TIMEOUT_SECONDS, + else DEFAULT_SEND_RECEIVE_TIMEOUT_SECONDS ), settings=dict(self.client_settings), pool_mgr=pool_mgr, diff --git a/snuba/clusters/cluster.py b/snuba/clusters/cluster.py index d468f2834e6..e7e84795450 100644 --- a/snuba/clusters/cluster.py +++ b/snuba/clusters/cluster.py @@ -65,7 +65,9 @@ class ClickhouseClientSettings(Enum): ) DELETE = ClickhouseClientSettingsType({"mutations_sync": 1}, None) OPTIMIZE = ClickhouseClientSettingsType({}, settings.OPTIMIZE_QUERY_TIMEOUT) - QUERY = ClickhouseClientSettingsType({}, None) + # Read queries get a 30s timeout. Migrations, DDL and other long-running + # operations keep their own (default or longer) timeouts above/below. + QUERY = ClickhouseClientSettingsType({}, 30) QUERYLOG = ClickhouseClientSettingsType({}, None) TRACING = ClickhouseClientSettingsType({"readonly": 2}, None) REPLACE = ClickhouseClientSettingsType( diff --git a/tests/clickhouse/test_connect.py b/tests/clickhouse/test_connect.py index c999810ca1b..ed5c592f657 100644 --- a/tests/clickhouse/test_connect.py +++ b/tests/clickhouse/test_connect.py @@ -129,18 +129,18 @@ def test_generic_clickhouse_error_wrapped() -> None: assert client.query.call_count == 1 -def test_send_receive_timeout_capped_at_30s() -> None: +def test_timeouts_are_passed_through() -> None: import clickhouse_connect + # The per-profile timeout is honored as-is (no capping), the same way the + # native driver uses it. A large timeout (e.g. from MIGRATE) is preserved. pool = ClickhouseConnectPool( host="host", user="test", password="test", database="test", connect_timeout=60, - # A large timeout (e.g. coming from the MIGRATE settings profile) must - # be clamped down to 30s on the HTTP path. - send_receive_timeout=300, + send_receive_timeout=300000, ) with ( @@ -150,8 +150,8 @@ def test_send_receive_timeout_capped_at_30s() -> None: pool._get_client() _, kwargs = get_client.call_args - assert kwargs["send_receive_timeout"] == 30 - assert kwargs["connect_timeout"] == 30 + assert kwargs["send_receive_timeout"] == 300000 + assert kwargs["connect_timeout"] == 60 def test_send_receive_timeout_default_when_unset() -> None: @@ -172,7 +172,14 @@ def test_send_receive_timeout_default_when_unset() -> None: pool._get_client() _, kwargs = get_client.call_args - assert kwargs["send_receive_timeout"] == 30 + assert kwargs["send_receive_timeout"] == 300 + + +def test_read_query_client_settings_use_30s_timeout() -> None: + # Read queries (the QUERY profile) get a 30s timeout on both drivers. + from snuba.clusters.cluster import ClickhouseClientSettings + + assert ClickhouseClientSettings.QUERY.value.timeout == 30 def test_pool_size_defaults_to_setting() -> None: From d8cd5e4170c04d0ccba8d1b2ebb3b2479c1784db Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 11:15:31 +0000 Subject: [PATCH 13/29] refactor(clickhouse): Abstract base classes for pools and readers Introduce a proper class hierarchy so the native and HTTP drivers are siblings rather than one extending the other: - ClickhousePool is now an abstract base (ABC) declaring the execute/execute_robust/close interface. The native implementation moves to a new ClickhouseNativePool(ClickhousePool); ClickhouseConnectPool also derives from the abstract base. The name ClickhousePool is kept as the abstract type so the ~50 type annotations referencing it are unchanged; only construction sites switch to ClickhouseNativePool. - Readers: a shared ClickhouseReader(Reader) holds the result-handling logic, with NativeDriverReader and HTTPDriverReader as thin siblings (HTTPDriverReader no longer subclasses NativeDriverReader). Updated all ClickhouseNativePool construction sites (cluster cache, admin common, cleanup/optimize/querylog CLIs, tests/fakes) and the admin credential-leak AST guard + mocks to target ClickhouseNativePool. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_011MC5N7AYMhJDBkJ4P7engC --- snuba/admin/clickhouse/common.py | 4 +- snuba/cli/cleanup.py | 5 +- snuba/cli/optimize.py | 5 +- snuba/cli/querylog_to_csv.py | 4 +- snuba/clickhouse/connect.py | 23 +++--- snuba/clickhouse/native.py | 70 ++++++++++++++++++- snuba/clusters/cluster.py | 8 ++- tests/admin/test_system_queries.py | 4 +- .../optimize/test_optimize_tracker.py | 8 +-- tests/clickhouse/test_connect.py | 12 ++-- tests/clickhouse/test_native.py | 8 +-- tests/clusters/fake_cluster.py | 6 +- tests/clusters/test_cluster.py | 4 +- tests/test_clickhouse.py | 4 +- .../test_cluster_loadinfo.py | 2 +- tests/web/test_cache_partitions.py | 4 +- 16 files changed, 122 insertions(+), 49 deletions(-) diff --git a/snuba/admin/clickhouse/common.py b/snuba/admin/clickhouse/common.py index 25c71940b75..e43975cd8e7 100644 --- a/snuba/admin/clickhouse/common.py +++ b/snuba/admin/clickhouse/common.py @@ -6,7 +6,7 @@ from sql_metadata import Parser, QueryType # type: ignore from snuba import settings -from snuba.clickhouse.native import ClickhousePool +from snuba.clickhouse.native import ClickhouseNativePool, ClickhousePool from snuba.clusters.cluster import ClickhouseClientSettings, ClickhouseCluster from snuba.datasets.storage import ReadableTableStorage from snuba.datasets.storages.factory import get_storage @@ -96,7 +96,7 @@ def _build_validated_pool( # directly from this module. The regression test # test_no_direct_clickhouse_pool_construction_in_admin enforces this. _validate_node(clickhouse_host, clickhouse_port, cluster, storage_name) - return ClickhousePool( + return ClickhouseNativePool( clickhouse_host, clickhouse_port, username, diff --git a/snuba/cli/cleanup.py b/snuba/cli/cleanup.py index 10ce232eb62..711c3b68516 100644 --- a/snuba/cli/cleanup.py +++ b/snuba/cli/cleanup.py @@ -69,7 +69,7 @@ def cleanup( setup_sentry() from snuba.cleanup import logger, run_cleanup - from snuba.clickhouse.native import ClickhousePool + from snuba.clickhouse.native import ClickhouseNativePool, ClickhousePool storage = get_writable_storage(StorageKey(storage_name)) @@ -81,8 +81,9 @@ def cleanup( cluster = storage.get_cluster() database = cluster.get_database() + connection: ClickhousePool if clickhouse_host and clickhouse_port: - connection = ClickhousePool( + connection = ClickhouseNativePool( clickhouse_host, clickhouse_port, clickhouse_user, diff --git a/snuba/cli/optimize.py b/snuba/cli/optimize.py index f5d91dbe4a9..57dfe9a3dd7 100644 --- a/snuba/cli/optimize.py +++ b/snuba/cli/optimize.py @@ -78,7 +78,7 @@ def optimize( ) -> None: from datetime import datetime - from snuba.clickhouse.native import ClickhousePool + from snuba.clickhouse.native import ClickhouseNativePool, ClickhousePool from snuba.clickhouse.optimize.optimize import logger setup_logging(log_level) @@ -100,8 +100,9 @@ def optimize( # passing this information won't be necessary, and running this command once # will ensure that optimize is performed on all of the individual nodes for # that cluster. + connection: ClickhousePool if clickhouse_host and clickhouse_port: - connection = ClickhousePool( + connection = ClickhouseNativePool( clickhouse_host, clickhouse_port, clickhouse_user, diff --git a/snuba/cli/querylog_to_csv.py b/snuba/cli/querylog_to_csv.py index 497bde52738..9f1d3cd425f 100644 --- a/snuba/cli/querylog_to_csv.py +++ b/snuba/cli/querylog_to_csv.py @@ -7,7 +7,7 @@ from snuba import settings from snuba.admin.notifications.slack.client import SlackClient -from snuba.clickhouse.native import ClickhousePool +from snuba.clickhouse.native import ClickhouseNativePool from snuba.clusters.cluster import ClickhouseClientSettings from snuba.environment import setup_logging, setup_sentry @@ -160,7 +160,7 @@ def querylog_to_csv( query = get_query_results(event_type, [database], tables, start_time, end_time) (clickhouse_user, clickhouse_password) = get_credentials() - connection = ClickhousePool( + connection = ClickhouseNativePool( host=clickhouse_host, port=clickhouse_port, user=clickhouse_user, diff --git a/snuba/clickhouse/connect.py b/snuba/clickhouse/connect.py index eab566e6c8a..1edc5ed4dbe 100644 --- a/snuba/clickhouse/connect.py +++ b/snuba/clickhouse/connect.py @@ -16,8 +16,8 @@ from snuba.clickhouse.native import ( ClickhousePool, ClickhouseProfile, + ClickhouseReader, ClickhouseResult, - NativeDriverReader, Params, ) from snuba.utils.metrics.wrapper import MetricsWrapper @@ -73,9 +73,9 @@ def __init__( max_pool_size: int = settings.CLICKHOUSE_MAX_POOL_SIZE, client_settings: Mapping[str, Any] = {}, ) -> None: - # Intentionally does not call ClickhousePool.__init__: the native queue - # of connections is not used here. ``port`` mirrors the base class - # attribute (it holds the default HTTP port for this driver). + # No native connection queue here; clickhouse-connect manages its own + # HTTP pool. ``port`` is the abstract base attribute (it holds the + # default HTTP port for this driver). self.host = host self.port = DEFAULT_CLICKHOUSE_HTTP_PORT self.user = user @@ -240,8 +240,8 @@ def execute( """ Execute a clickhouse query. - Unlike :class:`snuba.clickhouse.native.ClickhousePool`, this method - does not implement any retry logic of its own. Retries (stale + Unlike :class:`snuba.clickhouse.native.ClickhouseNativePool`, this + method does not implement any retry logic of its own. Retries (stale keep-alive sockets, transport errors and HTTP 429/503/504 responses) are handled internally by clickhouse-connect. Notably this means the native pool's ``TOO_MANY_SIMULTANEOUS_QUERIES`` backoff is *not* @@ -317,13 +317,12 @@ def close(self) -> None: self.__client = None -class HTTPDriverReader(NativeDriverReader): +class HTTPDriverReader(ClickhouseReader): """ Reader that executes queries over HTTP via :class:`ClickhouseConnectPool`. - Both drivers return the same :class:`ClickhouseResult`, so the result - handling is identical to :class:`NativeDriverReader`. This class exists as - its own type so the driver in use is explicit and so the cluster can pick - between the two readers based on the ``use_clickhouse_connect_driver`` - runtime config. + It shares all result handling with :class:`NativeDriverReader` through the + common :class:`ClickhouseReader` base; it exists as its own type so the + driver in use is explicit and so the cluster can pick between the two + readers based on the ``use_clickhouse_connect_driver`` runtime config. """ diff --git a/snuba/clickhouse/native.py b/snuba/clickhouse/native.py index a737a5e30d4..df3d7f727ae 100644 --- a/snuba/clickhouse/native.py +++ b/snuba/clickhouse/native.py @@ -4,6 +4,7 @@ import queue import re import time +from abc import ABC, abstractmethod from contextlib import contextmanager from dataclasses import dataclass, field from datetime import date, datetime @@ -72,7 +73,61 @@ def capture_logging() -> Generator[StringIO, None, None]: buffer.close() -class ClickhousePool(object): +class ClickhousePool(ABC): + """ + Abstract base for a pool of ClickHouse connections. + + Concrete implementations: + - :class:`ClickhouseNativePool` — native protocol via clickhouse-driver + - ``ClickhouseConnectPool`` (snuba.clickhouse.connect) — HTTP protocol via + clickhouse-connect + + Callers receive connections typed as ``ClickhousePool`` and only rely on the + methods/attributes declared here, so the two drivers are interchangeable. + """ + + host: str + port: int + user: str + password: str + database: str + + @abstractmethod + def execute( + self, + query: str, + params: Params = None, + with_column_types: bool = False, + query_id: Optional[str] = None, + settings: Optional[Mapping[str, Any]] = None, + types_check: bool = False, + columnar: bool = False, + capture_trace: bool = False, + retryable: bool = True, + ) -> ClickhouseResult: + raise NotImplementedError + + @abstractmethod + def execute_robust( + self, + query: str, + params: Params = None, + with_column_types: bool = False, + query_id: Optional[str] = None, + settings: Optional[Mapping[str, Any]] = None, + types_check: bool = False, + columnar: bool = False, + capture_trace: bool = False, + retryable: bool = True, + ) -> ClickhouseResult: + raise NotImplementedError + + @abstractmethod + def close(self) -> None: + raise NotImplementedError + + +class ClickhouseNativePool(ClickhousePool): def __init__( self, host: str, @@ -400,7 +455,14 @@ def transform_uuid(value: UUID) -> str: ) -class NativeDriverReader(Reader): +class ClickhouseReader(Reader): + """ + Shared base for the ClickHouse readers. It adapts a :class:`ClickhouseResult` + (returned identically by both drivers) into the JSON-flavored ``Result``. + The concrete :class:`NativeDriverReader` and ``HTTPDriverReader`` differ only + in which pool they wrap. + """ + def __init__( self, cache_partition_id: Optional[str], @@ -481,3 +543,7 @@ def execute( ), with_totals=with_totals, ) + + +class NativeDriverReader(ClickhouseReader): + """Reader that executes queries over the native protocol (ClickhouseNativePool).""" diff --git a/snuba/clusters/cluster.py b/snuba/clusters/cluster.py index e7e84795450..9565ddfc3c7 100644 --- a/snuba/clusters/cluster.py +++ b/snuba/clusters/cluster.py @@ -20,7 +20,11 @@ from snuba import settings, state from snuba.clickhouse.http import HTTPBatchWriter, InsertStatement, JSONRow -from snuba.clickhouse.native import ClickhousePool, NativeDriverReader +from snuba.clickhouse.native import ( + ClickhouseNativePool, + ClickhousePool, + NativeDriverReader, +) from snuba.clusters.storage_sets import ( DEV_STORAGE_SETS, StorageSetKey, @@ -227,7 +231,7 @@ def get_node_connection( "native", ) if cache_key not in self.__cache: - self.__cache[cache_key] = ClickhousePool( + self.__cache[cache_key] = ClickhouseNativePool( node.host_name, node.port, user, diff --git a/tests/admin/test_system_queries.py b/tests/admin/test_system_queries.py index d395bc743fa..ba40adc5834 100644 --- a/tests/admin/test_system_queries.py +++ b/tests/admin/test_system_queries.py @@ -316,7 +316,7 @@ def test_clusterless_rejects_unvalidated_host( "_validate_node", side_effect=InvalidNodeError("host not in cluster"), ) as mock_validate, - patch.object(common, "ClickhousePool") as mock_pool, + patch.object(common, "ClickhouseNativePool") as mock_pool, ): # Clear any cached connection for this storage so the cache lookup # can't short-circuit validation. @@ -416,7 +416,7 @@ def visit_Call(self, node: ast.Call) -> None: if isinstance(func, ast.Attribute) else None ) - if name == "ClickhousePool": + if name == "ClickhouseNativePool": results.append((node, list(self.scope))) self.generic_visit(node) diff --git a/tests/clickhouse/optimize/test_optimize_tracker.py b/tests/clickhouse/optimize/test_optimize_tracker.py index ee03e5e38a9..406f55164a9 100644 --- a/tests/clickhouse/optimize/test_optimize_tracker.py +++ b/tests/clickhouse/optimize/test_optimize_tracker.py @@ -7,7 +7,7 @@ import pytest from snuba import settings -from snuba.clickhouse.native import ClickhousePool, ClickhouseResult +from snuba.clickhouse.native import ClickhouseNativePool, ClickhouseResult from snuba.clickhouse.optimize import optimize from snuba.clickhouse.optimize.optimize import run_optimize_cron_job from snuba.clickhouse.optimize.optimize_tracker import ( @@ -298,10 +298,10 @@ def test_merge_info() -> None: ] ) - with patch.object(ClickhousePool, "execute") as mock_clickhouse_execute: + with patch.object(ClickhouseNativePool, "execute") as mock_clickhouse_execute: mock_clickhouse_execute.return_value = merge_query_result merge_info = optimize.get_current_large_merges( - clickhouse=ClickhousePool("127.0.0.1", 9000, "user", "password", "database"), + clickhouse=ClickhouseNativePool("127.0.0.1", 9000, "user", "password", "database"), database="default", table="errors_local", ) @@ -322,7 +322,7 @@ def test_merge_info() -> None: assert merge_info[0].estimated_time == 8020.61436897 / (0.9895385071013121 + 0.0001) busy = optimize.is_busy_merging( - clickhouse=ClickhousePool("127.0.0.1", 9000, "user", "password", "database"), + clickhouse=ClickhouseNativePool("127.0.0.1", 9000, "user", "password", "database"), database="default", table="errors_local", ) diff --git a/tests/clickhouse/test_connect.py b/tests/clickhouse/test_connect.py index ed5c592f657..fee8745bd47 100644 --- a/tests/clickhouse/test_connect.py +++ b/tests/clickhouse/test_connect.py @@ -219,10 +219,12 @@ def test_pool_size_runtime_override() -> None: assert kwargs["maxsize"] == 42 -def test_http_driver_reader_is_a_native_driver_reader() -> None: - # HTTPDriverReader reuses the native reader's result handling; it just - # exists as its own type so the driver in use is explicit. +def test_http_driver_reader_is_a_clickhouse_reader_sibling() -> None: + # HTTPDriverReader and NativeDriverReader share the ClickhouseReader base + # but are siblings: HTTPDriverReader is NOT a NativeDriverReader. + from snuba.clickhouse.native import ClickhouseReader + pool = _make_pool(mock.Mock()) reader = HTTPDriverReader(cache_partition_id=None, client=pool, query_settings_prefix=None) - assert isinstance(reader, NativeDriverReader) - assert isinstance(reader, HTTPDriverReader) + assert isinstance(reader, ClickhouseReader) + assert not isinstance(reader, NativeDriverReader) diff --git a/tests/clickhouse/test_native.py b/tests/clickhouse/test_native.py index 1eabd3518f1..12df495bcc7 100644 --- a/tests/clickhouse/test_native.py +++ b/tests/clickhouse/test_native.py @@ -9,7 +9,7 @@ from snuba import state from snuba.clickhouse.errors import ClickhouseError -from snuba.clickhouse.native import ClickhousePool, transform_datetime +from snuba.clickhouse.native import ClickhouseNativePool, transform_datetime def test_transform_datetime() -> None: @@ -27,7 +27,7 @@ def test_robust_concurrency_limit() -> None: connection = mock.Mock() connection.execute.side_effect = ClickhouseError("some error", extra_data={"code": 1}) - pool = ClickhousePool("host", 100, "test", "test", "test") + pool = ClickhouseNativePool("host", 100, "test", "test", "test") pool.pool = queue.LifoQueue(1) pool.pool.put(connection, block=False) @@ -62,7 +62,7 @@ def test_concurrency_limit() -> None: state.set_config("simultaneous_queries_sleep_seconds", 0.5) - pool = ClickhousePool("host", 100, "test", "test", "test") + pool = ClickhouseNativePool("host", 100, "test", "test", "test") pool.pool = queue.LifoQueue(1) pool.pool.put(connection, block=False) @@ -100,7 +100,7 @@ def test_execute_retries(retryable: bool, expected: int) -> None: socket_timeout_connection = mock.Mock() socket_timeout_connection.execute.side_effect = errors.SocketTimeoutError - pool = ClickhousePool(CLUSTER_HOST, CLUSTER_PORT, "test", "test", TEST_DB_NAME) + pool = ClickhouseNativePool(CLUSTER_HOST, CLUSTER_PORT, "test", "test", TEST_DB_NAME) with mock.patch.object(pool, "_create_conn", lambda: socket_timeout_connection): pool.pool = queue.LifoQueue(1) diff --git a/tests/clusters/fake_cluster.py b/tests/clusters/fake_cluster.py index 4be4782d5b9..16b09404999 100644 --- a/tests/clusters/fake_cluster.py +++ b/tests/clusters/fake_cluster.py @@ -1,6 +1,6 @@ from typing import Any, List, Mapping, MutableMapping, Optional, Sequence, Set, Tuple -from snuba.clickhouse.native import ClickhousePool, ClickhouseResult, Params +from snuba.clickhouse.native import ClickhouseNativePool, ClickhouseResult, Params from snuba.clusters.cluster import ( ClickhouseClientSettings, ClickhouseCluster, @@ -13,7 +13,7 @@ class ServerExplodedException(SerializableException): pass -class FakeClickhousePool(ClickhousePool): +class FakeClickhousePool(ClickhouseNativePool): def __init__(self, host_name: str) -> None: self.__queries: List[str] = [] self.host = host_name @@ -128,7 +128,7 @@ def get_node_connection( self, client_settings: ClickhouseClientSettings, node: ClickhouseNode, - ) -> ClickhousePool: + ) -> ClickhouseNativePool: settings, timeout = client_settings.value cache_key = (node, client_settings) if cache_key not in self.__connections: diff --git a/tests/clusters/test_cluster.py b/tests/clusters/test_cluster.py index 030e090bc34..f5a299a6f26 100644 --- a/tests/clusters/test_cluster.py +++ b/tests/clusters/test_cluster.py @@ -5,7 +5,7 @@ import pytest from snuba import settings -from snuba.clickhouse.native import ClickhousePool, ClickhouseResult +from snuba.clickhouse.native import ClickhouseNativePool, ClickhouseResult from snuba.clusters import cluster from snuba.clusters.storage_sets import StorageSetKey from snuba.datasets.storages.factory import get_storage @@ -177,7 +177,7 @@ def test_disabled_cluster() -> None: @pytest.mark.clickhouse_db def test_get_local_nodes() -> None: importlib.reload(cluster) - with patch.object(ClickhousePool, "execute") as execute: + with patch.object(ClickhouseNativePool, "execute") as execute: execute.return_value = ClickhouseResult([("host_1", 9000, 1, 1), ("host_2", 9000, 2, 1)]) local_cluster = get_storage(StorageKey("errors")).get_cluster() diff --git a/tests/test_clickhouse.py b/tests/test_clickhouse.py index 6278d374657..44880267110 100644 --- a/tests/test_clickhouse.py +++ b/tests/test_clickhouse.py @@ -5,7 +5,7 @@ from snuba.clickhouse.columns import Array, UInt from snuba.clickhouse.columns import SchemaModifiers as Modifier -from snuba.clickhouse.native import ClickhousePool +from snuba.clickhouse.native import ClickhouseNativePool from snuba.clusters.cluster import ClickhouseClientSettings from snuba.datasets.storages.factory import get_storage, get_writable_storage from snuba.datasets.storages.storage_key import StorageKey @@ -35,7 +35,7 @@ def test_reconnect(FakeClient: Client) -> None: errors.NetworkError, '{"data": "to my face"}', ] - cp = ClickhousePool("0:0:0:0", 9000, "default", "", "default") + cp = ClickhouseNativePool("0:0:0:0", 9000, "default", "", "default") cp.execute("SHOW TABLES") assert FakeClient.return_value.execute.mock_calls == [ call( diff --git a/tests/web/rpc/v1/routing_strategies/test_cluster_loadinfo.py b/tests/web/rpc/v1/routing_strategies/test_cluster_loadinfo.py index ade72da039e..871b89f1d20 100644 --- a/tests/web/rpc/v1/routing_strategies/test_cluster_loadinfo.py +++ b/tests/web/rpc/v1/routing_strategies/test_cluster_loadinfo.py @@ -42,7 +42,7 @@ def test_get_cluster_loadinfo_if_cache_fails() -> None: @pytest.mark.redis_db @pytest.mark.clickhouse_db def test_get_cluster_load_error_handling() -> None: - with patch("snuba.clusters.cluster.ClickhousePool.execute") as mock_execute: + with patch("snuba.clusters.cluster.ClickhouseNativePool.execute") as mock_execute: mock_execute.side_effect = Exception("Test error") load_info = get_cluster_loadinfo() assert load_info is not None diff --git a/tests/web/test_cache_partitions.py b/tests/web/test_cache_partitions.py index 82884b3780e..f98b1be623c 100644 --- a/tests/web/test_cache_partitions.py +++ b/tests/web/test_cache_partitions.py @@ -1,12 +1,12 @@ import pytest -from snuba.clickhouse.native import ClickhousePool, NativeDriverReader +from snuba.clickhouse.native import ClickhouseNativePool, NativeDriverReader from snuba.web.db_query import _get_cache_partition @pytest.mark.redis_db def test_cache_partition() -> None: - pool = ClickhousePool("127.0.0.1", 9000, "", "", "") + pool = ClickhouseNativePool("127.0.0.1", 9000, "", "", "") reader1 = NativeDriverReader(None, pool, None) reader2 = NativeDriverReader(None, pool, None) From 778ccc99a40973a9dcb2970da91e73c822a62c07 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 11:22:31 +0000 Subject: [PATCH 14/29] fix(clickhouse): Use the cluster's configured http_port for the connect driver The connect pool now connects on the cluster's configured http_port instead of a hardcoded 8123, matching HTTPBatchWriter and supporting non-default ports (e.g. 8229 in the distributed-migrations test settings). Unify ConnectionCache into a single get_node_connection that returns the abstract ClickhousePool and builds either the native or the clickhouse-connect pool based on a use_connect flag (passed by the cluster along with http_port). This removes the driver-specific get_http_node_connection in favor of one method typed against the abstract base. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_011MC5N7AYMhJDBkJ4P7engC --- snuba/clickhouse/connect.py | 8 +-- snuba/clusters/cluster.py | 113 ++++++++++++++---------------------- 2 files changed, 47 insertions(+), 74 deletions(-) diff --git a/snuba/clickhouse/connect.py b/snuba/clickhouse/connect.py index 1edc5ed4dbe..a1524a37f8c 100644 --- a/snuba/clickhouse/connect.py +++ b/snuba/clickhouse/connect.py @@ -32,8 +32,7 @@ # same way the native driver uses them. DEFAULT_SEND_RECEIVE_TIMEOUT_SECONDS = 300 -# The clickhouse-connect driver always talks to the default ClickHouse HTTP -# port. The HTTP port is intentionally not configurable here. +# Default ClickHouse HTTP port, used when a caller does not pass one. DEFAULT_CLICKHOUSE_HTTP_PORT = 8123 # clickhouse-connect raises a ProgrammingError by default when it is asked to @@ -65,6 +64,7 @@ def __init__( user: str, password: str, database: str, + http_port: int = DEFAULT_CLICKHOUSE_HTTP_PORT, secure: bool = False, ca_certs: Optional[str] = None, verify: Optional[bool] = False, @@ -75,9 +75,9 @@ def __init__( ) -> None: # No native connection queue here; clickhouse-connect manages its own # HTTP pool. ``port`` is the abstract base attribute (it holds the - # default HTTP port for this driver). + # cluster's configured HTTP port for this driver). self.host = host - self.port = DEFAULT_CLICKHOUSE_HTTP_PORT + self.port = http_port self.user = user self.password = password self.database = database diff --git a/snuba/clusters/cluster.py b/snuba/clusters/cluster.py index 9565ddfc3c7..22255e562c3 100644 --- a/snuba/clusters/cluster.py +++ b/snuba/clusters/cluster.py @@ -215,53 +215,15 @@ def get_node_connection( secure: bool, ca_certs: Optional[str], verify: Optional[bool], + http_port: int, + use_connect: bool, ) -> ClickhousePool: - """Return a cached native (clickhouse-driver) connection pool.""" - with self.__lock: - client_settings_dict, timeout = client_settings.value - cache_key = ( - node, - client_settings, - user, - password, - database, - secure, - ca_certs, - verify, - "native", - ) - if cache_key not in self.__cache: - self.__cache[cache_key] = ClickhouseNativePool( - node.host_name, - node.port, - user, - password, - database, - client_settings=client_settings_dict, - send_receive_timeout=timeout, - secure=secure, - ca_certs=ca_certs, - verify=verify, - ) - - return self.__cache[cache_key] - - def get_http_node_connection( - self, - client_settings: ClickhouseClientSettings, - node: ClickhouseNode, - user: str, - password: str, - database: str, - secure: bool, - ca_certs: Optional[str], - verify: Optional[bool], - ) -> ClickhousePool: - """Return a cached clickhouse-connect (HTTP) connection pool.""" - # Imported here so that importing this module does not import - # clickhouse-connect until an HTTP connection is actually requested. - from snuba.clickhouse.connect import ClickhouseConnectPool - + """ + Return a cached connection pool for the node, typed as the abstract + :class:`ClickhousePool`. When ``use_connect`` is set the + clickhouse-connect (HTTP) pool is built, otherwise the native one. Both + variants are cached side by side (the driver is part of the cache key). + """ with self.__lock: client_settings_dict, timeout = client_settings.value cache_key = ( @@ -273,20 +235,41 @@ def get_http_node_connection( secure, ca_certs, verify, - "http", + "http" if use_connect else "native", ) if cache_key not in self.__cache: - self.__cache[cache_key] = ClickhouseConnectPool( - host=node.host_name, - user=user, - password=password, - database=database, - client_settings=client_settings_dict, - send_receive_timeout=timeout, - secure=secure, - ca_certs=ca_certs, - verify=verify, - ) + pool: ClickhousePool + if use_connect: + # Imported here so that the native code path never imports + # clickhouse-connect. + from snuba.clickhouse.connect import ClickhouseConnectPool + + pool = ClickhouseConnectPool( + host=node.host_name, + http_port=http_port, + user=user, + password=password, + database=database, + client_settings=client_settings_dict, + send_receive_timeout=timeout, + secure=secure, + ca_certs=ca_certs, + verify=verify, + ) + else: + pool = ClickhouseNativePool( + node.host_name, + node.port, + user, + password, + database, + client_settings=client_settings_dict, + send_receive_timeout=timeout, + secure=secure, + ca_certs=ca_certs, + verify=verify, + ) + self.__cache[cache_key] = pool return self.__cache[cache_key] @@ -389,18 +372,6 @@ def get_node_connection( The choice applies to every caller (reads, migrations, replacer, optimize, ...), not just the read path. """ - if use_clickhouse_connect_driver(): - return self.__connection_cache.get_http_node_connection( - client_settings, - node, - self.__user, - self.__password, - self.__database, - self.__secure, - self.__ca_certs, - self.__verify, - ) - return self.__connection_cache.get_node_connection( client_settings, node, @@ -410,6 +381,8 @@ def get_node_connection( self.__secure, self.__ca_certs, self.__verify, + http_port=self.__http_port, + use_connect=use_clickhouse_connect_driver(), ) def __build_reader(self, cache_partition_id: Optional[str], client: ClickhousePool) -> Reader: From 8ecbb7f2fbd4f17e7f6eae965fcb8055e6f76431 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 11:28:15 +0000 Subject: [PATCH 15/29] perf(clickhouse): Cache deleter's local-node lookup get_deleter() no longer runs get_local_nodes() (a system.clusters query on multi-node clusters) on every call. The node lookup is static cluster topology, so it is cached once; the reader and pool are still resolved per call so the driver can switch at runtime. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_011MC5N7AYMhJDBkJ4P7engC --- snuba/clusters/cluster.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/snuba/clusters/cluster.py b/snuba/clusters/cluster.py index 22255e562c3..aaf1aa3e9a2 100644 --- a/snuba/clusters/cluster.py +++ b/snuba/clusters/cluster.py @@ -337,6 +337,9 @@ def __init__( self.__connection_cache = connection_cache self.__cache_partition_id = cache_partition_id self.__query_settings_prefix = query_settings_prefix + # The local node used by the deleter is static cluster topology; cache + # it so get_deleter() does not re-run a system.clusters lookup per call. + self.__delete_local_node: Optional[ClickhouseNode] = None def __str__(self) -> str: return str(self.__query_node) @@ -409,12 +412,17 @@ def __build_reader(self, cache_partition_id: Optional[str], client: ClickhousePo ) def get_deleter(self) -> Reader: - # we need the connection to the storage nodes, not - # the distributed nodes - local_node = self.get_local_nodes()[0] + # we need the connection to the storage nodes, not the distributed + # nodes. The node lookup is cached (it can run a system.clusters query + # on multi-node clusters) while the reader/pool are resolved per call so + # the driver can still switch at runtime. + if self.__delete_local_node is None: + self.__delete_local_node = self.get_local_nodes()[0] return self.__build_reader( cache_partition_id=f"{self.__cache_partition_id}_deletes", - client=self.get_node_connection(ClickhouseClientSettings.DELETE, local_node), + client=self.get_node_connection( + ClickhouseClientSettings.DELETE, self.__delete_local_node + ), ) def get_reader(self) -> Reader: From fac0556d8959cce31d6da7f010325f23d3d4c41b Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 11:37:15 +0000 Subject: [PATCH 16/29] fix(clickhouse): Resolve driver once per get_reader/get_deleter get_reader and get_deleter previously read use_clickhouse_connect_driver twice (once when resolving the pool, once when picking the reader type), so a config flip between the reads could pair an HTTPDriverReader with a native pool (or vice versa). Resolve the flag once and thread it to both get_node_connection and the reader builder so they always agree. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_011MC5N7AYMhJDBkJ4P7engC --- snuba/clusters/cluster.py | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/snuba/clusters/cluster.py b/snuba/clusters/cluster.py index aaf1aa3e9a2..33cd0981478 100644 --- a/snuba/clusters/cluster.py +++ b/snuba/clusters/cluster.py @@ -363,6 +363,7 @@ def get_node_connection( self, client_settings: ClickhouseClientSettings, node: ClickhouseNode, + use_connect: Optional[bool] = None, ) -> ClickhousePool: """ Get a Clickhouse connection using the client settings provided. Reuse any @@ -374,7 +375,13 @@ def get_node_connection( clickhouse-connect (HTTP) pool is returned, otherwise the native pool. The choice applies to every caller (reads, migrations, replacer, optimize, ...), not just the read path. + + ``use_connect`` lets a caller resolve the driver once and reuse the same + value (e.g. so the reader and its pool can't disagree if the config + flips mid-call); when omitted the runtime config is read here. """ + if use_connect is None: + use_connect = use_clickhouse_connect_driver() return self.__connection_cache.get_node_connection( client_settings, node, @@ -385,16 +392,19 @@ def get_node_connection( self.__ca_certs, self.__verify, http_port=self.__http_port, - use_connect=use_clickhouse_connect_driver(), + use_connect=use_connect, ) - def __build_reader(self, cache_partition_id: Optional[str], client: ClickhousePool) -> Reader: + def __build_reader( + self, cache_partition_id: Optional[str], client: ClickhousePool, use_connect: bool + ) -> Reader: """ - Wrap a connection in the reader matching the driver currently selected - by the runtime config. Both reader classes adapt the same - ClickhouseResult, so they differ only in which pool they wrap. + Wrap a connection in the reader matching the selected driver. Both reader + classes adapt the same ClickhouseResult, so they differ only in which + pool they wrap. ``use_connect`` is resolved once by the caller so the + reader and the pool always agree. """ - if use_clickhouse_connect_driver(): + if use_connect: # Imported here so the native code path never imports # clickhouse-connect. from snuba.clickhouse.connect import HTTPDriverReader @@ -418,11 +428,16 @@ def get_deleter(self) -> Reader: # the driver can still switch at runtime. if self.__delete_local_node is None: self.__delete_local_node = self.get_local_nodes()[0] + # Resolve the driver once so the reader and its pool always match. + use_connect = use_clickhouse_connect_driver() return self.__build_reader( cache_partition_id=f"{self.__cache_partition_id}_deletes", client=self.get_node_connection( - ClickhouseClientSettings.DELETE, self.__delete_local_node + ClickhouseClientSettings.DELETE, + self.__delete_local_node, + use_connect=use_connect, ), + use_connect=use_connect, ) def get_reader(self) -> Reader: @@ -432,9 +447,14 @@ def get_reader(self) -> Reader: runtime config. The config is read on each call, so the driver can be switched at runtime. """ + # Resolve the driver once so the reader and its pool always match. + use_connect = use_clickhouse_connect_driver() return self.__build_reader( cache_partition_id=self.__cache_partition_id, - client=self.get_query_connection(ClickhouseClientSettings.QUERY), + client=self.get_node_connection( + ClickhouseClientSettings.QUERY, self.__query_node, use_connect=use_connect + ), + use_connect=use_connect, ) def get_batch_writer( From 30915ab31771784c779428a6ab92ebb5f01f1e94 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 11:43:37 +0000 Subject: [PATCH 17/29] fix(clickhouse): Match FakeClickhouseCluster override signature; drop stray dump.rdb - FakeClickhouseCluster.get_node_connection now accepts the use_connect parameter added to ClickhouseCluster.get_node_connection, fixing the mypy override-incompatibility caught by the pre-commit hook. - Remove dump.rdb (a Redis dump accidentally committed) from the repo and gitignore it. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_011MC5N7AYMhJDBkJ4P7engC --- .gitignore | 1 + dump.rdb | Bin 89 -> 0 bytes tests/clusters/fake_cluster.py | 1 + 3 files changed, 2 insertions(+) delete mode 100644 dump.rdb diff --git a/.gitignore b/.gitignore index ec5476fa210..208fd5eeaaf 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,4 @@ gocd/templates/vendor/ gocd/generated-pipelines/ Brewfile.lock.json .zed/ +dump.rdb diff --git a/dump.rdb b/dump.rdb deleted file mode 100644 index 76da9e1f351f10aa5866cd7e2a936c2f8081592f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 89 zcmWG?b@2=~FfcUu#aWb^l3A=5xUrsf_}s1suN#gUkwrkj*loO*!azez#%zk4+oya2@vBkceH diff --git a/tests/clusters/fake_cluster.py b/tests/clusters/fake_cluster.py index 16b09404999..aaf92f01952 100644 --- a/tests/clusters/fake_cluster.py +++ b/tests/clusters/fake_cluster.py @@ -128,6 +128,7 @@ def get_node_connection( self, client_settings: ClickhouseClientSettings, node: ClickhouseNode, + use_connect: Optional[bool] = None, ) -> ClickhouseNativePool: settings, timeout = client_settings.value cache_key = (node, client_settings) From d88912b164bdc5fa0ac1605c29585c5b9675eea2 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 11:56:02 +0000 Subject: [PATCH 18/29] refactor(clickhouse): Have pools build their own reader; cluster stays abstract Add ClickhousePool.get_reader() so each pool returns its matching reader: ClickhouseNativePool -> NativeDriverReader, ClickhouseConnectPool -> HTTPDriverReader. ClickhouseCluster.get_reader/get_deleter now just call pool.get_reader(), so the cluster only ever touches the abstract ClickhousePool and Reader -- it no longer imports or constructs the concrete reader classes. This also removes the cluster's __build_reader and the use_connect threading: because the reader is built from the pool, the reader and pool can never disagree about the driver, so the double config read (and its race) is gone. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_011MC5N7AYMhJDBkJ4P7engC --- snuba/clickhouse/connect.py | 12 ++++++ snuba/clickhouse/native.py | 24 +++++++++++ snuba/clusters/cluster.py | 80 ++++++++----------------------------- 3 files changed, 53 insertions(+), 63 deletions(-) diff --git a/snuba/clickhouse/connect.py b/snuba/clickhouse/connect.py index a1524a37f8c..a2d3b55aed2 100644 --- a/snuba/clickhouse/connect.py +++ b/snuba/clickhouse/connect.py @@ -20,6 +20,7 @@ ClickhouseResult, Params, ) +from snuba.reader import Reader from snuba.utils.metrics.wrapper import MetricsWrapper logger = logging.getLogger("snuba.clickhouse.connect") @@ -316,6 +317,17 @@ def close(self) -> None: self.__client.close() self.__client = None + def get_reader( + self, + cache_partition_id: Optional[str], + query_settings_prefix: Optional[str], + ) -> Reader: + return HTTPDriverReader( + cache_partition_id=cache_partition_id, + client=self, + query_settings_prefix=query_settings_prefix, + ) + class HTTPDriverReader(ClickhouseReader): """ diff --git a/snuba/clickhouse/native.py b/snuba/clickhouse/native.py index df3d7f727ae..104fee9d7d5 100644 --- a/snuba/clickhouse/native.py +++ b/snuba/clickhouse/native.py @@ -126,6 +126,19 @@ def execute_robust( def close(self) -> None: raise NotImplementedError + @abstractmethod + def get_reader( + self, + cache_partition_id: Optional[str], + query_settings_prefix: Optional[str], + ) -> Reader: + """ + Build the reader that pairs with this driver, wrapping this pool. Each + concrete pool returns its matching reader, so the reader and the pool + can never disagree about the driver. + """ + raise NotImplementedError + class ClickhouseNativePool(ClickhousePool): def __init__( @@ -414,6 +427,17 @@ def close(self) -> None: except queue.Empty: pass + def get_reader( + self, + cache_partition_id: Optional[str], + query_settings_prefix: Optional[str], + ) -> Reader: + return NativeDriverReader( + cache_partition_id=cache_partition_id, + client=self, + query_settings_prefix=query_settings_prefix, + ) + def transform_date(value: date) -> str: """ diff --git a/snuba/clusters/cluster.py b/snuba/clusters/cluster.py index 33cd0981478..bbbd09dc309 100644 --- a/snuba/clusters/cluster.py +++ b/snuba/clusters/cluster.py @@ -20,11 +20,7 @@ from snuba import settings, state from snuba.clickhouse.http import HTTPBatchWriter, InsertStatement, JSONRow -from snuba.clickhouse.native import ( - ClickhouseNativePool, - ClickhousePool, - NativeDriverReader, -) +from snuba.clickhouse.native import ClickhouseNativePool, ClickhousePool from snuba.clusters.storage_sets import ( DEV_STORAGE_SETS, StorageSetKey, @@ -363,7 +359,6 @@ def get_node_connection( self, client_settings: ClickhouseClientSettings, node: ClickhouseNode, - use_connect: Optional[bool] = None, ) -> ClickhousePool: """ Get a Clickhouse connection using the client settings provided. Reuse any @@ -375,13 +370,7 @@ def get_node_connection( clickhouse-connect (HTTP) pool is returned, otherwise the native pool. The choice applies to every caller (reads, migrations, replacer, optimize, ...), not just the read path. - - ``use_connect`` lets a caller resolve the driver once and reuse the same - value (e.g. so the reader and its pool can't disagree if the config - flips mid-call); when omitted the runtime config is read here. """ - if use_connect is None: - use_connect = use_clickhouse_connect_driver() return self.__connection_cache.get_node_connection( client_settings, node, @@ -392,69 +381,34 @@ def get_node_connection( self.__ca_certs, self.__verify, http_port=self.__http_port, - use_connect=use_connect, - ) - - def __build_reader( - self, cache_partition_id: Optional[str], client: ClickhousePool, use_connect: bool - ) -> Reader: - """ - Wrap a connection in the reader matching the selected driver. Both reader - classes adapt the same ClickhouseResult, so they differ only in which - pool they wrap. ``use_connect`` is resolved once by the caller so the - reader and the pool always agree. - """ - if use_connect: - # Imported here so the native code path never imports - # clickhouse-connect. - from snuba.clickhouse.connect import HTTPDriverReader - - return HTTPDriverReader( - cache_partition_id=cache_partition_id, - client=client, - query_settings_prefix=self.__query_settings_prefix, - ) - - return NativeDriverReader( - cache_partition_id=cache_partition_id, - client=client, - query_settings_prefix=self.__query_settings_prefix, + use_connect=use_clickhouse_connect_driver(), ) def get_deleter(self) -> Reader: # we need the connection to the storage nodes, not the distributed # nodes. The node lookup is cached (it can run a system.clusters query - # on multi-node clusters) while the reader/pool are resolved per call so - # the driver can still switch at runtime. + # on multi-node clusters) while the connection/reader are resolved per + # call so the driver can still switch at runtime. The pool builds its + # own matching reader, so the reader and the pool always agree. if self.__delete_local_node is None: self.__delete_local_node = self.get_local_nodes()[0] - # Resolve the driver once so the reader and its pool always match. - use_connect = use_clickhouse_connect_driver() - return self.__build_reader( - cache_partition_id=f"{self.__cache_partition_id}_deletes", - client=self.get_node_connection( - ClickhouseClientSettings.DELETE, - self.__delete_local_node, - use_connect=use_connect, - ), - use_connect=use_connect, + return self.get_node_connection( + ClickhouseClientSettings.DELETE, self.__delete_local_node + ).get_reader( + f"{self.__cache_partition_id}_deletes", + self.__query_settings_prefix, ) def get_reader(self) -> Reader: """ - Return a reader for the query node, using the HTTP (clickhouse-connect) - or the native driver based on the ``use_clickhouse_connect_driver`` - runtime config. The config is read on each call, so the driver can be - switched at runtime. + Return a reader for the query node. The connection (native or HTTP) is + selected from the ``use_clickhouse_connect_driver`` runtime config and + builds its own matching reader, so the driver can be switched at runtime + and the reader and pool can never disagree. """ - # Resolve the driver once so the reader and its pool always match. - use_connect = use_clickhouse_connect_driver() - return self.__build_reader( - cache_partition_id=self.__cache_partition_id, - client=self.get_node_connection( - ClickhouseClientSettings.QUERY, self.__query_node, use_connect=use_connect - ), - use_connect=use_connect, + return self.get_query_connection(ClickhouseClientSettings.QUERY).get_reader( + self.__cache_partition_id, + self.__query_settings_prefix, ) def get_batch_writer( From 1c28b61019d44f9cd7e34a0f28b4ee54c45ced87 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 12:11:24 +0000 Subject: [PATCH 19/29] refactor(clickhouse): Collapse to a single ClickhouseReader; regenerate uv.lock The two reader subclasses were identical -- ClickhouseReader is driver-agnostic (it only uses the abstract ClickhousePool and transforms ClickhouseResult). So drop NativeDriverReader/HTTPDriverReader and ClickhousePool.get_reader(): the cluster wraps whichever pool get_node_connection selects in the single ClickhouseReader. The native-vs-HTTP choice stays at the pool level (ConnectionCache), and ClickhouseCluster touches only the abstract ClickhousePool plus the one reader. Also regenerate uv.lock now that clickhouse-connect (+ lz4, zstandard) is available in the internal PyPI mirror (getsentry/pypi#2273), which unblocks the dependency-install CI jobs. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_011MC5N7AYMhJDBkJ4P7engC --- snuba/clickhouse/connect.py | 24 ----------- snuba/clickhouse/native.py | 36 ++--------------- snuba/clusters/cluster.py | 37 +++++++++-------- tests/clickhouse/test_connect.py | 12 +++--- tests/clusters/test_cluster.py | 29 +++++++------ tests/web/test_cache_partitions.py | 10 ++--- uv.lock | 65 ++++++++++++++++++++++++++---- 7 files changed, 110 insertions(+), 103 deletions(-) diff --git a/snuba/clickhouse/connect.py b/snuba/clickhouse/connect.py index a2d3b55aed2..3046ce10b88 100644 --- a/snuba/clickhouse/connect.py +++ b/snuba/clickhouse/connect.py @@ -16,11 +16,9 @@ from snuba.clickhouse.native import ( ClickhousePool, ClickhouseProfile, - ClickhouseReader, ClickhouseResult, Params, ) -from snuba.reader import Reader from snuba.utils.metrics.wrapper import MetricsWrapper logger = logging.getLogger("snuba.clickhouse.connect") @@ -316,25 +314,3 @@ def close(self) -> None: if self.__client is not None: self.__client.close() self.__client = None - - def get_reader( - self, - cache_partition_id: Optional[str], - query_settings_prefix: Optional[str], - ) -> Reader: - return HTTPDriverReader( - cache_partition_id=cache_partition_id, - client=self, - query_settings_prefix=query_settings_prefix, - ) - - -class HTTPDriverReader(ClickhouseReader): - """ - Reader that executes queries over HTTP via :class:`ClickhouseConnectPool`. - - It shares all result handling with :class:`NativeDriverReader` through the - common :class:`ClickhouseReader` base; it exists as its own type so the - driver in use is explicit and so the cluster can pick between the two - readers based on the ``use_clickhouse_connect_driver`` runtime config. - """ diff --git a/snuba/clickhouse/native.py b/snuba/clickhouse/native.py index 104fee9d7d5..7ea3f40a315 100644 --- a/snuba/clickhouse/native.py +++ b/snuba/clickhouse/native.py @@ -126,19 +126,6 @@ def execute_robust( def close(self) -> None: raise NotImplementedError - @abstractmethod - def get_reader( - self, - cache_partition_id: Optional[str], - query_settings_prefix: Optional[str], - ) -> Reader: - """ - Build the reader that pairs with this driver, wrapping this pool. Each - concrete pool returns its matching reader, so the reader and the pool - can never disagree about the driver. - """ - raise NotImplementedError - class ClickhouseNativePool(ClickhousePool): def __init__( @@ -427,17 +414,6 @@ def close(self) -> None: except queue.Empty: pass - def get_reader( - self, - cache_partition_id: Optional[str], - query_settings_prefix: Optional[str], - ) -> Reader: - return NativeDriverReader( - cache_partition_id=cache_partition_id, - client=self, - query_settings_prefix=query_settings_prefix, - ) - def transform_date(value: date) -> str: """ @@ -481,10 +457,10 @@ def transform_uuid(value: UUID) -> str: class ClickhouseReader(Reader): """ - Shared base for the ClickHouse readers. It adapts a :class:`ClickhouseResult` - (returned identically by both drivers) into the JSON-flavored ``Result``. - The concrete :class:`NativeDriverReader` and ``HTTPDriverReader`` differ only - in which pool they wrap. + Reader for ClickHouse queries. It adapts a :class:`ClickhouseResult` into the + JSON-flavored ``Result``. It is driver-agnostic: it wraps the abstract + :class:`ClickhousePool`, so the same reader works for both the native and the + clickhouse-connect (HTTP) pools. """ def __init__( @@ -567,7 +543,3 @@ def execute( ), with_totals=with_totals, ) - - -class NativeDriverReader(ClickhouseReader): - """Reader that executes queries over the native protocol (ClickhouseNativePool).""" diff --git a/snuba/clusters/cluster.py b/snuba/clusters/cluster.py index bbbd09dc309..6ba7d9ab62c 100644 --- a/snuba/clusters/cluster.py +++ b/snuba/clusters/cluster.py @@ -20,7 +20,11 @@ from snuba import settings, state from snuba.clickhouse.http import HTTPBatchWriter, InsertStatement, JSONRow -from snuba.clickhouse.native import ClickhouseNativePool, ClickhousePool +from snuba.clickhouse.native import ( + ClickhouseNativePool, + ClickhousePool, + ClickhouseReader, +) from snuba.clusters.storage_sets import ( DEV_STORAGE_SETS, StorageSetKey, @@ -387,28 +391,29 @@ def get_node_connection( def get_deleter(self) -> Reader: # we need the connection to the storage nodes, not the distributed # nodes. The node lookup is cached (it can run a system.clusters query - # on multi-node clusters) while the connection/reader are resolved per - # call so the driver can still switch at runtime. The pool builds its - # own matching reader, so the reader and the pool always agree. + # on multi-node clusters) while the connection is resolved per call so + # the driver can still switch at runtime. if self.__delete_local_node is None: self.__delete_local_node = self.get_local_nodes()[0] - return self.get_node_connection( - ClickhouseClientSettings.DELETE, self.__delete_local_node - ).get_reader( - f"{self.__cache_partition_id}_deletes", - self.__query_settings_prefix, + return ClickhouseReader( + cache_partition_id=f"{self.__cache_partition_id}_deletes", + client=self.get_node_connection( + ClickhouseClientSettings.DELETE, self.__delete_local_node + ), + query_settings_prefix=self.__query_settings_prefix, ) def get_reader(self) -> Reader: """ - Return a reader for the query node. The connection (native or HTTP) is - selected from the ``use_clickhouse_connect_driver`` runtime config and - builds its own matching reader, so the driver can be switched at runtime - and the reader and pool can never disagree. + Return a reader for the query node. The driver-agnostic ClickhouseReader + wraps whichever pool (native or HTTP) get_query_connection selects from + the ``use_clickhouse_connect_driver`` runtime config, so the driver can + be switched at runtime. """ - return self.get_query_connection(ClickhouseClientSettings.QUERY).get_reader( - self.__cache_partition_id, - self.__query_settings_prefix, + return ClickhouseReader( + cache_partition_id=self.__cache_partition_id, + client=self.get_query_connection(ClickhouseClientSettings.QUERY), + query_settings_prefix=self.__query_settings_prefix, ) def get_batch_writer( diff --git a/tests/clickhouse/test_connect.py b/tests/clickhouse/test_connect.py index fee8745bd47..dcedb0b2a67 100644 --- a/tests/clickhouse/test_connect.py +++ b/tests/clickhouse/test_connect.py @@ -3,9 +3,8 @@ import pytest -from snuba.clickhouse.connect import ClickhouseConnectPool, HTTPDriverReader +from snuba.clickhouse.connect import ClickhouseConnectPool from snuba.clickhouse.errors import ClickhouseError -from snuba.clickhouse.native import NativeDriverReader # Error code returned by ClickHouse when the maximum number of simultaneous # queries has been exceeded. @@ -219,12 +218,11 @@ def test_pool_size_runtime_override() -> None: assert kwargs["maxsize"] == 42 -def test_http_driver_reader_is_a_clickhouse_reader_sibling() -> None: - # HTTPDriverReader and NativeDriverReader share the ClickhouseReader base - # but are siblings: HTTPDriverReader is NOT a NativeDriverReader. +def test_clickhouse_reader_wraps_connect_pool() -> None: + # The single driver-agnostic ClickhouseReader wraps the abstract pool, so it + # works with the connect pool just like the native one. from snuba.clickhouse.native import ClickhouseReader pool = _make_pool(mock.Mock()) - reader = HTTPDriverReader(cache_partition_id=None, client=pool, query_settings_prefix=None) + reader = ClickhouseReader(cache_partition_id=None, client=pool, query_settings_prefix=None) assert isinstance(reader, ClickhouseReader) - assert not isinstance(reader, NativeDriverReader) diff --git a/tests/clusters/test_cluster.py b/tests/clusters/test_cluster.py index f5a299a6f26..8dfc53a376b 100644 --- a/tests/clusters/test_cluster.py +++ b/tests/clusters/test_cluster.py @@ -262,10 +262,10 @@ def test_cache_connections() -> None: @pytest.mark.redis_db @pytest.mark.clickhouse_db -def test_get_reader_selects_driver() -> None: +def test_get_node_connection_selects_driver() -> None: from snuba import state - from snuba.clickhouse.connect import HTTPDriverReader - from snuba.clickhouse.native import NativeDriverReader + from snuba.clickhouse.connect import ClickhouseConnectPool + from snuba.clickhouse.native import ClickhouseNativePool, ClickhouseReader test_cluster = cluster.ClickhouseCluster( "127.0.0.1", @@ -281,20 +281,25 @@ def test_get_reader_selects_driver() -> None: True, ) - # Default: native driver. + # The driver is selected at the pool level; the reader is the single + # driver-agnostic ClickhouseReader regardless. + # Default: native pool. state.set_config("use_clickhouse_connect_driver", 0) - native_reader = test_cluster.get_reader() - assert isinstance(native_reader, NativeDriverReader) - assert not isinstance(native_reader, HTTPDriverReader) + native_pool = test_cluster.get_query_connection(cluster.ClickhouseClientSettings.QUERY) + assert isinstance(native_pool, ClickhouseNativePool) + assert isinstance(test_cluster.get_reader(), ClickhouseReader) - # Flip on at runtime: HTTP driver. + # Flip on at runtime: HTTP pool. state.set_config("use_clickhouse_connect_driver", 1) - http_reader = test_cluster.get_reader() - assert isinstance(http_reader, HTTPDriverReader) + http_pool = test_cluster.get_query_connection(cluster.ClickhouseClientSettings.QUERY) + assert isinstance(http_pool, ClickhouseConnectPool) - # Flip back: native driver again. + # Flip back: native pool again. state.set_config("use_clickhouse_connect_driver", 0) - assert not isinstance(test_cluster.get_reader(), HTTPDriverReader) + assert isinstance( + test_cluster.get_query_connection(cluster.ClickhouseClientSettings.QUERY), + ClickhouseNativePool, + ) @patch("snuba.settings.SLICED_CLUSTERS", SLICED_CLUSTERS_CONFIG) diff --git a/tests/web/test_cache_partitions.py b/tests/web/test_cache_partitions.py index f98b1be623c..27ace9ad806 100644 --- a/tests/web/test_cache_partitions.py +++ b/tests/web/test_cache_partitions.py @@ -1,22 +1,22 @@ import pytest -from snuba.clickhouse.native import ClickhouseNativePool, NativeDriverReader +from snuba.clickhouse.native import ClickhouseNativePool, ClickhouseReader from snuba.web.db_query import _get_cache_partition @pytest.mark.redis_db def test_cache_partition() -> None: pool = ClickhouseNativePool("127.0.0.1", 9000, "", "", "") - reader1 = NativeDriverReader(None, pool, None) - reader2 = NativeDriverReader(None, pool, None) + reader1 = ClickhouseReader(None, pool, None) + reader2 = ClickhouseReader(None, pool, None) default_cache = _get_cache_partition(reader1) another_default_cache = _get_cache_partition(reader2) assert id(default_cache) == id(another_default_cache) - reader3 = NativeDriverReader("non_default", pool, None) - reader4 = NativeDriverReader("non_default", pool, None) + reader3 = ClickhouseReader("non_default", pool, None) + reader4 = ClickhouseReader("non_default", pool, None) nondefault_cache = _get_cache_partition(reader3) another_nondefault_cache = _get_cache_partition(reader4) diff --git a/uv.lock b/uv.lock index 9712d097269..b27e451f94e 100644 --- a/uv.lock +++ b/uv.lock @@ -2,8 +2,10 @@ version = 1 revision = 3 requires-python = ">=3.13" resolution-markers = [ - "sys_platform == 'darwin'", - "sys_platform == 'linux'", + "python_full_version >= '3.14' and sys_platform == 'darwin'", + "python_full_version < '3.14' and sys_platform == 'darwin'", + "python_full_version >= '3.14' and sys_platform == 'linux'", + "python_full_version < '3.14' and sys_platform == 'linux'", ] supported-markers = [ "sys_platform == 'darwin'", @@ -96,6 +98,25 @@ wheels = [ { url = "https://pypi.devinfra.sentry.io/wheels/click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28" }, ] +[[package]] +name = "clickhouse-connect" +version = "1.3.0" +source = { registry = "https://pypi.devinfra.sentry.io/simple" } +dependencies = [ + { name = "certifi", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, + { name = "lz4", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, + { name = "urllib3", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, + { name = "zstandard", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, +] +wheels = [ + { url = "https://pypi.devinfra.sentry.io/wheels/clickhouse_connect-1.3.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:9164b2d021eb4f8474767a301e196ffdb045ee960e9b68fc00c53b975ea10e79" }, + { url = "https://pypi.devinfra.sentry.io/wheels/clickhouse_connect-1.3.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2432a22aab181a483f3dc6b398c8f641099f2d71bb9289ec84de9643a3b96493" }, + { url = "https://pypi.devinfra.sentry.io/wheels/clickhouse_connect-1.3.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dee1b1d91258859260b7a2059fea87e1ee52f006e4a99a76ef21aa67a31face5" }, + { url = "https://pypi.devinfra.sentry.io/wheels/clickhouse_connect-1.3.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:225de4ab00609e599f2529a8a5256da5f473ce9544a04ab9b18b8fdd5baf9005" }, + { url = "https://pypi.devinfra.sentry.io/wheels/clickhouse_connect-1.3.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c18c0773a85f26c7eaeb59e0fd0a142e464312fda1c54fc7feae6115eb1759d4" }, + { url = "https://pypi.devinfra.sentry.io/wheels/clickhouse_connect-1.3.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:61f2f32403ac23354a572b160fa0a51ad5e76ba88aea37ebe5371b8863659339" }, +] + [[package]] name = "clickhouse-driver" version = "0.2.10" @@ -486,6 +507,19 @@ wheels = [ { url = "https://pypi.devinfra.sentry.io/wheels/jsonschema_specifications-2023.12.1-py3-none-any.whl", hash = "sha256:87e4fdf3a94858b8a2ba2778d9ba57d8a9cafca7c7489c46ba0d30a8bc6a9c3c" }, ] +[[package]] +name = "lz4" +version = "4.4.5" +source = { registry = "https://pypi.devinfra.sentry.io/simple" } +wheels = [ + { url = "https://pypi.devinfra.sentry.io/wheels/lz4-4.4.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:b424df1076e40d4e884cfcc4c77d815368b7fb9ebcd7e634f937725cd9a8a72a" }, + { url = "https://pypi.devinfra.sentry.io/wheels/lz4-4.4.5-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:533298d208b58b651662dd972f52d807d48915176e5b032fb4f8c3b6f5fe535c" }, + { url = "https://pypi.devinfra.sentry.io/wheels/lz4-4.4.5-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:451039b609b9a88a934800b5fc6ee401c89ad9c175abf2f4d9f8b2e4ef1afc64" }, + { url = "https://pypi.devinfra.sentry.io/wheels/lz4-4.4.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c8e71b14938082ebaf78144f3b3917ac715f72d14c076f384a4c062df96f9df6" }, + { url = "https://pypi.devinfra.sentry.io/wheels/lz4-4.4.5-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3b84a42da86e8ad8537aabef062e7f661f4a877d1c74d65606c49d835d36d668" }, + { url = "https://pypi.devinfra.sentry.io/wheels/lz4-4.4.5-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0bba042ec5a61fa77c7e380351a61cb768277801240249841defd2ff0a10742f" }, +] + [[package]] name = "markupsafe" version = "3.0.2" @@ -899,10 +933,10 @@ wheels = [ [[package]] name = "sentry-conventions" -version = "0.11.0" +version = "0.12.0" source = { registry = "https://pypi.devinfra.sentry.io/simple" } wheels = [ - { url = "https://pypi.devinfra.sentry.io/wheels/sentry_conventions-0.11.0-py3-none-any.whl", hash = "sha256:881581e4319620d1d798306a0a631fbc1265fab9581dc268401982999f70cc5c" }, + { url = "https://pypi.devinfra.sentry.io/wheels/sentry_conventions-0.12.0-py3-none-any.whl", hash = "sha256:1316f8b439b63a757f827c751bccdde69416136aa1854a65036661bd65c7f63e" }, ] [[package]] @@ -1044,6 +1078,7 @@ source = { editable = "." } dependencies = [ { name = "blinker", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "click", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, + { name = "clickhouse-connect", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "clickhouse-driver", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "confluent-kafka", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "datadog", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, @@ -1113,6 +1148,7 @@ dev = [ requires-dist = [ { name = "blinker", specifier = ">=1.9" }, { name = "click", specifier = ">=8.1.7" }, + { name = "clickhouse-connect", specifier = ">=0.8.0" }, { name = "clickhouse-driver", specifier = ">=0.2.10" }, { name = "confluent-kafka", specifier = ">=2.7.0" }, { name = "datadog", specifier = ">=0.49.1" }, @@ -1139,7 +1175,7 @@ requires-dist = [ { name = "redis", specifier = ">=4.5.4" }, { name = "rust-snuba", editable = "rust_snuba" }, { name = "sentry-arroyo", specifier = ">=2.39.2" }, - { name = "sentry-conventions", specifier = ">=0.11.0" }, + { name = "sentry-conventions", specifier = ">=0.12.0" }, { name = "sentry-kafka-schemas", specifier = ">=2.1.24" }, { name = "sentry-protos", specifier = ">=0.17.0" }, { name = "sentry-redis-tools", specifier = ">=0.5.1" }, @@ -1376,7 +1412,8 @@ name = "watchdog" version = "3.0.0" source = { registry = "https://pypi.devinfra.sentry.io/simple" } resolution-markers = [ - "sys_platform == 'linux'", + "python_full_version >= '3.14' and sys_platform == 'linux'", + "python_full_version < '3.14' and sys_platform == 'linux'", ] wheels = [ { url = "https://pypi.devinfra.sentry.io/wheels/watchdog-3.0.0-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0e06ab8858a76e1219e68c7573dfeba9dd1c0219476c5a44d5333b01d7e1743a" }, @@ -1388,7 +1425,8 @@ name = "watchdog" version = "6.0.0" source = { registry = "https://pypi.devinfra.sentry.io/simple" } resolution-markers = [ - "sys_platform == 'darwin'", + "python_full_version >= '3.14' and sys_platform == 'darwin'", + "python_full_version < '3.14' and sys_platform == 'darwin'", ] wheels = [ { url = "https://pypi.devinfra.sentry.io/wheels/watchdog-6.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:76aae96b00ae814b181bb25b1b98076d5fc84e8a53cd8885a318b42b6d3a5134" }, @@ -1406,3 +1444,16 @@ dependencies = [ wheels = [ { url = "https://pypi.devinfra.sentry.io/wheels/werkzeug-3.1.6-py3-none-any.whl", hash = "sha256:7ddf3357bb9564e407607f988f683d72038551200c704012bb9a4c523d42f131" }, ] + +[[package]] +name = "zstandard" +version = "0.25.0" +source = { registry = "https://pypi.devinfra.sentry.io/simple" } +wheels = [ + { url = "https://pypi.devinfra.sentry.io/wheels/zstandard-0.25.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a1a4ae2dec3993a32247995bdfe367fc3266da832d82f8438c8570f989753de1" }, + { url = "https://pypi.devinfra.sentry.io/wheels/zstandard-0.25.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:bfc4e20784722098822e3eee42b8e576b379ed72cca4a7cb856ae733e62192ea" }, + { url = "https://pypi.devinfra.sentry.io/wheels/zstandard-0.25.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8e735494da3db08694d26480f1493ad2cf86e99bdd53e8e9771b2752a5c0246a" }, + { url = "https://pypi.devinfra.sentry.io/wheels/zstandard-0.25.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:05df5136bc5a011f33cd25bc9f506e7426c0c9b3f9954f056831ce68f3b6689f" }, + { url = "https://pypi.devinfra.sentry.io/wheels/zstandard-0.25.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:223415140608d0f0da010499eaa8ccdb9af210a543fac54bce15babbcfc78439" }, + { url = "https://pypi.devinfra.sentry.io/wheels/zstandard-0.25.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e09bb6252b6476d8d56100e8147b803befa9a12cea144bbe629dd508800d1ad0" }, +] From 69af5801ad3f4bf0630a8aa373efb487deac6883 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 12:15:24 +0000 Subject: [PATCH 20/29] fix(clickhouse): Include http_port in the connection cache key The HTTP (clickhouse-connect) pool connects on http_port rather than the native node.port, but http_port was not part of the ConnectionCache key. Two clusters sharing the same node, credentials and settings but using different HTTP ports could collide on the same cached pool and silently reuse the wrong port. Add http_port to the cache key (and CacheKey type). Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_011MC5N7AYMhJDBkJ4P7engC --- snuba/clusters/cluster.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/snuba/clusters/cluster.py b/snuba/clusters/cluster.py index 6ba7d9ab62c..6aaf1b8daf2 100644 --- a/snuba/clusters/cluster.py +++ b/snuba/clusters/cluster.py @@ -196,6 +196,7 @@ def use_clickhouse_connect_driver() -> bool: bool, Optional[str], Optional[bool], + int, str, ] @@ -235,6 +236,11 @@ def get_node_connection( secure, ca_certs, verify, + # The HTTP pool connects on ``http_port`` rather than the + # native ``node.port``, so it must be part of the key to avoid + # two clusters sharing a node/credentials but using different + # HTTP ports from colliding on the same cached pool. + http_port, "http" if use_connect else "native", ) if cache_key not in self.__cache: From 830b6f1ebf25f43731ba60e816f7298bcfb1b1d3 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 12:31:13 +0000 Subject: [PATCH 21/29] refactor(clickhouse): Route by-host pool acquisition through ConnectionCache The admin _build_validated_pool helper and the cleanup/optimize/querylog CLIs constructed a ClickhouseNativePool directly, hardcoding the native driver and bypassing the runtime native-vs-connect selection that the cluster's connections go through. Funnel all of them through ConnectionCache.get_node_connection so the driver is chosen by the use_clickhouse_connect_driver runtime config and every caller only ever handles the abstract ClickhousePool type. The cache now accepts an optional max_pool_size (added to the cache key) so admin can keep its pool of 2; the OPTIMIZE/CLEANUP timeouts ride along via the client settings profile the cache already reads. The admin EAP-488 guard now treats both direct pool construction and cache acquisition (get_node_connection) as "acquiring a pool", so _validate_node must still run first; its tests are updated accordingly. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_011MC5N7AYMhJDBkJ4P7engC --- snuba/admin/clickhouse/common.py | 38 ++++++++++------ snuba/cli/cleanup.py | 24 ++++++++--- snuba/cli/optimize.py | 26 +++++++---- snuba/cli/querylog_to_csv.py | 31 ++++++++++---- snuba/clusters/cluster.py | 15 +++++++ tests/admin/test_system_queries.py | 69 ++++++++++++++++++------------ 6 files changed, 140 insertions(+), 63 deletions(-) diff --git a/snuba/admin/clickhouse/common.py b/snuba/admin/clickhouse/common.py index e43975cd8e7..1aa003d3259 100644 --- a/snuba/admin/clickhouse/common.py +++ b/snuba/admin/clickhouse/common.py @@ -6,8 +6,14 @@ from sql_metadata import Parser, QueryType # type: ignore from snuba import settings -from snuba.clickhouse.native import ClickhouseNativePool, ClickhousePool -from snuba.clusters.cluster import ClickhouseClientSettings, ClickhouseCluster +from snuba.clickhouse.native import ClickhousePool +from snuba.clusters.cluster import ( + ClickhouseClientSettings, + ClickhouseCluster, + ClickhouseNode, + connection_cache, + use_clickhouse_connect_driver, +) from snuba.datasets.storage import ReadableTableStorage from snuba.datasets.storages.factory import get_storage from snuba.datasets.storages.storage_key import StorageKey @@ -89,21 +95,29 @@ def _build_validated_pool( password: str, client_settings: ClickhouseClientSettings, ) -> ClickhousePool: - # Single chokepoint for admin ClickhousePool construction. ClickhousePool - # ships the user/password in the first hello packet of the native protocol, - # so an unvalidated host means credentials reach whatever listener answers. - # All admin helpers must go through here — never call ClickhousePool - # directly from this module. The regression test - # test_no_direct_clickhouse_pool_construction_in_admin enforces this. + # Single chokepoint for admin ClickhousePool acquisition. A pool ships the + # user/password to the node (the native protocol's first hello packet, or + # the HTTP auth header), so an unvalidated host means credentials reach + # whatever listener answers. All admin helpers must go through here — never + # acquire a pool from the connection cache directly in this module. The + # regression test test_no_direct_clickhouse_pool_construction_in_admin + # enforces this. _validate_node(clickhouse_host, clickhouse_port, cluster, storage_name) - return ClickhouseNativePool( - clickhouse_host, - clickhouse_port, + # Go through the shared connection cache so the driver (native vs + # clickhouse-connect/HTTP) is selected by the runtime config, behind the + # abstract ClickhousePool type, just like the cluster's own connections. + return connection_cache.get_node_connection( + client_settings, + ClickhouseNode(clickhouse_host, clickhouse_port), username, password, database, + secure=False, + ca_certs=None, + verify=False, + http_port=cluster.get_http_port(), + use_connect=use_clickhouse_connect_driver(), max_pool_size=2, - client_settings=client_settings.value.settings, ) diff --git a/snuba/cli/cleanup.py b/snuba/cli/cleanup.py index 711c3b68516..3157b4fd6ba 100644 --- a/snuba/cli/cleanup.py +++ b/snuba/cli/cleanup.py @@ -69,7 +69,12 @@ def cleanup( setup_sentry() from snuba.cleanup import logger, run_cleanup - from snuba.clickhouse.native import ClickhouseNativePool, ClickhousePool + from snuba.clickhouse.native import ClickhousePool + from snuba.clusters.cluster import ( + ClickhouseNode, + connection_cache, + use_clickhouse_connect_driver, + ) storage = get_writable_storage(StorageKey(storage_name)) @@ -83,15 +88,20 @@ def cleanup( connection: ClickhousePool if clickhouse_host and clickhouse_port: - connection = ClickhouseNativePool( - clickhouse_host, - clickhouse_port, + # Go through the shared connection cache so the driver (native vs + # clickhouse-connect/HTTP) is selected by the runtime config, behind + # the abstract ClickhousePool type. + connection = connection_cache.get_node_connection( + ClickhouseClientSettings.CLEANUP, + ClickhouseNode(clickhouse_host, clickhouse_port), clickhouse_user, clickhouse_password, database, - clickhouse_secure, - clickhouse_ca_certs, - clickhouse_verify, + secure=clickhouse_secure, + ca_certs=clickhouse_ca_certs, + verify=clickhouse_verify, + http_port=cluster.get_http_port(), + use_connect=use_clickhouse_connect_driver(), ) elif not cluster.is_single_node(): raise click.ClickException("Provide ClickHouse host and port for cleanup") diff --git a/snuba/cli/optimize.py b/snuba/cli/optimize.py index 57dfe9a3dd7..78a38d5bc47 100644 --- a/snuba/cli/optimize.py +++ b/snuba/cli/optimize.py @@ -78,8 +78,13 @@ def optimize( ) -> None: from datetime import datetime - from snuba.clickhouse.native import ClickhouseNativePool, ClickhousePool + from snuba.clickhouse.native import ClickhousePool from snuba.clickhouse.optimize.optimize import logger + from snuba.clusters.cluster import ( + ClickhouseNode, + connection_cache, + use_clickhouse_connect_driver, + ) setup_logging(log_level) setup_sentry() @@ -102,16 +107,21 @@ def optimize( # that cluster. connection: ClickhousePool if clickhouse_host and clickhouse_port: - connection = ClickhouseNativePool( - clickhouse_host, - clickhouse_port, + # Go through the shared connection cache so the driver (native vs + # clickhouse-connect/HTTP) is selected by the runtime config, behind + # the abstract ClickhousePool type. The OPTIMIZE timeout is carried by + # the client settings profile the cache reads. + connection = connection_cache.get_node_connection( + ClickhouseClientSettings.OPTIMIZE, + ClickhouseNode(clickhouse_host, clickhouse_port), clickhouse_user, clickhouse_password, database, - clickhouse_secure, - clickhouse_ca_certs, - clickhouse_verify, - send_receive_timeout=ClickhouseClientSettings.OPTIMIZE.value.timeout, + secure=clickhouse_secure, + ca_certs=clickhouse_ca_certs, + verify=clickhouse_verify, + http_port=storage.get_cluster().get_http_port(), + use_connect=use_clickhouse_connect_driver(), ) elif not storage.get_cluster().is_single_node(): raise click.ClickException("Provide Clickhouse host and port for optimize") diff --git a/snuba/cli/querylog_to_csv.py b/snuba/cli/querylog_to_csv.py index 9f1d3cd425f..673df73a742 100644 --- a/snuba/cli/querylog_to_csv.py +++ b/snuba/cli/querylog_to_csv.py @@ -7,8 +7,13 @@ from snuba import settings from snuba.admin.notifications.slack.client import SlackClient -from snuba.clickhouse.native import ClickhouseNativePool -from snuba.clusters.cluster import ClickhouseClientSettings +from snuba.clusters.cluster import ( + DEFAULT_CLICKHOUSE_HTTP_PORT, + ClickhouseClientSettings, + ClickhouseNode, + connection_cache, + use_clickhouse_connect_driver, +) from snuba.environment import setup_logging, setup_sentry logger = structlog.get_logger().bind(module=__name__) @@ -160,13 +165,21 @@ def querylog_to_csv( query = get_query_results(event_type, [database], tables, start_time, end_time) (clickhouse_user, clickhouse_password) = get_credentials() - connection = ClickhouseNativePool( - host=clickhouse_host, - port=clickhouse_port, - user=clickhouse_user, - password=clickhouse_password, - database=database, - client_settings=ClickhouseClientSettings.QUERY.value.settings, + # Go through the shared connection cache so the driver (native vs + # clickhouse-connect/HTTP) is selected by the runtime config, behind the + # abstract ClickhousePool type. There is no cluster here to read an + # http_port from, so fall back to the well-known default. + connection = connection_cache.get_node_connection( + ClickhouseClientSettings.QUERY, + ClickhouseNode(clickhouse_host, clickhouse_port), + clickhouse_user, + clickhouse_password, + database, + secure=False, + ca_certs=None, + verify=False, + http_port=DEFAULT_CLICKHOUSE_HTTP_PORT, + use_connect=use_clickhouse_connect_driver(), ) results = connection.execute(query) filename = format_filename(table) diff --git a/snuba/clusters/cluster.py b/snuba/clusters/cluster.py index 6aaf1b8daf2..1b41aeb6d13 100644 --- a/snuba/clusters/cluster.py +++ b/snuba/clusters/cluster.py @@ -37,6 +37,11 @@ logger = structlog.get_logger().bind(module=__name__) +# Well-known default ClickHouse HTTP port, used by by-host helpers (e.g. CLI +# tools) that only know a node's native address and have no cluster config to +# read an http_port from. +DEFAULT_CLICKHOUSE_HTTP_PORT = 8123 + class ClickhouseClientSettingsType(NamedTuple): settings: Mapping[str, Any] @@ -197,6 +202,7 @@ def use_clickhouse_connect_driver() -> bool: Optional[str], Optional[bool], int, + int, str, ] @@ -218,12 +224,18 @@ def get_node_connection( verify: Optional[bool], http_port: int, use_connect: bool, + max_pool_size: int = settings.CLICKHOUSE_MAX_POOL_SIZE, ) -> ClickhousePool: """ Return a cached connection pool for the node, typed as the abstract :class:`ClickhousePool`. When ``use_connect`` is set the clickhouse-connect (HTTP) pool is built, otherwise the native one. Both variants are cached side by side (the driver is part of the cache key). + + This is the single place pools are instantiated, so every caller — the + cluster query/node connections as well as the admin and CLI by-host + helpers — goes through it and gets the runtime-selected driver behind + the abstract :class:`ClickhousePool` type. """ with self.__lock: client_settings_dict, timeout = client_settings.value @@ -241,6 +253,7 @@ def get_node_connection( # two clusters sharing a node/credentials but using different # HTTP ports from colliding on the same cached pool. http_port, + max_pool_size, "http" if use_connect else "native", ) if cache_key not in self.__cache: @@ -261,6 +274,7 @@ def get_node_connection( secure=secure, ca_certs=ca_certs, verify=verify, + max_pool_size=max_pool_size, ) else: pool = ClickhouseNativePool( @@ -274,6 +288,7 @@ def get_node_connection( secure=secure, ca_certs=ca_certs, verify=verify, + max_pool_size=max_pool_size, ) self.__cache[cache_key] = pool diff --git a/tests/admin/test_system_queries.py b/tests/admin/test_system_queries.py index ba40adc5834..af989d30f38 100644 --- a/tests/admin/test_system_queries.py +++ b/tests/admin/test_system_queries.py @@ -299,12 +299,12 @@ def test_clusterless_rejects_unvalidated_host( helper_name: str, client_settings: ClickhouseClientSettings ) -> None: """ - Regression for EAP-488: the clusterless helpers used to construct a + Regression for EAP-488: the clusterless helpers used to acquire a ClickhousePool against any attacker-supplied host/port, which leaked the - configured ClickHouse user/password in the first hello packet of the - native protocol. Both helpers must now call _validate_node before - constructing the pool, so an invalid host produces InvalidNodeError and - no credentials ever leave the process. + configured ClickHouse user/password to the node (the native protocol's + first hello packet, or the HTTP auth header). Both helpers must now call + _validate_node before acquiring the pool, so an invalid host produces + InvalidNodeError and no credentials ever leave the process. """ from snuba.admin.clickhouse import common @@ -316,7 +316,7 @@ def test_clusterless_rejects_unvalidated_host( "_validate_node", side_effect=InvalidNodeError("host not in cluster"), ) as mock_validate, - patch.object(common, "ClickhouseNativePool") as mock_pool, + patch.object(common.connection_cache, "get_node_connection") as mock_pool, ): # Clear any cached connection for this storage so the cache lookup # can't short-circuit validation. @@ -326,9 +326,9 @@ def test_clusterless_rejects_unvalidated_host( with pytest.raises(InvalidNodeError): helper("attacker.example.com", 9009, "errors", client_settings) - assert mock_validate.called, "_validate_node must run before pool construction" + assert mock_validate.called, "_validate_node must run before pool acquisition" assert not mock_pool.called, ( - "ClickhousePool must not be constructed for an unvalidated host — " + "A pool must not be acquired for an unvalidated host — " "doing so would transmit ClickHouse credentials to the attacker" ) @@ -385,11 +385,24 @@ def test_sudo_mode_skips_experimental_analyzer(sql_query: str, sudo_mode: bool) ) +# Names that acquire a ClickhousePool (and therefore send credentials to a +# node): the connection cache accessor and direct construction of either pool +# implementation. Any of these in admin code must sit behind _validate_node. +_POOL_ACQUISITION_NAMES = frozenset( + { + "get_node_connection", + "ClickhouseNativePool", + "ClickhouseConnectPool", + } +) + + def _find_clickhouse_pool_calls(tree: ast.AST) -> list[tuple[ast.Call, list[str]]]: """ - Walks an AST and returns every `ClickhousePool(...)` call site, paired - with the chain of enclosing function names (outermost first) so the - regression guard below can assert *where* construction happens. + Walks an AST and returns every pool-acquisition call site (cache accessor + or direct pool construction), paired with the chain of enclosing function + names (outermost first) so the regression guard below can assert *where* + acquisition happens. """ results: list[tuple[ast.Call, list[str]]] = [] @@ -416,7 +429,7 @@ def visit_Call(self, node: ast.Call) -> None: if isinstance(func, ast.Attribute) else None ) - if name == "ClickhouseNativePool": + if name in _POOL_ACQUISITION_NAMES: results.append((node, list(self.scope))) self.generic_visit(node) @@ -426,19 +439,21 @@ def visit_Call(self, node: ast.Call) -> None: def test_no_direct_clickhouse_pool_construction_in_admin() -> None: """ - Defense-in-depth for EAP-488: ClickhousePool ships the configured - user/password in the first hello packet of the native protocol, so any - admin code path that constructs one against a caller-supplied host - leaks credentials to whatever listener answers. `_build_validated_pool` - in snuba/admin/clickhouse/common.py is the single chokepoint that runs - `_validate_node` first — every other admin module must go through it. - - This test enforces that structural invariant by AST-walking every - snuba/admin/**/*.py file: - - * common.py may only construct ClickhousePool from inside - `_build_validated_pool`. - * No other admin module may construct ClickhousePool at all. + Defense-in-depth for EAP-488: a ClickhousePool ships the configured + user/password to the node (the native protocol's first hello packet, or + the HTTP auth header), so any admin code path that acquires one against a + caller-supplied host leaks credentials to whatever listener answers. + `_build_validated_pool` in snuba/admin/clickhouse/common.py is the single + chokepoint that runs `_validate_node` first — every other admin module + must go through it. + + "Acquiring" a pool means either constructing one of the pool + implementations directly or fetching one from the shared connection cache + via `get_node_connection`. This test enforces that structural invariant by + AST-walking every snuba/admin/**/*.py file: + + * common.py may only acquire a pool from inside `_build_validated_pool`. + * No other admin module may acquire a pool at all. If this fails, a new caller has likely re-introduced the vulnerability. """ @@ -456,13 +471,13 @@ def test_no_direct_clickhouse_pool_construction_in_admin() -> None: if py_file == common_path: if scope != ["_build_validated_pool"]: offenders.append( - f"{location} constructs ClickhousePool inside " + f"{location} acquires a ClickhousePool inside " f"{'.'.join(scope) or ''} — must be inside " "_build_validated_pool so _validate_node runs first." ) else: offenders.append( - f"{location} constructs ClickhousePool directly — call " + f"{location} acquires a ClickhousePool directly — call " "_build_validated_pool (or a helper that wraps it) so " "_validate_node guards the host before credentials are sent." ) From 348135c5e31bdfda922b673759811bca2cbf2305 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 12:40:01 +0000 Subject: [PATCH 22/29] fix(clickhouse): Single runtime-sized connect pool; driver-aware admin cache Pool sizing is no longer a per-caller concern: the connect pool always takes its size from the clickhouse_connect_pool_size runtime config (falling back to CLICKHOUSE_MAX_POOL_SIZE), so it is tunable at runtime and there is just one shared pool per connection, instantiated by ConnectionCache. Drop the max_pool_size plumbing added to ConnectionCache and the admin pool-of-2 override, and remove the now-unused max_pool_size constructor arg from ClickhouseConnectPool. Make the admin connection caches (NODE_CONNECTIONS / CLUSTER_CONNECTIONS) driver-aware: now that admin pools are driver-selected, the cache keys must include the driver token, otherwise flipping use_clickhouse_connect_driver would keep returning a pool pinned to the previously cached driver until restart. Fixes the regression flagged on the admin helpers. querylog_to_csv reads CLICKHOUSE_HTTP_PORT (the same env the cluster config uses) instead of hardcoding the default HTTP port. Add a focused test asserting WITH TOTALS works over the HTTP driver: the connect pool surfaces the totals block as the trailing row and the driver-agnostic ClickhouseReader splits it out as totals. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_011MC5N7AYMhJDBkJ4P7engC --- snuba/admin/clickhouse/common.py | 25 +++++++++++++++-------- snuba/cli/querylog_to_csv.py | 7 +++++-- snuba/clickhouse/connect.py | 20 ++++++++++-------- snuba/clusters/cluster.py | 11 ++++------ tests/clickhouse/test_connect.py | 35 ++++++++++++++++++++++++++++++++ 5 files changed, 73 insertions(+), 25 deletions(-) diff --git a/snuba/admin/clickhouse/common.py b/snuba/admin/clickhouse/common.py index 1aa003d3259..066433055b4 100644 --- a/snuba/admin/clickhouse/common.py +++ b/snuba/admin/clickhouse/common.py @@ -117,10 +117,18 @@ def _build_validated_pool( verify=False, http_port=cluster.get_http_port(), use_connect=use_clickhouse_connect_driver(), - max_pool_size=2, ) +def _driver_cache_token() -> str: + # Part of the admin connection cache keys so that flipping the + # use_clickhouse_connect_driver runtime flag re-resolves admin connections + # to the new driver, instead of returning a pool pinned to whichever driver + # was active when the entry was first cached. This keeps admin traffic + # switchable at runtime, like the cluster query/reader paths. + return "connect" if use_clickhouse_connect_driver() else "native" + + def get_ro_node_connection( clickhouse_host: str, clickhouse_port: int, @@ -129,7 +137,7 @@ def get_ro_node_connection( ) -> ClickhousePool: storage = _get_storage(storage_name) - key = f"{storage.get_storage_key()}-{clickhouse_host}" + key = f"{storage.get_storage_key()}-{clickhouse_host}-{_driver_cache_token()}" if key in NODE_CONNECTIONS: return NODE_CONNECTIONS[key] @@ -176,8 +184,9 @@ def get_ro_node_connection( def get_ro_query_node_connection( storage_name: str, client_settings: ClickhouseClientSettings ) -> ClickhousePool: - if storage_name in CLUSTER_CONNECTIONS: - return CLUSTER_CONNECTIONS[storage_name] + key = f"{storage_name}-{_driver_cache_token()}" + if key in CLUSTER_CONNECTIONS: + return CLUSTER_CONNECTIONS[key] storage = _get_storage(storage_name) cluster = storage.get_cluster() @@ -186,7 +195,7 @@ def get_ro_query_node_connection( connection_id.hostname, connection_id.tcp_port, storage_name, client_settings ) - CLUSTER_CONNECTIONS[storage_name] = connection + CLUSTER_CONNECTIONS[key] = connection return connection @@ -198,7 +207,7 @@ def get_sudo_node_connection( ) -> ClickhousePool: storage = _get_storage(storage_name) - key = f"{storage.get_storage_key()}-{clickhouse_host}-sudo" + key = f"{storage.get_storage_key()}-{clickhouse_host}-sudo-{_driver_cache_token()}" if key in NODE_CONNECTIONS: return NODE_CONNECTIONS[key] @@ -230,7 +239,7 @@ def get_clusterless_node_connection( cluster = storage.get_cluster() database = cluster.get_database() - key = f"{storage.get_storage_key()}-{clickhouse_host}-clusterless-{database}" + key = f"{storage.get_storage_key()}-{clickhouse_host}-clusterless-{database}-{_driver_cache_token()}" if key in NODE_CONNECTIONS: return NODE_CONNECTIONS[key] @@ -259,7 +268,7 @@ def get_ro_clusterless_node_connection( cluster = storage.get_cluster() database = cluster.get_database() - key = f"{storage.get_storage_key()}-{clickhouse_host}-clusterless-ro-{database}" + key = f"{storage.get_storage_key()}-{clickhouse_host}-clusterless-ro-{database}-{_driver_cache_token()}" if key in NODE_CONNECTIONS: return NODE_CONNECTIONS[key] diff --git a/snuba/cli/querylog_to_csv.py b/snuba/cli/querylog_to_csv.py index 673df73a742..565d0d3a721 100644 --- a/snuba/cli/querylog_to_csv.py +++ b/snuba/cli/querylog_to_csv.py @@ -1,4 +1,5 @@ import csv +import os from datetime import datetime from typing import NamedTuple, Optional, Sequence, Tuple @@ -168,7 +169,9 @@ def querylog_to_csv( # Go through the shared connection cache so the driver (native vs # clickhouse-connect/HTTP) is selected by the runtime config, behind the # abstract ClickhousePool type. There is no cluster here to read an - # http_port from, so fall back to the well-known default. + # http_port from, so use the configured CLICKHOUSE_HTTP_PORT (the same env + # var the cluster config reads), defaulting to the well-known port. + http_port = int(os.environ.get("CLICKHOUSE_HTTP_PORT", DEFAULT_CLICKHOUSE_HTTP_PORT)) connection = connection_cache.get_node_connection( ClickhouseClientSettings.QUERY, ClickhouseNode(clickhouse_host, clickhouse_port), @@ -178,7 +181,7 @@ def querylog_to_csv( secure=False, ca_certs=None, verify=False, - http_port=DEFAULT_CLICKHOUSE_HTTP_PORT, + http_port=http_port, use_connect=use_clickhouse_connect_driver(), ) results = connection.execute(query) diff --git a/snuba/clickhouse/connect.py b/snuba/clickhouse/connect.py index 3046ce10b88..f312cee9ddd 100644 --- a/snuba/clickhouse/connect.py +++ b/snuba/clickhouse/connect.py @@ -69,12 +69,14 @@ def __init__( verify: Optional[bool] = False, connect_timeout: int = 1, send_receive_timeout: Optional[int] = 35, - max_pool_size: int = settings.CLICKHOUSE_MAX_POOL_SIZE, client_settings: Mapping[str, Any] = {}, ) -> None: # No native connection queue here; clickhouse-connect manages its own # HTTP pool. ``port`` is the abstract base attribute (it holds the - # cluster's configured HTTP port for this driver). + # cluster's configured HTTP port for this driver). The pool size is not + # a construction parameter: it is always taken from the + # ``clickhouse_connect_pool_size`` runtime config (see _get_client), so + # it can be tuned at runtime without rebuilding pools. self.host = host self.port = http_port self.user = user @@ -85,7 +87,6 @@ def __init__( self.verify = verify self.connect_timeout = connect_timeout self.send_receive_timeout = send_receive_timeout - self.max_pool_size = max_pool_size self.client_settings = client_settings self.__client: Optional[Client] = None @@ -97,12 +98,15 @@ def _get_client(self) -> Client: if self.__client is None: with self.__lock: if self.__client is None: - # Default to the configured CLICKHOUSE_MAX_POOL_SIZE, but - # allow it to be overridden at runtime. The value is read - # once when the (cached) client is first created. + # Pool size always comes from the clickhouse_connect_pool_size + # runtime config, falling back to the configured + # CLICKHOUSE_MAX_POOL_SIZE. The value is read once, when the + # (cached) client is first created. pool_size = ( - state.get_int_config("clickhouse_connect_pool_size", self.max_pool_size) - or self.max_pool_size + state.get_int_config( + "clickhouse_connect_pool_size", settings.CLICKHOUSE_MAX_POOL_SIZE + ) + or settings.CLICKHOUSE_MAX_POOL_SIZE ) pool_mgr = get_pool_manager( ca_cert=self.ca_certs, diff --git a/snuba/clusters/cluster.py b/snuba/clusters/cluster.py index 1b41aeb6d13..9d17305ccd7 100644 --- a/snuba/clusters/cluster.py +++ b/snuba/clusters/cluster.py @@ -202,7 +202,6 @@ def use_clickhouse_connect_driver() -> bool: Optional[str], Optional[bool], int, - int, str, ] @@ -224,7 +223,6 @@ def get_node_connection( verify: Optional[bool], http_port: int, use_connect: bool, - max_pool_size: int = settings.CLICKHOUSE_MAX_POOL_SIZE, ) -> ClickhousePool: """ Return a cached connection pool for the node, typed as the abstract @@ -234,8 +232,10 @@ def get_node_connection( This is the single place pools are instantiated, so every caller — the cluster query/node connections as well as the admin and CLI by-host - helpers — goes through it and gets the runtime-selected driver behind - the abstract :class:`ClickhousePool` type. + helpers — goes through it and gets one shared, runtime-selected pool + behind the abstract :class:`ClickhousePool` type. Pool sizing is left to + the pools themselves (the connect pool reads the + ``clickhouse_connect_pool_size`` runtime config). """ with self.__lock: client_settings_dict, timeout = client_settings.value @@ -253,7 +253,6 @@ def get_node_connection( # two clusters sharing a node/credentials but using different # HTTP ports from colliding on the same cached pool. http_port, - max_pool_size, "http" if use_connect else "native", ) if cache_key not in self.__cache: @@ -274,7 +273,6 @@ def get_node_connection( secure=secure, ca_certs=ca_certs, verify=verify, - max_pool_size=max_pool_size, ) else: pool = ClickhouseNativePool( @@ -288,7 +286,6 @@ def get_node_connection( secure=secure, ca_certs=ca_certs, verify=verify, - max_pool_size=max_pool_size, ) self.__cache[cache_key] = pool diff --git a/tests/clickhouse/test_connect.py b/tests/clickhouse/test_connect.py index dcedb0b2a67..564b52c2421 100644 --- a/tests/clickhouse/test_connect.py +++ b/tests/clickhouse/test_connect.py @@ -226,3 +226,38 @@ def test_clickhouse_reader_wraps_connect_pool() -> None: pool = _make_pool(mock.Mock()) reader = ClickhouseReader(cache_partition_id=None, client=pool, query_settings_prefix=None) assert isinstance(reader, ClickhouseReader) + + +def test_with_totals_handled_over_http() -> None: + # WITH TOTALS works on the HTTP driver through clickhouse-connect's own + # parsing: its Native-format reader concatenates every response block, + # including the trailing totals block, into result_set, so the totals row + # arrives last — exactly what the driver-agnostic ClickhouseReader expects + # when it pops the last row as "totals". No native-driver-specific handling + # is involved. + from snuba.clickhouse.native import ClickhouseReader + + class FakeFormattedQuery: + def get_sql(self) -> str: + return "SELECT project_id, count() FROM t GROUP BY project_id WITH TOTALS" + + client = mock.Mock() + # Two data rows followed by the totals row, the way clickhouse-connect + # surfaces a WITH TOTALS response over HTTP. + client.query.return_value = FakeQueryResult( + result_set=[[1, 10], [2, 20], [0, 30]], + column_names=("project_id", "count()"), + column_types=(FakeColumnType("UInt64"), FakeColumnType("UInt64")), + ) + + pool = _make_pool(client) + reader = ClickhouseReader(cache_partition_id=None, client=pool, query_settings_prefix=None) + + result = reader.execute(FakeFormattedQuery(), with_totals=True) + + # The trailing row is split out as totals; only the real rows remain in data. + assert result["data"] == [ + {"project_id": 1, "count()": 10}, + {"project_id": 2, "count()": 20}, + ] + assert result["totals"] == {"project_id": 0, "count()": 30} From b1445167def88bc2042b60b873c65a4d581f076f Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 12:47:06 +0000 Subject: [PATCH 23/29] refactor(clickhouse): Decide driver inside ConnectionCache; lock pool close ConnectionCache.get_node_connection no longer takes a use_connect parameter: it reads the use_clickhouse_connect_driver runtime flag itself, so the driver decision lives in exactly one place. Callers (cluster, admin helper, the cleanup/optimize/querylog CLIs) drop the use_connect=use_clickhouse_connect_driver() boilerplate, and the now-dead use_connect override param is removed from the test FakeClickhouseCluster. Also take the pool's lock in ClickhouseConnectPool.close() so teardown can't race with the lazy client init in _get_client(). Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_011MC5N7AYMhJDBkJ4P7engC --- snuba/admin/clickhouse/common.py | 1 - snuba/cli/cleanup.py | 7 +------ snuba/cli/optimize.py | 7 +------ snuba/cli/querylog_to_csv.py | 2 -- snuba/clickhouse/connect.py | 10 +++++++--- snuba/clusters/cluster.py | 28 ++++++++++++++-------------- tests/clusters/fake_cluster.py | 1 - 7 files changed, 23 insertions(+), 33 deletions(-) diff --git a/snuba/admin/clickhouse/common.py b/snuba/admin/clickhouse/common.py index 066433055b4..33a2e826c84 100644 --- a/snuba/admin/clickhouse/common.py +++ b/snuba/admin/clickhouse/common.py @@ -116,7 +116,6 @@ def _build_validated_pool( ca_certs=None, verify=False, http_port=cluster.get_http_port(), - use_connect=use_clickhouse_connect_driver(), ) diff --git a/snuba/cli/cleanup.py b/snuba/cli/cleanup.py index 3157b4fd6ba..efbad46508b 100644 --- a/snuba/cli/cleanup.py +++ b/snuba/cli/cleanup.py @@ -70,11 +70,7 @@ def cleanup( from snuba.cleanup import logger, run_cleanup from snuba.clickhouse.native import ClickhousePool - from snuba.clusters.cluster import ( - ClickhouseNode, - connection_cache, - use_clickhouse_connect_driver, - ) + from snuba.clusters.cluster import ClickhouseNode, connection_cache storage = get_writable_storage(StorageKey(storage_name)) @@ -101,7 +97,6 @@ def cleanup( ca_certs=clickhouse_ca_certs, verify=clickhouse_verify, http_port=cluster.get_http_port(), - use_connect=use_clickhouse_connect_driver(), ) elif not cluster.is_single_node(): raise click.ClickException("Provide ClickHouse host and port for cleanup") diff --git a/snuba/cli/optimize.py b/snuba/cli/optimize.py index 78a38d5bc47..dbbd0311415 100644 --- a/snuba/cli/optimize.py +++ b/snuba/cli/optimize.py @@ -80,11 +80,7 @@ def optimize( from snuba.clickhouse.native import ClickhousePool from snuba.clickhouse.optimize.optimize import logger - from snuba.clusters.cluster import ( - ClickhouseNode, - connection_cache, - use_clickhouse_connect_driver, - ) + from snuba.clusters.cluster import ClickhouseNode, connection_cache setup_logging(log_level) setup_sentry() @@ -121,7 +117,6 @@ def optimize( ca_certs=clickhouse_ca_certs, verify=clickhouse_verify, http_port=storage.get_cluster().get_http_port(), - use_connect=use_clickhouse_connect_driver(), ) elif not storage.get_cluster().is_single_node(): raise click.ClickException("Provide Clickhouse host and port for optimize") diff --git a/snuba/cli/querylog_to_csv.py b/snuba/cli/querylog_to_csv.py index 565d0d3a721..0db0176f59a 100644 --- a/snuba/cli/querylog_to_csv.py +++ b/snuba/cli/querylog_to_csv.py @@ -13,7 +13,6 @@ ClickhouseClientSettings, ClickhouseNode, connection_cache, - use_clickhouse_connect_driver, ) from snuba.environment import setup_logging, setup_sentry @@ -182,7 +181,6 @@ def querylog_to_csv( ca_certs=None, verify=False, http_port=http_port, - use_connect=use_clickhouse_connect_driver(), ) results = connection.execute(query) filename = format_filename(table) diff --git a/snuba/clickhouse/connect.py b/snuba/clickhouse/connect.py index f312cee9ddd..8484e92f5ae 100644 --- a/snuba/clickhouse/connect.py +++ b/snuba/clickhouse/connect.py @@ -315,6 +315,10 @@ def execute_robust( ) def close(self) -> None: - if self.__client is not None: - self.__client.close() - self.__client = None + # Take the same lock _get_client uses so a concurrent lazy init can't + # race with teardown (one thread closing the client while another is + # creating or about to use it). + with self.__lock: + if self.__client is not None: + self.__client.close() + self.__client = None diff --git a/snuba/clusters/cluster.py b/snuba/clusters/cluster.py index 9d17305ccd7..5628dc2a303 100644 --- a/snuba/clusters/cluster.py +++ b/snuba/clusters/cluster.py @@ -222,21 +222,23 @@ def get_node_connection( ca_certs: Optional[str], verify: Optional[bool], http_port: int, - use_connect: bool, ) -> ClickhousePool: """ Return a cached connection pool for the node, typed as the abstract - :class:`ClickhousePool`. When ``use_connect`` is set the + :class:`ClickhousePool`. The driver is decided here, from the + ``use_clickhouse_connect_driver`` runtime config: when it is enabled the clickhouse-connect (HTTP) pool is built, otherwise the native one. Both variants are cached side by side (the driver is part of the cache key). - This is the single place pools are instantiated, so every caller — the - cluster query/node connections as well as the admin and CLI by-host - helpers — goes through it and gets one shared, runtime-selected pool - behind the abstract :class:`ClickhousePool` type. Pool sizing is left to - the pools themselves (the connect pool reads the - ``clickhouse_connect_pool_size`` runtime config). + This is the single place pools are instantiated and the single place the + driver is selected, so every caller — the cluster query/node connections + as well as the admin and CLI by-host helpers — goes through it and gets + one shared, runtime-selected pool behind the abstract + :class:`ClickhousePool` type. Pool sizing is left to the pools themselves + (the connect pool reads the ``clickhouse_connect_pool_size`` runtime + config). """ + use_connect = use_clickhouse_connect_driver() with self.__lock: client_settings_dict, timeout = client_settings.value cache_key = ( @@ -387,11 +389,10 @@ def get_node_connection( connection to the same node with the same settings otherwise establish a new connection. - This is the single place where the driver is selected: when the - ``use_clickhouse_connect_driver`` runtime config is enabled the - clickhouse-connect (HTTP) pool is returned, otherwise the native pool. - The choice applies to every caller (reads, migrations, replacer, - optimize, ...), not just the read path. + The driver is selected inside ``ConnectionCache.get_node_connection`` + from the ``use_clickhouse_connect_driver`` runtime config (HTTP pool + when enabled, native otherwise). The choice applies to every caller + (reads, migrations, replacer, optimize, ...), not just the read path. """ return self.__connection_cache.get_node_connection( client_settings, @@ -403,7 +404,6 @@ def get_node_connection( self.__ca_certs, self.__verify, http_port=self.__http_port, - use_connect=use_clickhouse_connect_driver(), ) def get_deleter(self) -> Reader: diff --git a/tests/clusters/fake_cluster.py b/tests/clusters/fake_cluster.py index aaf92f01952..16b09404999 100644 --- a/tests/clusters/fake_cluster.py +++ b/tests/clusters/fake_cluster.py @@ -128,7 +128,6 @@ def get_node_connection( self, client_settings: ClickhouseClientSettings, node: ClickhouseNode, - use_connect: Optional[bool] = None, ) -> ClickhouseNativePool: settings, timeout = client_settings.value cache_key = (node, client_settings) From 4d788a1eccc1849aba3accb461b3594febbe65d3 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 12:52:38 +0000 Subject: [PATCH 24/29] test(clickhouse): Pin connect type-name compatibility with reader transforms clickhouse-connect's column_type.name returns the canonical ClickHouse type strings (identical to the native driver), so the reader's Date/DateTime/UUID regex transforms match identically on the HTTP path. Add an end-to-end test through the real connect pool + ClickhouseReader using clickhouse-connect's actual type names (including parametrized DateTime('UTC') and Nullable(UUID)) asserting the values are transformed, so a future format change would be caught instead of silently skipping transformations. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_011MC5N7AYMhJDBkJ4P7engC --- tests/clickhouse/test_connect.py | 64 ++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/tests/clickhouse/test_connect.py b/tests/clickhouse/test_connect.py index 564b52c2421..68f2f21b83f 100644 --- a/tests/clickhouse/test_connect.py +++ b/tests/clickhouse/test_connect.py @@ -261,3 +261,67 @@ def get_sql(self) -> str: {"project_id": 2, "count()": 20}, ] assert result["totals"] == {"project_id": 0, "count()": 30} + + +def test_connect_type_names_drive_reader_transforms() -> None: + # The connect pool exposes clickhouse-connect's column_type.name in the + # result meta, and the driver-agnostic ClickhouseReader runs that through + # the same Date / DateTime / UUID regex transforms used for the native + # driver. This pins that clickhouse-connect's type-name format matches what + # those regexes expect (including parametrized types like DateTime('UTC') + # and Nullable(UUID)), so transformations are not silently skipped on the + # HTTP path. + from datetime import date, datetime + from uuid import UUID as UUIDClass + + from clickhouse_connect.datatypes.registry import get_from_name + + from snuba.clickhouse.native import ClickhouseReader + + class FakeFormattedQuery: + def get_sql(self) -> str: + return "SELECT d, dt, dt_tz, uid, nuid" + + # Column types named exactly as clickhouse-connect produces them. The + # assertion documents that clickhouse-connect uses the canonical ClickHouse + # type strings (the same the native driver returns), so the reader regexes + # match identically for both drivers. + col_types = [ + get_from_name("Date"), + get_from_name("DateTime"), + get_from_name("DateTime('UTC')"), + get_from_name("UUID"), + get_from_name("Nullable(UUID)"), + ] + assert [c.name for c in col_types] == [ + "Date", + "DateTime", + "DateTime('UTC')", + "UUID", + "Nullable(UUID)", + ] + + d = date(2023, 1, 2) + dt = datetime(2023, 1, 2, 3, 4, 5) + uid = UUIDClass("00000000-0000-0000-0000-000000000001") + + client = mock.Mock() + client.query.return_value = FakeQueryResult( + result_set=[[d, dt, dt, uid, uid]], + column_names=("d", "dt", "dt_tz", "uid", "nuid"), + column_types=tuple(col_types), + ) + + pool = _make_pool(client) + reader = ClickhouseReader(cache_partition_id=None, client=pool, query_settings_prefix=None) + result = reader.execute(FakeFormattedQuery()) + + # Date/DateTime (incl. the parametrized tz variant) become ISO strings and + # UUID (incl. Nullable) becomes a string. Had the regexes failed to match + # the connect type names, these would still be the original objects. + row = result["data"][0] + assert row["d"] == "2023-01-02T00:00:00+00:00" + assert row["dt"] == "2023-01-02T03:04:05+00:00" + assert row["dt_tz"] == "2023-01-02T03:04:05+00:00" + assert row["uid"] == "00000000-0000-0000-0000-000000000001" + assert row["nuid"] == "00000000-0000-0000-0000-000000000001" From 3f7d05c30707a6c830145a8fddaa0e5706c66510 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 13:21:12 +0000 Subject: [PATCH 25/29] refactor(clickhouse): Make ClickhouseNode carry native_port and http_port Rename ClickhouseNode.port to native_port and add an optional http_port, so a node is self-describing for either driver. The connection cache no longer takes a separate http_port argument: it reads node.native_port for the native pool and node.http_port for the connect pool, and the node (which now includes http_port in its identity) subsumes the previous standalone http_port cache-key element. Cluster-produced nodes (the query node and those discovered via system.clusters, which only reports the native port) are stamped with the cluster's http_port. The admin/CLI by-host helpers pass http_port when constructing their node. Nodes built outside a cluster (tests, the replacer load balancer) leave http_port as None since they never open HTTP connections. Field accesses of node.port are updated to node.native_port across the cluster, admin, and migrations code; positional ClickhouseNode(...) constructions are unaffected by the rename. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_011MC5N7AYMhJDBkJ4P7engC --- snuba/admin/clickhouse/common.py | 7 ++-- snuba/admin/clickhouse/nodes.py | 4 +-- snuba/cli/cleanup.py | 5 +-- snuba/cli/optimize.py | 7 ++-- snuba/cli/querylog_to_csv.py | 3 +- snuba/clusters/cluster.py | 53 +++++++++++++++++++---------- snuba/migrations/check_dangerous.py | 4 ++- snuba/migrations/connect.py | 10 +++--- tests/clusters/test_cluster.py | 2 +- 9 files changed, 58 insertions(+), 37 deletions(-) diff --git a/snuba/admin/clickhouse/common.py b/snuba/admin/clickhouse/common.py index 33a2e826c84..d74ee6ea715 100644 --- a/snuba/admin/clickhouse/common.py +++ b/snuba/admin/clickhouse/common.py @@ -49,7 +49,7 @@ def is_valid_node(host: str, port: int, cluster: ClickhouseCluster, storage_name }, ) - return any(node.host_name == host and node.port == port for node in nodes) + return any(node.host_name == host and node.native_port == port for node in nodes) def _get_storage(storage_name: str) -> ReadableTableStorage: @@ -77,7 +77,7 @@ def _validate_node( "host": clickhouse_host, "port": clickhouse_port, "query_host": cluster.get_query_node().host_name, - "query_port": cluster.get_query_node().port, + "query_port": cluster.get_query_node().native_port, }, ) @@ -108,14 +108,13 @@ def _build_validated_pool( # abstract ClickhousePool type, just like the cluster's own connections. return connection_cache.get_node_connection( client_settings, - ClickhouseNode(clickhouse_host, clickhouse_port), + ClickhouseNode(clickhouse_host, clickhouse_port, http_port=cluster.get_http_port()), username, password, database, secure=False, ca_certs=None, verify=False, - http_port=cluster.get_http_port(), ) diff --git a/snuba/admin/clickhouse/nodes.py b/snuba/admin/clickhouse/nodes.py index 59a2f4b8536..9dd252ac735 100644 --- a/snuba/admin/clickhouse/nodes.py +++ b/snuba/admin/clickhouse/nodes.py @@ -48,7 +48,7 @@ def _get_nodes(storage_key: StorageKey, local: bool = True) -> Sequence[Node]: return [] else: return [ - {"host": node.host_name, "port": node.port} + {"host": node.host_name, "port": node.native_port} for node in ( cluster.get_local_nodes() if local else cluster.get_distributed_nodes() ) @@ -62,7 +62,7 @@ def _get_query_node(storage_key: StorageKey) -> Optional[Node]: try: cluster = get_storage(storage_key).get_cluster() query_node = cluster.get_query_node() - return {"host": query_node.host_name, "port": query_node.port} + return {"host": query_node.host_name, "port": query_node.native_port} except (AssertionError, KeyError, UndefinedClickhouseCluster): return None diff --git a/snuba/cli/cleanup.py b/snuba/cli/cleanup.py index efbad46508b..bd3d7f7d0c8 100644 --- a/snuba/cli/cleanup.py +++ b/snuba/cli/cleanup.py @@ -89,14 +89,15 @@ def cleanup( # the abstract ClickhousePool type. connection = connection_cache.get_node_connection( ClickhouseClientSettings.CLEANUP, - ClickhouseNode(clickhouse_host, clickhouse_port), + ClickhouseNode( + clickhouse_host, clickhouse_port, http_port=cluster.get_http_port() + ), clickhouse_user, clickhouse_password, database, secure=clickhouse_secure, ca_certs=clickhouse_ca_certs, verify=clickhouse_verify, - http_port=cluster.get_http_port(), ) elif not cluster.is_single_node(): raise click.ClickException("Provide ClickHouse host and port for cleanup") diff --git a/snuba/cli/optimize.py b/snuba/cli/optimize.py index dbbd0311415..4d42453d90f 100644 --- a/snuba/cli/optimize.py +++ b/snuba/cli/optimize.py @@ -109,14 +109,17 @@ def optimize( # the client settings profile the cache reads. connection = connection_cache.get_node_connection( ClickhouseClientSettings.OPTIMIZE, - ClickhouseNode(clickhouse_host, clickhouse_port), + ClickhouseNode( + clickhouse_host, + clickhouse_port, + http_port=storage.get_cluster().get_http_port(), + ), clickhouse_user, clickhouse_password, database, secure=clickhouse_secure, ca_certs=clickhouse_ca_certs, verify=clickhouse_verify, - http_port=storage.get_cluster().get_http_port(), ) elif not storage.get_cluster().is_single_node(): raise click.ClickException("Provide Clickhouse host and port for optimize") diff --git a/snuba/cli/querylog_to_csv.py b/snuba/cli/querylog_to_csv.py index 0db0176f59a..3d7c5ea43c5 100644 --- a/snuba/cli/querylog_to_csv.py +++ b/snuba/cli/querylog_to_csv.py @@ -173,14 +173,13 @@ def querylog_to_csv( http_port = int(os.environ.get("CLICKHOUSE_HTTP_PORT", DEFAULT_CLICKHOUSE_HTTP_PORT)) connection = connection_cache.get_node_connection( ClickhouseClientSettings.QUERY, - ClickhouseNode(clickhouse_host, clickhouse_port), + ClickhouseNode(clickhouse_host, clickhouse_port, http_port=http_port), clickhouse_user, clickhouse_password, database, secure=False, ca_certs=None, verify=False, - http_port=http_port, ) results = connection.execute(query) filename = format_filename(table) diff --git a/snuba/clusters/cluster.py b/snuba/clusters/cluster.py index 5628dc2a303..f2bb8d40f3c 100644 --- a/snuba/clusters/cluster.py +++ b/snuba/clusters/cluster.py @@ -114,12 +114,17 @@ class ClickhouseClientSettings(Enum): @dataclass(frozen=True) class ClickhouseNode: host_name: str - port: int + native_port: int shard: Optional[int] = None replica: Optional[int] = None + # The node's HTTP port, used by the clickhouse-connect (HTTP) driver. It is + # optional because nodes built outside a cluster context (e.g. in tests, or + # the replacer's load balancer, which never open HTTP connections) do not + # need one. Cluster-produced nodes always carry the cluster's HTTP port. + http_port: Optional[int] = None def __str__(self) -> str: - return f"{self.host_name}:{self.port}" + return f"{self.host_name}:{self.native_port}" class ClickhouseNodeType(Enum): @@ -191,7 +196,9 @@ def use_clickhouse_connect_driver() -> bool: # The driver discriminator is part of the cache key so that the native and the -# HTTP pool for the same node can be cached side by side. +# HTTP pool for the same node can be cached side by side. The node's HTTP port +# is part of ``ClickhouseNode`` itself, so it does not need a separate key +# element. CacheKey = Tuple[ ClickhouseNode, ClickhouseClientSettings, @@ -201,7 +208,6 @@ def use_clickhouse_connect_driver() -> bool: bool, Optional[str], Optional[bool], - int, str, ] @@ -221,14 +227,15 @@ def get_node_connection( secure: bool, ca_certs: Optional[str], verify: Optional[bool], - http_port: int, ) -> ClickhousePool: """ Return a cached connection pool for the node, typed as the abstract :class:`ClickhousePool`. The driver is decided here, from the ``use_clickhouse_connect_driver`` runtime config: when it is enabled the - clickhouse-connect (HTTP) pool is built, otherwise the native one. Both - variants are cached side by side (the driver is part of the cache key). + clickhouse-connect (HTTP) pool is built (connecting on the node's + ``http_port``), otherwise the native one (connecting on the node's + ``native_port``). Both variants are cached side by side (the driver is + part of the cache key). This is the single place pools are instantiated and the single place the driver is selected, so every caller — the cluster query/node connections @@ -250,11 +257,6 @@ def get_node_connection( secure, ca_certs, verify, - # The HTTP pool connects on ``http_port`` rather than the - # native ``node.port``, so it must be part of the key to avoid - # two clusters sharing a node/credentials but using different - # HTTP ports from colliding on the same cached pool. - http_port, "http" if use_connect else "native", ) if cache_key not in self.__cache: @@ -266,7 +268,14 @@ def get_node_connection( pool = ClickhouseConnectPool( host=node.host_name, - http_port=http_port, + # Fall back to the default HTTP port only for nodes that + # were built without one (e.g. by-host helpers that have + # no cluster http_port to draw on). + http_port=( + node.http_port + if node.http_port is not None + else DEFAULT_CLICKHOUSE_HTTP_PORT + ), user=user, password=password, database=database, @@ -279,7 +288,7 @@ def get_node_connection( else: pool = ClickhouseNativePool( node.host_name, - node.port, + node.native_port, user, password, database, @@ -343,7 +352,7 @@ def __init__( self.__port = port self.__max_connections = max_connections or _DEFAULT_MAX_CONNECTIONS self.__block_connections = block_connections - self.__query_node = ClickhouseNode(host, port) + self.__query_node = ClickhouseNode(host, port, http_port=http_port) self.__user = user self.__password = password self.__database = database @@ -403,7 +412,6 @@ def get_node_connection( self.__secure, self.__ca_certs, self.__verify, - http_port=self.__http_port, ) def get_deleter(self) -> Reader: @@ -503,14 +511,23 @@ def get_distributed_nodes(self) -> Sequence[ClickhouseNode]: def get_connection_id(self) -> ConnectionId: return ConnectionId( hostname=self.__query_node.host_name, - tcp_port=self.__query_node.port, + tcp_port=self.__query_node.native_port, http_port=self.__http_port, database_name=self.__database, ) def __get_cluster_nodes(self, cluster_name: str) -> Sequence[ClickhouseNode]: + # system.clusters only reports the native port; every node in the + # cluster shares the cluster's configured HTTP port, so stamp it on each + # discovered node so it is self-describing for the HTTP driver. return [ - ClickhouseNode(*host) + ClickhouseNode( + host_name=host[0], + native_port=host[1], + shard=host[2], + replica=host[3], + http_port=self.__http_port, + ) for host in self.get_query_connection(ClickhouseClientSettings.QUERY) .execute( "select host_name, port, shard_num, replica_num from system.clusters where cluster=%(cluster_name)s", diff --git a/snuba/migrations/check_dangerous.py b/snuba/migrations/check_dangerous.py index 2436d70910c..c45b608ebde 100644 --- a/snuba/migrations/check_dangerous.py +++ b/snuba/migrations/check_dangerous.py @@ -27,7 +27,9 @@ def check_dangerous_operation(op: SqlOperation, columns_state: ColumnStatesMapTy table = op.get_table_name() nodes = op.get_nodes() for node in nodes: - old_type = columns_state.get((node.host_name, node.port, table, col.name), None) + old_type = columns_state.get( + (node.host_name, node.native_port, table, col.name), None + ) if old_type: _check_dangerous(old_type, col.type) except DangerousOperationError as err: diff --git a/snuba/migrations/connect.py b/snuba/migrations/connect.py index 05044c68a28..901eda46219 100644 --- a/snuba/migrations/connect.py +++ b/snuba/migrations/connect.py @@ -161,9 +161,9 @@ def check_for_inactive_replicas(clusters: List[ClickhouseCluster]) -> None: inactive_replica_info = [] for cluster in clusters: for node in cluster.get_local_nodes(): - if (node.host_name, node.port) in checked_nodes: + if (node.host_name, node.native_port) in checked_nodes: continue - checked_nodes.add((node.host_name, node.port)) + checked_nodes.add((node.host_name, node.native_port)) conn = cluster.get_node_connection(ClickhouseClientSettings.MIGRATE, node) tables_with_inactive = conn.execute( f"SELECT table, total_replicas, active_replicas FROM system.replicas " @@ -197,9 +197,9 @@ def get_column_states() -> ColumnStatesMapType: except UndefinedClickhouseCluster: continue for node in (*local_nodes, *distributed_nodes, query_node): - if (node.host_name, node.port) in checked_nodes: + if (node.host_name, node.native_port) in checked_nodes: continue - checked_nodes.add((node.host_name, node.port)) + checked_nodes.add((node.host_name, node.native_port)) conn = cluster.get_node_connection(ClickhouseClientSettings.MIGRATE, node) column_types = conn.execute( @@ -208,6 +208,6 @@ def get_column_states() -> ColumnStatesMapType: for row in column_types: table, col_name, type = row - column_states[(node.host_name, node.port, table, col_name)] = type + column_states[(node.host_name, node.native_port, table, col_name)] = type return column_states diff --git a/tests/clusters/test_cluster.py b/tests/clusters/test_cluster.py index 8dfc53a376b..35ee5815fd6 100644 --- a/tests/clusters/test_cluster.py +++ b/tests/clusters/test_cluster.py @@ -183,7 +183,7 @@ def test_get_local_nodes() -> None: local_cluster = get_storage(StorageKey("errors")).get_cluster() assert len(local_cluster.get_local_nodes()) == 1 assert local_cluster.get_local_nodes()[0].host_name == "host_1" - assert local_cluster.get_local_nodes()[0].port == 9000 + assert local_cluster.get_local_nodes()[0].native_port == 9000 assert local_cluster.get_local_nodes()[0].shard is None assert local_cluster.get_local_nodes()[0].replica is None From 58ac33046fded8fffd0b6d466472375a17d3b075 Mon Sep 17 00:00:00 2001 From: "getsantry[bot]" <66042841+getsantry[bot]@users.noreply.github.com> Date: Thu, 18 Jun 2026 13:34:07 +0000 Subject: [PATCH 26/29] [getsentry/action-github-commit] Auto commit --- snuba/cli/cleanup.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/snuba/cli/cleanup.py b/snuba/cli/cleanup.py index bd3d7f7d0c8..4616f2ac425 100644 --- a/snuba/cli/cleanup.py +++ b/snuba/cli/cleanup.py @@ -89,9 +89,7 @@ def cleanup( # the abstract ClickhousePool type. connection = connection_cache.get_node_connection( ClickhouseClientSettings.CLEANUP, - ClickhouseNode( - clickhouse_host, clickhouse_port, http_port=cluster.get_http_port() - ), + ClickhouseNode(clickhouse_host, clickhouse_port, http_port=cluster.get_http_port()), clickhouse_user, clickhouse_password, database, From d08030028de4602aed0e4b2b4d85182f4a3612e4 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 13:38:26 +0000 Subject: [PATCH 27/29] fix(tests): Satisfy strict mypy in connect/admin test files Pre-commit mypy (no-implicit-reexport, full test coverage) flagged two issues the source-only runs missed: - test_connect.py: cast the FakeFormattedQuery stubs to FormattedQuery when calling ClickhouseReader.execute. - test_system_queries.py: import the connection_cache singleton from snuba.clusters.cluster instead of reaching through snuba.admin.clickhouse.common (which doesn't re-export it); patching the method on the shared object still covers the reference bound inside common. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_011MC5N7AYMhJDBkJ4P7engC --- tests/admin/test_system_queries.py | 5 ++++- tests/clickhouse/test_connect.py | 7 ++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/admin/test_system_queries.py b/tests/admin/test_system_queries.py index af989d30f38..772a3a9342e 100644 --- a/tests/admin/test_system_queries.py +++ b/tests/admin/test_system_queries.py @@ -307,6 +307,7 @@ def test_clusterless_rejects_unvalidated_host( InvalidNodeError and no credentials ever leave the process. """ from snuba.admin.clickhouse import common + from snuba.clusters.cluster import connection_cache helper = getattr(common, helper_name) @@ -316,7 +317,9 @@ def test_clusterless_rejects_unvalidated_host( "_validate_node", side_effect=InvalidNodeError("host not in cluster"), ) as mock_validate, - patch.object(common.connection_cache, "get_node_connection") as mock_pool, + # connection_cache is the shared singleton; patching the method on the + # object affects the reference bound inside common as well. + patch.object(connection_cache, "get_node_connection") as mock_pool, ): # Clear any cached connection for this storage so the cache lookup # can't short-circuit validation. diff --git a/tests/clickhouse/test_connect.py b/tests/clickhouse/test_connect.py index 68f2f21b83f..aeebc6c93a5 100644 --- a/tests/clickhouse/test_connect.py +++ b/tests/clickhouse/test_connect.py @@ -1,10 +1,11 @@ -from typing import Any +from typing import Any, cast from unittest import mock import pytest from snuba.clickhouse.connect import ClickhouseConnectPool from snuba.clickhouse.errors import ClickhouseError +from snuba.clickhouse.formatter.nodes import FormattedQuery # Error code returned by ClickHouse when the maximum number of simultaneous # queries has been exceeded. @@ -253,7 +254,7 @@ def get_sql(self) -> str: pool = _make_pool(client) reader = ClickhouseReader(cache_partition_id=None, client=pool, query_settings_prefix=None) - result = reader.execute(FakeFormattedQuery(), with_totals=True) + result = reader.execute(cast(FormattedQuery, FakeFormattedQuery()), with_totals=True) # The trailing row is split out as totals; only the real rows remain in data. assert result["data"] == [ @@ -314,7 +315,7 @@ def get_sql(self) -> str: pool = _make_pool(client) reader = ClickhouseReader(cache_partition_id=None, client=pool, query_settings_prefix=None) - result = reader.execute(FakeFormattedQuery()) + result = reader.execute(cast(FormattedQuery, FakeFormattedQuery())) # Date/DateTime (incl. the parametrized tz variant) become ISO strings and # UUID (incl. Nullable) becomes a string. Had the regexes failed to match From 9a7160857ee38d32ddfd3b96ff7f4636a0a7ddb9 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 13:39:47 +0000 Subject: [PATCH 28/29] docs(clickhouse): Document the HTTP-path tracing limitation The admin API is driver-agnostic for query execution, but the trace view and its profile-events parsing depend on trace_output, which clickhouse-connect cannot capture (it only reads the X-ClickHouse-Summary header). Document this accepted limitation where trace logs are requested and where trace_output is emitted, noting that the system.text_log-by-query_id reconstruction would be a separate feature. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_011MC5N7AYMhJDBkJ4P7engC --- snuba/clickhouse/connect.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/snuba/clickhouse/connect.py b/snuba/clickhouse/connect.py index 8484e92f5ae..0dc8ce92444 100644 --- a/snuba/clickhouse/connect.py +++ b/snuba/clickhouse/connect.py @@ -158,6 +158,14 @@ def _build_query_settings( if query_id is not None: query_settings["query_id"] = query_id if capture_trace: + # We still ask the server to emit trace logs, but unlike the native + # driver clickhouse-connect does not surface them (it only reads the + # X-ClickHouse-Summary header), so ``trace_output`` ends up empty on + # this path. See the note in _execute_once. Practically this means + # the snuba-admin trace view and its profile-events parsing return + # nothing when the HTTP driver is enabled; every other admin query + # path is driver-agnostic. Reconstructing traces over HTTP would + # require querying system.text_log by query_id (a separate feature). query_settings["send_logs_level"] = "trace" return query_settings or None @@ -210,6 +218,10 @@ def _int(key: str) -> int: results: Sequence[Any] = query_result.result_set + # trace_output is always empty here: clickhouse-connect has no mechanism + # for capturing the server's send_logs_level output (it only parses the + # X-ClickHouse-Summary header for the profile above). This is a known, + # accepted limitation of the HTTP path — see _build_query_settings. if with_column_types: meta = [ (name, column_type.name) From fe24fb72393f7fa4299968fed7e50db1b48cfbb5 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 13:56:30 +0000 Subject: [PATCH 29/29] fix(clickhouse): Scope the 30s read timeout to user reads only The QUERY profile timeout went None->30 to give user-facing reads a 30s cap, but the QUERY profile is also borrowed by non-read callers that can legitimately run longer: cluster topology discovery (system.clusters), storage-routing load lookups, delete-throttling system-table checks, the span-export manual job, and admin table copies. Capping those at 30s could break them on large/loaded clusters. Add an unbounded INTERNAL client-settings profile and move those callers onto it, restoring their pre-PR (unbounded) behavior, while the user read path (get_reader -> QUERY) keeps the 30s cap. Add a test asserting QUERY is 30s and INTERNAL is unbounded. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_011MC5N7AYMhJDBkJ4P7engC --- snuba/admin/clickhouse/copy_tables.py | 4 +++- snuba/clusters/cluster.py | 13 ++++++++++--- snuba/manual_jobs/extract_span_data.py | 2 +- snuba/web/delete_query.py | 4 ++-- snuba/web/rpc/storage_routing/load_retriever.py | 4 ++-- tests/clickhouse/test_connect.py | 10 ++++++++++ 6 files changed, 28 insertions(+), 9 deletions(-) diff --git a/snuba/admin/clickhouse/copy_tables.py b/snuba/admin/clickhouse/copy_tables.py index 52cdd90bb78..c3d44f27766 100644 --- a/snuba/admin/clickhouse/copy_tables.py +++ b/snuba/admin/clickhouse/copy_tables.py @@ -128,7 +128,9 @@ def copy_tables( skip_on_cluster: bool = False, cluster_name_override: Optional[str] = None, ) -> CopyTablesResponse: - settings = ClickhouseClientSettings.QUERY + # Table copies can run long, so use the unbounded INTERNAL profile rather + # than the 30s user-read QUERY profile. + settings = ClickhouseClientSettings.INTERNAL source_connection = get_clusterless_node_connection( source_host, 9000, storage_name, client_settings=settings ) diff --git a/snuba/clusters/cluster.py b/snuba/clusters/cluster.py index f2bb8d40f3c..9046c537fdf 100644 --- a/snuba/clusters/cluster.py +++ b/snuba/clusters/cluster.py @@ -74,9 +74,16 @@ class ClickhouseClientSettings(Enum): ) DELETE = ClickhouseClientSettingsType({"mutations_sync": 1}, None) OPTIMIZE = ClickhouseClientSettingsType({}, settings.OPTIMIZE_QUERY_TIMEOUT) - # Read queries get a 30s timeout. Migrations, DDL and other long-running - # operations keep their own (default or longer) timeouts above/below. + # User-facing read queries get a 30s timeout. Migrations, DDL and other + # long-running operations keep their own (default or longer) timeouts + # above/below. QUERY = ClickhouseClientSettingsType({}, 30) + # Internal/maintenance queries that are NOT user-facing reads and must not + # inherit QUERY's 30s cap: cluster topology discovery (system.clusters), + # storage-routing load lookups, delete-throttling system-table checks, the + # span-export job and admin table copies. These can legitimately run long, + # so they stay unbounded (their behavior before QUERY got a read timeout). + INTERNAL = ClickhouseClientSettingsType({}, None) QUERYLOG = ClickhouseClientSettingsType({}, None) TRACING = ClickhouseClientSettingsType({"readonly": 2}, None) REPLACE = ClickhouseClientSettingsType( @@ -528,7 +535,7 @@ def __get_cluster_nodes(self, cluster_name: str) -> Sequence[ClickhouseNode]: replica=host[3], http_port=self.__http_port, ) - for host in self.get_query_connection(ClickhouseClientSettings.QUERY) + for host in self.get_query_connection(ClickhouseClientSettings.INTERNAL) .execute( "select host_name, port, shard_num, replica_num from system.clusters where cluster=%(cluster_name)s", {"cluster_name": cluster_name}, diff --git a/snuba/manual_jobs/extract_span_data.py b/snuba/manual_jobs/extract_span_data.py index d7ca7767cc0..9dc1ccfb6c0 100644 --- a/snuba/manual_jobs/extract_span_data.py +++ b/snuba/manual_jobs/extract_span_data.py @@ -112,7 +112,7 @@ def _generate_spans_query(self) -> str: def execute(self, logger: JobLogger) -> None: cluster = get_cluster(StorageSetKey.EVENTS_ANALYTICS_PLATFORM) - connection = cluster.get_query_connection(ClickhouseClientSettings.QUERY) + connection = cluster.get_query_connection(ClickhouseClientSettings.INTERNAL) query = f""" INSERT INTO FUNCTION gcs('{self._gcp_bucket_name}/{self._output_file_path}', diff --git a/snuba/web/delete_query.py b/snuba/web/delete_query.py index 075917d1751..4d3d1c74b59 100644 --- a/snuba/web/delete_query.py +++ b/snuba/web/delete_query.py @@ -199,7 +199,7 @@ def _num_ongoing_mutations(cluster: ClickhouseCluster, tables: Sequence[str]) -> ) """ return int( - cluster.get_query_connection(ClickhouseClientSettings.QUERY).execute(query).results[0][0] + cluster.get_query_connection(ClickhouseClientSettings.INTERNAL).execute(query).results[0][0] ) @@ -218,7 +218,7 @@ def _num_parts_currently_mutating(cluster: ClickhouseCluster) -> int: WHERE metric = 'PartMutation' """ return int( - cluster.get_query_connection(ClickhouseClientSettings.QUERY).execute(query).results[0][0] + cluster.get_query_connection(ClickhouseClientSettings.INTERNAL).execute(query).results[0][0] ) diff --git a/snuba/web/rpc/storage_routing/load_retriever.py b/snuba/web/rpc/storage_routing/load_retriever.py index 43b8775f0dc..cd6a0fa2638 100644 --- a/snuba/web/rpc/storage_routing/load_retriever.py +++ b/snuba/web/rpc/storage_routing/load_retriever.py @@ -131,12 +131,12 @@ def get_cluster_loadinfo( """ cluster_load = float( - cluster.get_query_connection(ClickhouseClientSettings.QUERY) + cluster.get_query_connection(ClickhouseClientSettings.INTERNAL) .execute(cluster_load_query) .results[0][0] ) concurrent_queries = int( - cluster.get_query_connection(ClickhouseClientSettings.QUERY) + cluster.get_query_connection(ClickhouseClientSettings.INTERNAL) .execute(concurrent_queries_query) .results[0][0] ) diff --git a/tests/clickhouse/test_connect.py b/tests/clickhouse/test_connect.py index aeebc6c93a5..1ab43d453d2 100644 --- a/tests/clickhouse/test_connect.py +++ b/tests/clickhouse/test_connect.py @@ -182,6 +182,16 @@ def test_read_query_client_settings_use_30s_timeout() -> None: assert ClickhouseClientSettings.QUERY.value.timeout == 30 +def test_internal_profile_is_unbounded() -> None: + # The 30s read timeout must not leak onto internal/maintenance queries + # (topology discovery, routing load lookups, delete throttling checks, the + # span-export job, table copies). They use the INTERNAL profile, which stays + # unbounded so long-running operations aren't capped at 30s. + from snuba.clusters.cluster import ClickhouseClientSettings + + assert ClickhouseClientSettings.INTERNAL.value.timeout is None + + def test_pool_size_defaults_to_setting() -> None: import clickhouse_connect