From 2ea4aaa0bbb2de8125883cb27e973310d6b64f8e Mon Sep 17 00:00:00 2001 From: Milind Srivastava Date: Thu, 21 May 2026 17:09:10 -0400 Subject: [PATCH] feat(tools): added Hydra config validation for Clickhouse experiment config --- asap-tools/experiments/config/config.yaml | 5 + .../experiments/experiment_utils/config.py | 105 ++++++++++++++++++ 2 files changed, 110 insertions(+) diff --git a/asap-tools/experiments/config/config.yaml b/asap-tools/experiments/config/config.yaml index a62495a6..90bee69a 100644 --- a/asap-tools/experiments/config/config.yaml +++ b/asap-tools/experiments/config/config.yaml @@ -78,6 +78,11 @@ fake_exporter_language: "rust" # choices: ["python", "rust"] # Cluster data exporter configuration cluster_data_directory: "/data/cluster_traces" # Path to directory containing Google/Alibaba cluster trace data +# ClickHouse connection defaults (overridden per experiment_type config or CLI) +clickhouse: + url: "http://localhost:8123" + database: "default" + # Backend configuration for the query engine (aligned with BackendConfig in asap-query-engine/src/engine_config.rs) backend: type: "prometheus" # choices: ["prometheus", "clickhouse", "elastic_querydsl", "elastic_sql"] diff --git a/asap-tools/experiments/experiment_utils/config.py b/asap-tools/experiments/experiment_utils/config.py index 51af0461..a0520ee8 100644 --- a/asap-tools/experiments/experiment_utils/config.py +++ b/asap-tools/experiments/experiment_utils/config.py @@ -55,6 +55,74 @@ def validate_basic_config( raise ValueError(error_msg) +def _is_clickhouse_experiment(experiment_params: DictConfig) -> bool: + """Return True if experiment_params describes a ClickHouse (SQL) experiment.""" + return "dataset" in experiment_params + + +def _validate_clickhouse_experiment_config(experiment_params: DictConfig) -> None: + """Validate experiment_params for a ClickHouse experiment.""" + skip_querying = experiment_params.get("skip_querying", False) + + # Validate dataset section + if "dataset" not in experiment_params: + raise ValueError( + "ClickHouse experiments require a 'dataset' section in experiment config. " + "Add dataset.name and dataset.local_data_file." + ) + dataset = experiment_params.dataset + valid_dataset_names = {"clickbench", "h2o", "custom"} + dataset_name = dataset.get("name") + if not dataset_name or dataset_name == "???": + raise ValueError( + "dataset.name is required. " f"Valid choices: {valid_dataset_names}" + ) + if dataset_name not in valid_dataset_names: + raise ValueError( + f"dataset.name={dataset_name!r} is not valid. " + f"Valid choices: {valid_dataset_names}" + ) + + local_data_file = dataset.get("local_data_file") + if not local_data_file or local_data_file == "???": + raise ValueError( + "dataset.local_data_file is required. " + "Provide the path to the JSON-lines data file on this machine." + ) + if not os.path.exists(local_data_file): + raise ValueError( + f"dataset.local_data_file={local_data_file!r} does not exist. " + "Run benchmark/prepare_data.py first to produce the JSON-lines file." + ) + + # Validate query_groups (required unless skip_querying) + if not skip_querying: + if ( + "query_groups" not in experiment_params + or not experiment_params.query_groups + ): + raise ValueError( + "At least one query group must be defined in experiment config " + "when skip_querying=False" + ) + for i, group in enumerate(experiment_params.query_groups): + sql_file = group.get("sql_file") + if not sql_file or sql_file == "???": + raise ValueError( + f"Query group {i} missing 'sql_file'. " + "Generate SQL files with benchmark/generate_queries.py first." + ) + if not os.path.exists(sql_file): + raise ValueError( + f"Query group {i} sql_file={sql_file!r} does not exist." + ) + elif "query_groups" in experiment_params and experiment_params.query_groups: + print("-" * 60) + print("WARNING: query_groups is present but will be IGNORED") + print(" skip_querying=True means no queries will be executed") + print("-" * 60) + + def validate_experiment_config( experiment_params: DictConfig, require_queries: bool = True ): @@ -65,6 +133,11 @@ def validate_experiment_config( experiment_params: The experiment parameters configuration require_queries: Whether to require query_groups to be non-empty (default: True) """ + # ClickHouse experiments have a different required structure + if _is_clickhouse_experiment(experiment_params): + _validate_clickhouse_experiment_config(experiment_params) + return + # Check for skip_querying mode skip_querying = experiment_params.get("skip_querying", False) @@ -352,6 +425,26 @@ def check_exporter_and_queries_exist( return False +def read_sql_queries(cfg: DictConfig) -> List[Tuple[str, str]]: + """Return list of (name, sql_file_path) pairs from a ClickHouse experiment config. + + Args: + cfg: Top-level Hydra config (cfg.experiment_params.query_groups is used). + + Returns: + List of (group_name, sql_file_path) tuples. + """ + query_groups = cfg.experiment_params.query_groups + result = [] + for i, group in enumerate(query_groups): + name = group.get("name", str(i)) + sql_file = group.get("sql_file") + if not sql_file: + raise ValueError(f"Query group {i!r} ({name!r}) missing 'sql_file'") + result.append((name, sql_file)) + return result + + def read_workloads_config(experiment_params: DictConfig): """Read and validate workloads configuration.""" if "workloads" not in experiment_params: @@ -543,6 +636,18 @@ def validate_config(cfg: DictConfig, script_name: str = "experiment_run_e2e"): f"Valid options: {valid_policies}" ) + # ClickHouse backend requires dataset config in experiment_params + if ( + hasattr(cfg, "backend") + and cfg.backend.get("type") == "clickhouse" + and hasattr(cfg, "experiment_params") + and "dataset" not in cfg.experiment_params + ): + raise ValueError( + "backend.type=clickhouse requires experiment_params.dataset to be set. " + "Use experiment_type=clickhouse or add a dataset section to your experiment config." + ) + def _load_sql_queries(sql_file: str) -> List[str]: """Read a SQL file and return individual statements, preserving comment lines."""