diff --git a/2.0/problems/duckdb_e2e_query_optimization/config.yaml b/2.0/problems/duckdb_e2e_query_optimization/config.yaml
new file mode 100644
index 00000000..be09ae9f
--- /dev/null
+++ b/2.0/problems/duckdb_e2e_query_optimization/config.yaml
@@ -0,0 +1,47 @@
+tag: systems
+runtime:
+  language: cpp
+  timeout_seconds: 10800
+  environment: "DuckDB source patch; TPC-H shell timing; experimental judge"
+  apt_packages:
+    - bash
+    - build-essential
+    - ca-certificates
+    - ccache
+    - cmake
+    - git
+    - ninja-build
+    - pkg-config
+    - python3
+  judge_apt_packages:
+    - bash
+    - build-essential
+    - ca-certificates
+    - ccache
+    - cmake
+    - git
+    - ninja-build
+    - pkg-config
+    - python3
+  docker:
+    # Experimental local images. Build them with
+    # 2.0/problems/duckdb_e2e_query_optimization/docker/build_images.sh before running a
+    # local Harbor trial.
+    image: frontiercs/duckdb-e2e-query-optimization-agent:experimental-v1.5.3
+    judge_image: frontiercs/duckdb-e2e-query-optimization-judge:experimental-v1.5.3
+environment:
+  cpus: 8
+  memory_mb: 16384
+  storage_mb: 32768
+  build_timeout_seconds: 7200
+evaluation:
+  scale_factor: 1
+  benchmark_repetitions: 3
+  build_timeout_seconds: 7200
+  query_timeout_seconds: 300
+  duckdb_memory_limit: "6GB"
+  duckdb_temp_limit: "2GB"
+  child_memory_mb: 12288
+submission:
+  kind: file
+  path: /app/solution.patch
diff --git a/2.0/problems/duckdb_e2e_query_optimization/docker/README.md b/2.0/problems/duckdb_e2e_query_optimization/docker/README.md
new file mode 100644
index 00000000..f46b99a2
--- /dev/null
+++ b/2.0/problems/duckdb_e2e_query_optimization/docker/README.md
@@ -0,0 +1,54 @@
+# Experimental DuckDB Images
+
+This task needs DuckDB source trees in both the agent and judge containers.
+The standard Frontier-CS 2.0 adapter can select Docker base images, but it does
+not build problem-specific images by itself. Build these images before running
+a local Harbor trial:
+
+```bash
+bash 2.0/problems/duckdb_e2e_query_optimization/docker/build_images.sh
+```
+
+Defaults:
+
+```text
+DUCKDB_REF=v1.5.3
+DUCKDB_BUILD_JOBS=1
+AGENT_TAG=frontiercs/duckdb-e2e-query-optimization-agent:experimental-v1.5.3
+JUDGE_TAG=frontiercs/duckdb-e2e-query-optimization-judge:experimental-v1.5.3
+```
+
+The agent image contains:
+
+```text
+/app/duckdb
+```
+
+The judge image contains:
+
+```text
+/opt/duckdb-vanilla
+/opt/duckdb-clean
+```
+
+`/opt/duckdb-clean` is prebuilt with:
+
+```bash
+CMAKE_BUILD_PARALLEL_LEVEL=1 GEN=ninja DISABLE_UNITY=1 DISABLE_PARQUET=1 BUILD_JEMALLOC=0 BUILD_BENCHMARK=1 BUILD_EXTENSIONS='tpch' make
+```
+
+`DUCKDB_BUILD_JOBS` intentionally defaults to `1`; higher parallelism can make
+large DuckDB C++ objects exhaust Docker memory on local machines. Unity builds
+are disabled for the same reason. Jemalloc is disabled because DuckDB v1.5.3's
+non-unity build exposes a missing include in the jemalloc allocator wrapper;
+vanilla and patched binaries are built with the same allocator setting.
+
+The judge copies that prebuilt tree to `/opt/duckdb-vanilla` during image
+construction. At evaluation time the evaluator first resets tracked files in
+`/opt/duckdb-clean` back to `HEAD` and removes untracked files under `src`
+without deleting ignored build artifacts, applies the submitted patch, performs
+an incremental rebuild of the DuckDB shell and optional benchmark runner,
+checks correctness against `/opt/duckdb-vanilla`, and then times TPC-H queries
+through DuckDB's shell and TPC-H extension. Keeping build artifacts in the clean
+tree is intentional; full DuckDB rebuilds are too slow for iterative Harbor
+submissions.
diff --git a/2.0/problems/duckdb_e2e_query_optimization/docker/agent/Dockerfile b/2.0/problems/duckdb_e2e_query_optimization/docker/agent/Dockerfile
new file mode 100644
index 00000000..c3d3472a
--- /dev/null
+++ b/2.0/problems/duckdb_e2e_query_optimization/docker/agent/Dockerfile
@@ -0,0 +1,26 @@
+FROM ubuntu:24.04
+
+ARG DUCKDB_REF=v1.5.3
+ARG DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        bash \
+        build-essential \
+        ca-certificates \
+        ccache \
+        cmake \
+        git \
+        ninja-build \
+        pkg-config \
+        python3 \
+        python3-pip \
+        curl \
+        ripgrep && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN git clone --branch "${DUCKDB_REF}" --depth 1 https://github.com/duckdb/duckdb.git /app/duckdb && \
+    cd /app/duckdb && \
+    git submodule update --init --recursive
+
+WORKDIR /app
diff --git a/2.0/problems/duckdb_e2e_query_optimization/docker/build_images.sh b/2.0/problems/duckdb_e2e_query_optimization/docker/build_images.sh
new file mode 100755
index 00000000..19ef3e2a
--- /dev/null
+++ b/2.0/problems/duckdb_e2e_query_optimization/docker/build_images.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+TASK_DIR=$(cd "$SCRIPT_DIR/.." && pwd)
+
+DUCKDB_REF="${DUCKDB_REF:-v1.5.3}"
+DUCKDB_BUILD_JOBS="${DUCKDB_BUILD_JOBS:-1}"
+AGENT_TAG="${AGENT_TAG:-frontiercs/duckdb-e2e-query-optimization-agent:experimental-v1.5.3}"
+JUDGE_TAG="${JUDGE_TAG:-frontiercs/duckdb-e2e-query-optimization-judge:experimental-v1.5.3}"
+
+docker build \
+  --build-arg "DUCKDB_REF=$DUCKDB_REF" \
+  -t "$AGENT_TAG" \
+  "$TASK_DIR/docker/agent"
+
+docker build \
+  --build-arg "DUCKDB_REF=$DUCKDB_REF" \
+  --build-arg "DUCKDB_BUILD_JOBS=$DUCKDB_BUILD_JOBS" \
+  -t "$JUDGE_TAG" \
+  "$TASK_DIR/docker/judge"
+
+echo "Built:"
+echo "  $AGENT_TAG"
+echo "  $JUDGE_TAG"
diff --git a/2.0/problems/duckdb_e2e_query_optimization/docker/judge/Dockerfile b/2.0/problems/duckdb_e2e_query_optimization/docker/judge/Dockerfile
new file mode 100644
index 00000000..ce5efc44
--- /dev/null
+++ b/2.0/problems/duckdb_e2e_query_optimization/docker/judge/Dockerfile
@@ -0,0 +1,30 @@
+# syntax=docker/dockerfile:1.7
+FROM ubuntu:24.04
+
+ARG DUCKDB_REF=v1.5.3
+ARG DUCKDB_BUILD_JOBS=1
+ARG DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        bash \
+        build-essential \
+        ca-certificates \
+        ccache \
+        cmake \
+        git \
+        ninja-build \
+        pkg-config \
+        python3 && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN git clone --branch "${DUCKDB_REF}" --depth 1 https://github.com/duckdb/duckdb.git /opt/duckdb-clean && \
+    cd /opt/duckdb-clean && \
+    git submodule update --init --recursive
+
+RUN --mount=type=cache,target=/root/.cache/ccache \
+    cd /opt/duckdb-clean && \
+    GEN=ninja CMAKE_BUILD_PARALLEL_LEVEL="${DUCKDB_BUILD_JOBS}" DISABLE_UNITY=1 DISABLE_PARQUET=1 BUILD_JEMALLOC=0 BUILD_UNITTESTS=0 BUILD_BENCHMARK=1 BUILD_EXTENSIONS='tpch' make -j"${DUCKDB_BUILD_JOBS}" && \
+    cp -a /opt/duckdb-clean /opt/duckdb-vanilla
+
+WORKDIR /judge
diff --git a/2.0/problems/duckdb_e2e_query_optimization/docker/smoke_images.sh b/2.0/problems/duckdb_e2e_query_optimization/docker/smoke_images.sh
new file mode 100755
index 00000000..705ddbd2
--- /dev/null
+++ b/2.0/problems/duckdb_e2e_query_optimization/docker/smoke_images.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+AGENT_TAG="${AGENT_TAG:-frontiercs/duckdb-e2e-query-optimization-agent:experimental-v1.5.3}"
+JUDGE_TAG="${JUDGE_TAG:-frontiercs/duckdb-e2e-query-optimization-judge:experimental-v1.5.3}"
+
+echo "[agent] checking $AGENT_TAG"
+docker run --rm "$AGENT_TAG" sh -lc '
+  test -d /app/duckdb/.git
+  git -C /app/duckdb rev-parse HEAD
+'
+
+echo "[judge] checking $JUDGE_TAG"
+docker run --rm "$JUDGE_TAG" sh -lc '
+  test -d /opt/duckdb-clean
+  test -d /opt/duckdb-vanilla
+  test -x /opt/duckdb-vanilla/build/release/duckdb
+  test -x /opt/duckdb-vanilla/build/release/benchmark/benchmark_runner
+  /opt/duckdb-vanilla/build/release/duckdb -c "LOAD tpch; SELECT count(*) FROM tpch_queries();"
+'
diff --git a/2.0/problems/duckdb_e2e_query_optimization/evaluate.sh b/2.0/problems/duckdb_e2e_query_optimization/evaluate.sh
new file mode 100755
index 00000000..6f518849
--- /dev/null
+++ b/2.0/problems/duckdb_e2e_query_optimization/evaluate.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+
+if [[ $# -gt 0 ]]; then
+  exec python3 "$SCRIPT_DIR/evaluator.py" "$@"
+fi
+
+SOLUTION="/work/execution_env/solution_env/solution.patch"
+if [[ ! -f "$SOLUTION" ]]; then
+  echo "Error: Missing $SOLUTION" >&2
+  exit 1
+fi
+
+python3 "$SCRIPT_DIR/evaluator.py" "$SOLUTION"
diff --git a/2.0/problems/duckdb_e2e_query_optimization/evaluator.py b/2.0/problems/duckdb_e2e_query_optimization/evaluator.py
new file mode 100644
index 00000000..d5ce2148
--- /dev/null
+++ b/2.0/problems/duckdb_e2e_query_optimization/evaluator.py
@@ -0,0 +1,1081 @@
+"""Evaluator for the experimental DuckDB E2E query optimization task."""
+
+from __future__ import annotations
+
+import fnmatch
+import hashlib
+import json
+import math
+import os
+import posixpath
+import random
+import re
+import signal
+import subprocess
+import sys
+import tempfile
+import time
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+MAX_PATCH_BYTES = 1_500_000
+MAX_CHANGED_FILES = 80
+TASK_CONFIG_PATH = Path("/judge/task_config.json")
+
+
+def _load_task_config() -> dict[str, Any]:
+    try:
+        payload = json.loads(TASK_CONFIG_PATH.read_text(encoding="utf-8"))
+    except Exception:
+        return {}
+    return payload if isinstance(payload, dict) else {}
+
+
+TASK_CONFIG = _load_task_config()
+EVALUATION_CONFIG = TASK_CONFIG.get("evaluation", {}) if isinstance(TASK_CONFIG.get("evaluation"), dict) else {}
+
+
+def _config_int(env_name: str, config_name: str, default: int) -> int:
+    del env_name
+    raw = EVALUATION_CONFIG.get(config_name, default)
+    try:
+        return int(raw)
+    except Exception:
+        return default
+
+
+def _config_str(env_name: str, config_name: str, default: str) -> str:
+    del env_name
+    raw = EVALUATION_CONFIG.get(config_name, default)
+    return str(raw)
+
+
+def _config_bool(env_name: str, config_name: str, default: bool) -> bool:
+    del env_name
+    raw = EVALUATION_CONFIG.get(config_name, default)
+    if isinstance(raw, bool):
+        return raw
+    if isinstance(raw, str):
+        return raw.strip().lower() in {"1", "true", "yes", "on"}
+    return bool(raw)
+
+
+BUILD_TIMEOUT_SECONDS = _config_int("FRONTIER_DUCKDB_BUILD_TIMEOUT", "build_timeout_seconds", 3600)
+QUERY_TIMEOUT_SECONDS = _config_int("FRONTIER_DUCKDB_QUERY_TIMEOUT", "query_timeout_seconds", 300)
+DUCKDB_MEMORY_LIMIT = _config_str("FRONTIER_DUCKDB_MEMORY_LIMIT", "duckdb_memory_limit", "6GB")
+DUCKDB_TEMP_LIMIT = _config_str("FRONTIER_DUCKDB_TEMP_LIMIT", "duckdb_temp_limit", "2GB")
+CHILD_MEMORY_LIMIT_MB = _config_int("FRONTIER_DUCKDB_CHILD_MEMORY_MB", "child_memory_mb", 12288)
+USE_BENCHMARK_RUNNER = _config_bool("FRONTIER_DUCKDB_USE_BENCHMARK_RUNNER", "use_benchmark_runner", False)
+
+DEFAULT_CLEAN_SOURCE = Path("/opt/duckdb-clean")
+DEFAULT_VANILLA_SOURCE = Path("/opt/duckdb-vanilla")
+PUBLIC_SCALE_FACTOR = _config_str("FRONTIER_DUCKDB_TPCH_SF", "scale_factor", "1")
+DEFAULT_HIDDEN_SCALE_FACTORS = ("0.1", "2")
+QUERY_ORDER_SEED = _config_int("FRONTIER_DUCKDB_QUERY_ORDER_SEED", "query_order_seed", 20260604)
+
+BENCHMARK_RUNNER_REL = Path("build/release/benchmark/benchmark_runner")
+DUCKDB_SHELL_REL = Path("build/release/duckdb")
+DEFAULT_CORRECTNESS_QUERIES = tuple(range(1, 23))
+
+STRONGLY_ALLOWED_PATTERNS = (
+    "src/optimizer/**",
+    "src/include/duckdb/optimizer/**",
+    "src/execution/operator/join/**",
+    "src/include/duckdb/execution/operator/join/**",
+    "src/execution/operator/filter/**",
+    "src/include/duckdb/execution/operator/filter/**",
+    "src/planner/operator/logical_join.cpp",
+    "src/planner/operator/logical_comparison_join.cpp",
+    "src/include/duckdb/planner/operator/logical_join.hpp",
+    "src/include/duckdb/planner/operator/logical_comparison_join.hpp",
+)
+
+CONDITIONALLY_ALLOWED_PATTERNS = (
+    "src/planner/**",
+    "src/include/duckdb/planner/**",
+    "src/execution/physical_plan/**",
+    "src/include/duckdb/execution/physical_plan/**",
+    "src/common/**",
+    "src/include/duckdb/common/**",
+)
+
+BUILD_ALLOWED_PATTERNS = (
+    "CMakeLists.txt",
+    "src/CMakeLists.txt",
+    "src/optimizer/CMakeLists.txt",
+    "src/optimizer/**/CMakeLists.txt",
+    "src/planner/CMakeLists.txt",
+    "src/planner/**/CMakeLists.txt",
+    "src/planner/operator/CMakeLists.txt",
+    "src/common/CMakeLists.txt",
+    "src/common/**/CMakeLists.txt",
+    "src/execution/CMakeLists.txt",
+    "src/execution/operator/CMakeLists.txt",
+    "src/execution/operator/join/CMakeLists.txt",
+    "src/execution/operator/join/**/CMakeLists.txt",
+    "src/execution/operator/filter/CMakeLists.txt",
+    "src/execution/operator/filter/**/CMakeLists.txt",
+    "src/execution/physical_plan/CMakeLists.txt",
+    "src/execution/physical_plan/**/CMakeLists.txt",
+    "extension_config.cmake",
+)
+
+DENIED_PATTERNS = (
+    "benchmark/**",
+    "test/**",
+    "tools/**",
+    "scripts/**",
+    "extension/tpch/**",
+    "third_party/**",
+    "examples/**",
+    "docs/**",
+    ".github/**",
+    "src/main/**",
+    "src/storage/**",
+    "src/catalog/**",
+    "src/parser/**",
+    "src/function/**",
+    "src/common/file_system.cpp",
+    "src/main/client_context.cpp",
+    "src/main/database.cpp",
+    "CMakePresets.json",
+    "Makefile",
+    "package.json",
+    "requirements.txt",
+)
+
+ENVIRONMENT_TOKENS = (
+    "getenv",
+    "std::getenv",
+    "secure_getenv",
+    "setenv",
+    "putenv",
+    "unsetenv",
+    "environ",
+    "GetEnvironmentVariable",
+    "FileSystem::GetEnv",
+)
+
+CMAKE_DENY_TOKENS = (
+    "add_custom_command",
+    "add_custom_target",
+    "execute_process",
+    "FetchContent",
+    "ExternalProject",
+    "file(DOWNLOAD",
+    "configure_file",
+    "target_compile_options",
+    "target_link_options",
+    "set(CMAKE_",
+    "install(",
+)
+
+HARD_CODE_TOKENS = (
+    "tpch",
+    "lineitem",
+    "orders",
+    "customer",
+    "supplier",
+    "part",
+    "partsupp",
+    "nation",
+    "region",
+    "q01",
+    "q02",
+    "q03",
+    "q04",
+    "q05",
+    "q06",
+    "q07",
+    "q08",
+    "q09",
+    "q10",
+    "q11",
+    "q12",
+    "q13",
+    "q14",
+    "q15",
+    "q16",
+    "q17",
+    "q18",
+    "q19",
+    "q20",
+    "q21",
+    "q22",
+    "benchmark",
+)
+
+HARD_CODE_TOKEN_RE = re.compile(
+    r"(?<![A-Za-z0-9_])("
+    + "|".join(re.escape(token) for token in HARD_CODE_TOKENS)
+    + r")(?![A-Za-z0-9_])",
+    re.IGNORECASE,
+)
+
+SOURCE_EXTENSIONS = (".cpp", ".hpp", ".h", ".hh", ".cc", ".cxx")
+
+
+@dataclass(frozen=True)
+class PatchFile:
+    old_path: str
+    new_path: str
+    added_lines: tuple[str, ...]
+    removed_lines: tuple[str, ...]
+
+    @property
+    def path(self) -> str:
+        return self.new_path if self.new_path != "/dev/null" else self.old_path
+
+
+def _match(path: str, patterns: tuple[str, ...]) -> bool:
+    return any(fnmatch.fnmatch(path, pattern) for pattern in patterns)
+
+
+def _is_build_file(path: str) -> bool:
+    return _match(path, BUILD_ALLOWED_PATTERNS)
+
+
+def _is_allowed_source_path(path: str) -> bool:
+    return _match(path, STRONGLY_ALLOWED_PATTERNS) or _match(
+        path, CONDITIONALLY_ALLOWED_PATTERNS
+    )
+
+
+def _invalid(message: str, metrics: dict[str, Any] | None = None):
+    payload = metrics or {}
+    payload.setdefault("valid_patch", 0)
+    return 0.0, 0.0, message, payload
+
+
+def _parse_patch(text: str) -> list[PatchFile]:
+    files: list[PatchFile] = []
+    current_old = ""
+    current_new = ""
+    added: list[str] = []
+    removed: list[str] = []
+    in_file = False
+
+    for line in text.splitlines():
+        if line.startswith("diff --git "):
+            if in_file:
+                files.append(PatchFile(current_old, current_new, tuple(added), tuple(removed)))
+            in_file = True
+            current_old = ""
+            current_new = ""
+            added = []
+            removed = []
+            continue
+        if not in_file:
+            continue
+        if line.startswith("--- "):
+            current_old = line[4:].strip()
+            if current_old.startswith("a/"):
+                current_old = current_old[2:]
+            continue
+        if line.startswith("+++ "):
+            current_new = line[4:].strip()
+            if current_new.startswith("b/"):
+                current_new = current_new[2:]
+            continue
+        if line.startswith("+") and not line.startswith("+++ "):
+            added.append(line[1:])
+            continue
+        if line.startswith("-") and not line.startswith("--- "):
+            removed.append(line[1:])
+
+    if in_file:
+        files.append(PatchFile(current_old, current_new, tuple(added), tuple(removed)))
+    return files
+
+
+def _validate_patch_path(path: str, metrics: dict[str, Any]) -> tuple[bool, str, bool, bool]:
+    if not path or path == "/dev/null":
+        return True, "", False, False
+    if path.startswith("/") or ".." in Path(path).parts:
+        return False, f"unsafe patch path: {path}", False, False
+    if _match(path, DENIED_PATTERNS):
+        return False, f"changed file is outside task boundary: {path}", False, False
+
+    build_file = _is_build_file(path)
+    allowed_source = _is_allowed_source_path(path)
+    if not build_file and not allowed_source:
+        return False, f"changed file is not allowlisted: {path}", False, False
+    return True, "", build_file, allowed_source
+
+
+def _validate_build_diff(patch_file: PatchFile) -> str | None:
+    if any(line.strip() for line in patch_file.removed_lines):
+        return "build-system changes may add allowed source files but may not delete or rewrite existing build rules"
+    added_text = "\n".join(patch_file.added_lines)
+    for token in CMAKE_DENY_TOKENS:
+        if token in added_text:
+            return f"build-system change contains forbidden token: {token}"
+    for line in patch_file.added_lines:
+        stripped = line.strip()
+        if not stripped or stripped.startswith("#"):
+            continue
+        if ".cpp" in stripped:
+            cpp_paths = re.findall(r"[A-Za-z0-9_./+-]+\.cpp", stripped)
+            if not cpp_paths:
+                return "CMake change mentions .cpp without a parseable source path"
+            for cpp_path in cpp_paths:
+                normalized = cpp_path.strip("./")
+                if not normalized.startswith("src/"):
+                    normalized = posixpath.normpath(
+                        posixpath.join(posixpath.dirname(patch_file.path), normalized)
+                    )
+                if normalized.startswith("../") or not _is_allowed_source_path(normalized):
+                    return f"CMake change wires source outside allowlist: {cpp_path}"
+            continue
+        if stripped in {"(", ")", "set(", "duckdb_sources("}:
+            continue
+        if stripped.endswith("(") or stripped.endswith(")"):
+            continue
+        return "CMake changes may only add allowed .cpp files to existing targets"
+    return None
+
+
+def validate_patch(patch_path: Path) -> tuple[bool, str, dict[str, Any]]:
+    if not patch_path.exists():
+        return False, "solution patch does not exist", {}
+    size = patch_path.stat().st_size
+    if size > MAX_PATCH_BYTES:
+        return False, f"patch is too large ({size} bytes > {MAX_PATCH_BYTES})", {}
+    text = patch_path.read_text(encoding="utf-8", errors="replace")
+    patch_hash = hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest()
+    files = _parse_patch(text)
+    metrics: dict[str, Any] = {
+        "patch_bytes": size,
+        "patch_sha256": patch_hash,
+        "changed_files": len(files),
+    }
+    if len(files) > MAX_CHANGED_FILES:
+        return False, f"too many changed files ({len(files)} > {MAX_CHANGED_FILES})", metrics
+
+    for patch_file in files:
+        path = patch_file.path
+        if patch_file.new_path == "/dev/null":
+            return False, f"deleting source files is outside task boundary: {patch_file.old_path}", metrics
+        if patch_file.old_path != "/dev/null" and patch_file.old_path != patch_file.new_path:
+            old_ok, old_error, _, _ = _validate_patch_path(patch_file.old_path, metrics)
+            if not old_ok:
+                return False, f"rename/copy source is outside task boundary: {old_error}", metrics
+
+        ok, error, build_file, allowed_source = _validate_patch_path(path, metrics)
+        if not ok:
+            return False, error, metrics
+        if not path or path == "/dev/null":
+            return False, "could not determine changed path from patch", metrics
+
+        if build_file:
+            error = _validate_build_diff(patch_file)
+            if error:
+                return False, f"{path}: {error}", metrics
+
+        if path.endswith(SOURCE_EXTENSIONS):
+            added_text = "\n".join(patch_file.added_lines)
+            for token in ENVIRONMENT_TOKENS:
+                if token in added_text:
+                    return False, f"{path}: environment access is forbidden ({token})", metrics
+            match = HARD_CODE_TOKEN_RE.search(added_text)
+            if match:
+                token = match.group(1)
+                return False, f"{path}: benchmark-specific token is forbidden ({token})", metrics
+
+    metrics["valid_patch"] = 1
+    return True, "patch accepted by static policy", metrics
+
+
+def clean_env(tmp_root: Path) -> dict[str, str]:
+    home = tmp_root / "home"
+    tmp = tmp_root / "tmp"
+    home.mkdir(parents=True, exist_ok=True)
+    tmp.mkdir(parents=True, exist_ok=True)
+    return {
+        "PATH": "/usr/local/bin:/usr/bin:/bin",
+        "HOME": str(home),
+        "TMPDIR": str(tmp),
+        "LC_ALL": "C",
+        "LANG": "C",
+        "CCACHE_DIR": str(tmp_root / "ccache"),
+    }
+
+
+def run_checked(
+    cmd: list[str],
+    *,
+    cwd: Path,
+    env: dict[str, str],
+    timeout: int,
+    input_text: str | None = None,
+) -> subprocess.CompletedProcess[str]:
+    def limit_child() -> None:
+        if os.name != "posix":
+            return
+        try:
+            os.setsid()
+        except Exception:
+            pass
+        try:
+            import resource
+
+            limit_bytes = CHILD_MEMORY_LIMIT_MB * 1024 * 1024
+            resource.setrlimit(resource.RLIMIT_AS, (limit_bytes, limit_bytes))
+        except Exception:
+            pass
+
+    process = subprocess.Popen(
+        cmd,
+        cwd=str(cwd),
+        env=env,
+        text=True,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        preexec_fn=limit_child if os.name == "posix" else None,
+    )
+    try:
+        stdout, stderr = process.communicate(input=input_text, timeout=timeout)
+    except subprocess.TimeoutExpired as exc:
+        if os.name == "posix":
+            try:
+                os.killpg(process.pid, signal.SIGKILL)
+            except Exception:
+                process.kill()
+        else:
+            process.kill()
+        stdout, stderr = process.communicate()
+        raise subprocess.TimeoutExpired(
+            cmd=cmd,
+            timeout=timeout,
+            output=stdout,
+            stderr=stderr,
+        ) from exc
+
+    completed = subprocess.CompletedProcess(
+        cmd,
+        process.returncode,
+        stdout=stdout,
+        stderr=stderr,
+    )
+    if process.returncode != 0:
+        raise subprocess.CalledProcessError(
+            process.returncode,
+            cmd,
+            output=stdout,
+            stderr=stderr,
+        )
+    return completed
+
+
+def patch_is_empty(metrics: dict[str, Any]) -> bool:
+    return int(metrics.get("changed_files", 0)) == 0
+
+
+def _parse_csv_or_list(raw: Any) -> tuple[str, ...]:
+    if isinstance(raw, list):
+        return tuple(str(item).strip() for item in raw if str(item).strip())
+    if isinstance(raw, str) and raw.strip():
+        return tuple(item.strip() for item in raw.split(",") if item.strip())
+    return ()
+
+
+def evaluation_scale_factors() -> tuple[str, ...]:
+    configured = _parse_csv_or_list(EVALUATION_CONFIG.get("scale_factors"))
+    if configured:
+        raw_values = configured
+    else:
+        raw_values = (PUBLIC_SCALE_FACTOR, *DEFAULT_HIDDEN_SCALE_FACTORS)
+    seen: set[str] = set()
+    result: list[str] = []
+    for value in raw_values:
+        normalized = str(value).strip()
+        if not normalized:
+            continue
+        try:
+            if float(normalized) <= 0:
+                continue
+        except Exception:
+            continue
+        if normalized not in seen:
+            seen.add(normalized)
+            result.append(normalized)
+    return tuple(result) or (PUBLIC_SCALE_FACTOR,)
+
+
+def shuffled(items: list[Any], salt: str) -> list[Any]:
+    rng = random.Random(f"{QUERY_ORDER_SEED}:{salt}")
+    rng.shuffle(items)
+    return items
+
+
+def restore_prebuilt_source(source_dir: Path, env: dict[str, str]) -> bool:
+    """Undo the previous submission while preserving ignored prebuilt artifacts."""
+    status_before = run_checked(
+        ["git", "status", "--porcelain"],
+        cwd=source_dir,
+        env=env,
+        timeout=60,
+    ).stdout
+    run_checked(["git", "reset", "--hard", "HEAD"], cwd=source_dir, env=env, timeout=60)
+    clean = run_checked(["git", "clean", "-fd", "src"], cwd=source_dir, env=env, timeout=60).stdout
+    return bool(status_before.strip()) or bool(clean.strip())
+
+
+def build_duckdb(source_dir: Path, env: dict[str, str]) -> None:
+    build_env = dict(env)
+    build_env["GEN"] = "ninja"
+    build_env["BUILD_BENCHMARK"] = "1"
+    build_env["BUILD_EXTENSIONS"] = "tpch"
+    build_env.setdefault("BUILD_UNITTESTS", "0")
+    build_env.setdefault("DISABLE_UNITY", "1")
+    build_env.setdefault("DISABLE_PARQUET", "1")
+    build_env.setdefault("BUILD_JEMALLOC", "0")
+    build_env.setdefault("CMAKE_BUILD_PARALLEL_LEVEL", "1")
+    run_checked(
+        [
+            "cmake",
+            "--build",
+            "build/release",
+            "--config",
+            "Release",
+            "--target",
+            "duckdb",
+            "benchmark_runner",
+        ],
+        cwd=source_dir,
+        env=build_env,
+        timeout=BUILD_TIMEOUT_SECONDS,
+    )
+
+
+def benchmark_runner(source_dir: Path) -> Path:
+    runner = source_dir / BENCHMARK_RUNNER_REL
+    if not runner.exists():
+        raise FileNotFoundError(f"benchmark_runner not found at {runner}")
+    return runner
+
+
+def duckdb_shell(source_dir: Path) -> Path:
+    shell = source_dir / DUCKDB_SHELL_REL
+    if not shell.exists():
+        raise FileNotFoundError(f"DuckDB shell not found at {shell}")
+    return shell
+
+
+def benchmark_env(base_env: dict[str, str], tmp_root: Path) -> dict[str, str]:
+    env = dict(base_env)
+    env["DUCKDB_BENCHMARK_MEMORY_LIMIT"] = DUCKDB_MEMORY_LIMIT
+    env["DUCKDB_BENCHMARK_TEMP_LIMIT"] = DUCKDB_TEMP_LIMIT
+    env["DUCKDB_BENCHMARK_TEMP_DIR"] = str(tmp_root / "duckdb_tmp")
+    return env
+
+
+def settings_sql(temp_dir: Path) -> str:
+    temp_dir.mkdir(parents=True, exist_ok=True)
+    return "\n".join(
+        (
+            "SET threads = 1;",
+            f"SET memory_limit = '{DUCKDB_MEMORY_LIMIT}';",
+            f"SET max_temp_directory_size = '{DUCKDB_TEMP_LIMIT}';",
+            f"SET temp_directory = '{temp_dir.as_posix()}';",
+            "SET preserve_insertion_order = false;",
+        )
+    )
+
+
+def run_duckdb_sql(
+    shell: Path,
+    database: Path,
+    sql: str,
+    *,
+    cwd: Path,
+    env: dict[str, str],
+    timeout: int,
+) -> str:
+    proc = run_checked(
+        [str(shell), str(database), "-csv", "-c", sql],
+        cwd=cwd,
+        env=env,
+        timeout=timeout,
+    )
+    return proc.stdout
+
+
+def prepare_tpch_database(
+    shell: Path,
+    database: Path,
+    *,
+    cwd: Path,
+    env: dict[str, str],
+    tmp_root: Path,
+    scale_factor: str,
+) -> None:
+    sql = "\n".join(
+        (
+            settings_sql(tmp_root / "duckdb_tmp"),
+            "LOAD tpch;",
+            f"CALL dbgen(sf = {scale_factor});",
+        )
+    )
+    run_duckdb_sql(shell, database, sql, cwd=cwd, env=env, timeout=QUERY_TIMEOUT_SECONDS)
+
+
+def correctness_queries() -> tuple[int, ...]:
+    raw = EVALUATION_CONFIG.get("correctness_queries")
+    if isinstance(raw, list):
+        return tuple(int(item) for item in raw)
+    if isinstance(raw, str) and raw.strip():
+        return tuple(int(item.strip()) for item in raw.split(",") if item.strip())
+    return DEFAULT_CORRECTNESS_QUERIES
+
+
+def output_digest(text: str) -> str:
+    normalized = "\n".join(line.rstrip() for line in text.splitlines())
+    return hashlib.sha256(normalized.encode("utf-8")).hexdigest()
+
+
+def check_correctness(
+    vanilla_source: Path,
+    patched_source: Path,
+    *,
+    env: dict[str, str],
+    tmp_root: Path,
+    scale_factor: str,
+    queries: tuple[int, ...],
+) -> dict[str, str]:
+    vanilla_shell = duckdb_shell(vanilla_source)
+    patched_shell = duckdb_shell(patched_source)
+    scale_label = scale_factor.replace(".", "_")
+    vanilla_db = tmp_root / f"vanilla_tpch_sf{scale_label}.duckdb"
+    patched_db = tmp_root / f"patched_tpch_sf{scale_label}.duckdb"
+    prepare_tpch_database(
+        vanilla_shell,
+        vanilla_db,
+        cwd=vanilla_source,
+        env=env,
+        tmp_root=tmp_root / f"vanilla_setup_sf{scale_label}",
+        scale_factor=scale_factor,
+    )
+    prepare_tpch_database(
+        patched_shell,
+        patched_db,
+        cwd=patched_source,
+        env=env,
+        tmp_root=tmp_root / f"patched_setup_sf{scale_label}",
+        scale_factor=scale_factor,
+    )
+
+    mismatches: dict[str, str] = {}
+    for query_nr in queries:
+        query_sql = "\n".join(
+            (
+                settings_sql(tmp_root / f"query_{query_nr}_tmp"),
+                "LOAD tpch;",
+                f"PRAGMA tpch({query_nr});",
+            )
+        )
+        vanilla_out = run_duckdb_sql(
+            vanilla_shell,
+            vanilla_db,
+            query_sql,
+            cwd=vanilla_source,
+            env=env,
+            timeout=QUERY_TIMEOUT_SECONDS,
+        )
+        patched_out = run_duckdb_sql(
+            patched_shell,
+            patched_db,
+            query_sql,
+            cwd=patched_source,
+            env=env,
+            timeout=QUERY_TIMEOUT_SECONDS,
+        )
+        if output_digest(vanilla_out) != output_digest(patched_out):
+            mismatches[f"sf{scale_factor}:q{query_nr:02d}"] = "result differs from vanilla DuckDB"
+    return mismatches
+
+
+def run_benchmark(
+    runner: Path,
+    benchmark: str,
+    *,
+    cwd: Path,
+    env: dict[str, str],
+    tmp_root: Path,
+) -> float:
+    tmp_root.mkdir(parents=True, exist_ok=True)
+    out_file = tmp_root / (benchmark.replace("/", "_") + ".out")
+    command = [
+        str(runner),
+        benchmark,
+        f"--out={out_file}",
+    ]
+    run_checked(command, cwd=cwd, env=env, timeout=QUERY_TIMEOUT_SECONDS)
+    timings = []
+    for raw in out_file.read_text(encoding="utf-8").splitlines():
+        raw = raw.strip()
+        if not raw:
+            continue
+        if raw.upper() == "ERROR":
+            raise RuntimeError(f"benchmark_runner reported ERROR for {benchmark}")
+        timings.append(float(raw))
+    if not timings:
+        raise RuntimeError(f"no timings produced for {benchmark}")
+    return min(timings)
+
+
+def sanitize_error_text(text: str) -> str:
+    text = re.sub(r"/tmp/[A-Za-z0-9_./-]+", "<tmp>", text)
+    text = re.sub(r"duckdb_[A-Za-z0-9_./-]+", "duckdb_<redacted>", text)
+    text = re.sub(r"sf[0-9]+(?:_[0-9]+)?", "sf<hidden>", text)
+    text = re.sub(r"q[0-9]{1,2}(?=\\.benchmark|\\b)", "q<hidden>", text)
+    return text[-800:]
+
+
+def query_number_from_benchmark(benchmark: str) -> int:
+    match = re.search(r"q(\d{1,2})\.benchmark$", benchmark)
+    if not match:
+        raise ValueError(f"could not infer TPC-H query number from {benchmark}")
+    query_nr = int(match.group(1))
+    if query_nr < 1 or query_nr > 22:
+        raise ValueError(f"TPC-H query number out of range in {benchmark}")
+    return query_nr
+
+
+def run_shell_tpch_timing(
+    shell: Path,
+    database: Path,
+    query_nr: int,
+    *,
+    cwd: Path,
+    env: dict[str, str],
+    tmp_root: Path,
+    repetitions: int,
+) -> float:
+    timings: list[float] = []
+    sql = "\n".join(
+        (
+            settings_sql(tmp_root / f"query_{query_nr}_tmp"),
+            "LOAD tpch;",
+            f"PRAGMA tpch({query_nr});",
+        )
+    )
+    for _ in range(max(1, repetitions)):
+        start = time.perf_counter()
+        run_duckdb_sql(shell, database, sql, cwd=cwd, env=env, timeout=QUERY_TIMEOUT_SECONDS)
+        timings.append(time.perf_counter() - start)
+    return min(timings)
+
+
+def run_paired_shell_tpch_timing(
+    vanilla_shell: Path,
+    patched_shell: Path,
+    vanilla_database: Path,
+    patched_database: Path,
+    query_nr: int,
+    *,
+    vanilla_source: Path,
+    patched_source: Path,
+    env: dict[str, str],
+    tmp_root: Path,
+    repetitions: int,
+    patched_first: bool,
+) -> tuple[float, float]:
+    vanilla_timings: list[float] = []
+    patched_timings: list[float] = []
+    sql = "\n".join(
+        (
+            settings_sql(tmp_root / f"query_{query_nr}_tmp"),
+            "LOAD tpch;",
+            f"PRAGMA tpch({query_nr});",
+        )
+    )
+    order = ("patched", "vanilla") if patched_first else ("vanilla", "patched")
+    for _ in range(max(1, repetitions)):
+        for engine in order:
+            if engine == "vanilla":
+                start = time.perf_counter()
+                run_duckdb_sql(
+                    vanilla_shell,
+                    vanilla_database,
+                    sql,
+                    cwd=vanilla_source,
+                    env=env,
+                    timeout=QUERY_TIMEOUT_SECONDS,
+                )
+                vanilla_timings.append(time.perf_counter() - start)
+            else:
+                start = time.perf_counter()
+                run_duckdb_sql(
+                    patched_shell,
+                    patched_database,
+                    sql,
+                    cwd=patched_source,
+                    env=env,
+                    timeout=QUERY_TIMEOUT_SECONDS,
+                )
+                patched_timings.append(time.perf_counter() - start)
+        order = tuple(reversed(order))
+    return min(vanilla_timings), min(patched_timings)
+
+
+def load_benchmark_list() -> tuple[str, ...]:
+    raw = EVALUATION_CONFIG.get("benchmarks")
+    if isinstance(raw, list):
+        return tuple(str(item) for item in raw)
+    if isinstance(raw, str) and raw.strip():
+        return tuple(item.strip() for item in raw.split(",") if item.strip())
+    sf_label = str(PUBLIC_SCALE_FACTOR).replace(".", "_")
+    return tuple(f"benchmark/tpch/sf{sf_label}/q{index:02d}.benchmark" for index in range(1, 23))
+
+
+def benchmark_query_numbers() -> tuple[int, ...]:
+    query_numbers: list[int] = []
+    for benchmark in load_benchmark_list():
+        query_nr = query_number_from_benchmark(benchmark)
+        if query_nr not in query_numbers:
+            query_numbers.append(query_nr)
+    return tuple(query_numbers)
+
+
+def benchmark_name(scale_factor: str, query_nr: int) -> str:
+    sf_label = str(scale_factor).replace(".", "_")
+    return f"benchmark/tpch/sf{sf_label}/q{query_nr:02d}.benchmark"
+
+
+def geometric_mean(values: list[float]) -> float:
+    if not values:
+        return 0.0
+    return math.exp(sum(math.log(max(value, 1e-9)) for value in values) / len(values))
+
+
+def score_from_speedup(speedup: float) -> float:
+    if speedup <= 0:
+        return 0.0
+    raw = 100.0 * math.log(speedup, 2)
+    return max(0.0, min(100.0, raw))
+
+
+def safe_failed_command(cmd: Any) -> str:
+    if not isinstance(cmd, list) or not cmd:
+        return "subprocess"
+    executable = Path(str(cmd[0])).name
+    if executable == "git":
+        return " ".join(str(part) for part in cmd[:3])
+    if executable == "cmake":
+        return "cmake build"
+    if executable in {"duckdb", "benchmark_runner"}:
+        return executable
+    return executable or "subprocess"
+
+
+def safe_exception(exc: Exception) -> str:
+    if isinstance(exc, (FileNotFoundError, RuntimeError, ValueError)):
+        return sanitize_error_text(str(exc))
+    return type(exc).__name__
+
+
+def full_evaluation(patch_path: Path, metrics: dict[str, Any]):
+    scale_factors = evaluation_scale_factors()
+    benchmark_queries = benchmark_query_numbers()
+    correctness_query_set = correctness_queries()
+    correctness_cases = shuffled(
+        [(scale_factor, query_nr) for scale_factor in scale_factors for query_nr in correctness_query_set],
+        "correctness",
+    )
+    timing_cases = shuffled(
+        [(scale_factor, query_nr) for scale_factor in scale_factors for query_nr in benchmark_queries],
+        "timing",
+    )
+    repetitions = _config_int("FRONTIER_DUCKDB_REPETITIONS", "benchmark_repetitions", 3)
+    if not DEFAULT_CLEAN_SOURCE.exists():
+        metrics["full_benchmark"] = 0
+        return (
+            1.0,
+            1.0,
+            "patch policy smoke passed; DuckDB clean source is not configured in this environment",
+            metrics,
+        )
+
+    with tempfile.TemporaryDirectory(prefix="duckdb_e2e_query_opt_eval_") as tmp:
+        tmp_root = Path(tmp)
+        env = clean_env(tmp_root)
+        patched_source = DEFAULT_CLEAN_SOURCE
+        restored_source = restore_prebuilt_source(patched_source, env)
+        if patch_is_empty(metrics):
+            metrics["used_prebuilt_empty_patch"] = 1
+            if restored_source:
+                metrics["rebuilt_after_source_restore"] = 1
+                build_duckdb(patched_source, env)
+        else:
+            run_checked(["git", "apply", "--check", str(patch_path)], cwd=patched_source, env=env, timeout=60)
+            run_checked(["git", "apply", str(patch_path)], cwd=patched_source, env=env, timeout=60)
+            build_duckdb(patched_source, env)
+
+        vanilla_source = DEFAULT_VANILLA_SOURCE if DEFAULT_VANILLA_SOURCE.exists() else DEFAULT_CLEAN_SOURCE
+        mismatches: dict[str, str] = {}
+        for scale_factor in scale_factors:
+            scale_queries = tuple(query_nr for current_scale, query_nr in correctness_cases if current_scale == scale_factor)
+            mismatches.update(
+                check_correctness(
+                    vanilla_source,
+                    patched_source,
+                    env=env,
+                    tmp_root=tmp_root / "correctness",
+                    scale_factor=scale_factor,
+                    queries=scale_queries,
+                )
+            )
+        metrics["scale_factor_count"] = len(scale_factors)
+        metrics["correctness_queries"] = len(correctness_query_set)
+        metrics["correctness_case_count"] = len(correctness_cases)
+        if mismatches:
+            metrics["correctness_mismatch_count"] = len(mismatches)
+            return _invalid("patched DuckDB produced incorrect TPC-H results", metrics)
+
+        if USE_BENCHMARK_RUNNER:
+            vanilla_runner = benchmark_runner(vanilla_source)
+            patched_runner = benchmark_runner(patched_source)
+            use_benchmark_runner = True
+        else:
+            vanilla_runner = patched_runner = None
+            use_benchmark_runner = False
+        bench_env = benchmark_env(env, tmp_root)
+        if use_benchmark_runner:
+            metrics["timing_harness"] = "benchmark_runner"
+        else:
+            metrics["timing_harness"] = "duckdb_shell_tpch"
+            vanilla_shell = duckdb_shell(vanilla_source)
+            patched_shell = duckdb_shell(patched_source)
+            timing_dbs: dict[str, tuple[Path, Path]] = {}
+            for scale_factor in scale_factors:
+                scale_label = scale_factor.replace(".", "_")
+                vanilla_timing_db = tmp_root / f"vanilla_timing_sf{scale_label}.duckdb"
+                patched_timing_db = tmp_root / f"patched_timing_sf{scale_label}.duckdb"
+                prepare_tpch_database(
+                    vanilla_shell,
+                    vanilla_timing_db,
+                    cwd=vanilla_source,
+                    env=env,
+                    tmp_root=tmp_root / f"vanilla_timing_setup_sf{scale_label}",
+                    scale_factor=scale_factor,
+                )
+                prepare_tpch_database(
+                    patched_shell,
+                    patched_timing_db,
+                    cwd=patched_source,
+                    env=env,
+                    tmp_root=tmp_root / f"patched_timing_setup_sf{scale_label}",
+                    scale_factor=scale_factor,
+                )
+                timing_dbs[scale_factor] = (vanilla_timing_db, patched_timing_db)
+
+        speedups: list[float] = []
+        per_query: dict[str, dict[str, float]] = {}
+        for scale_factor, query_nr in timing_cases:
+            benchmark = benchmark_name(scale_factor, query_nr)
+            if use_benchmark_runner:
+                assert vanilla_runner is not None
+                assert patched_runner is not None
+                vanilla_time = run_benchmark(
+                    vanilla_runner,
+                    benchmark,
+                    cwd=vanilla_source,
+                    env=bench_env,
+                    tmp_root=tmp_root / "vanilla",
+                )
+                patched_time = run_benchmark(
+                    patched_runner,
+                    benchmark,
+                    cwd=patched_source,
+                    env=bench_env,
+                    tmp_root=tmp_root / "patched",
+                )
+            else:
+                vanilla_timing_db, patched_timing_db = timing_dbs[scale_factor]
+                vanilla_time, patched_time = run_paired_shell_tpch_timing(
+                    vanilla_shell,
+                    patched_shell,
+                    vanilla_timing_db,
+                    patched_timing_db,
+                    query_nr,
+                    vanilla_source=vanilla_source,
+                    patched_source=patched_source,
+                    env=bench_env,
+                    tmp_root=tmp_root / "paired_shell",
+                    repetitions=repetitions,
+                    patched_first=random.Random(f"{QUERY_ORDER_SEED}:pair:{scale_factor}:{query_nr}").choice(
+                        [False, True]
+                    ),
+                )
+            speedup = vanilla_time / patched_time if patched_time > 0 else 0.01
+            speedups.append(max(speedup, 0.01))
+            per_query[benchmark] = {
+                "vanilla_time": vanilla_time,
+                "patched_time": patched_time,
+                "speedup": speedup,
+            }
+
+        gm_speedup = geometric_mean(speedups)
+        bounded = score_from_speedup(gm_speedup)
+        metrics.update(
+            {
+                "full_benchmark": 1,
+                "benchmark_count": len(timing_cases),
+                "geomean_speedup": gm_speedup,
+            }
+        )
+        if _config_bool("FRONTIER_DUCKDB_EXPOSE_PER_QUERY", "expose_per_query_metrics", False):
+            metrics["per_query"] = per_query
+        return (
+            bounded,
+            bounded,
+            f"TPC-H geomean speedup {gm_speedup:.4f}x over vanilla DuckDB",
+            metrics,
+        )
+
+
+def evaluate(solution_path: str) -> tuple[float, float, str, dict[str, Any]]:
+    patch_path = Path(solution_path)
+    ok, message, metrics = validate_patch(patch_path)
+    if not ok:
+        return _invalid(message, metrics)
+    try:
+        return full_evaluation(patch_path, metrics)
+    except subprocess.TimeoutExpired:
+        return _invalid("DuckDB build or benchmark timed out", metrics)
+    except subprocess.CalledProcessError as exc:
+        failed_command = safe_failed_command(exc.cmd)
+        metrics["failed_command"] = failed_command
+        if failed_command.startswith("git "):
+            metrics["stderr_tail"] = sanitize_error_text(exc.stderr or "")
+        return _invalid("DuckDB build, patch apply, or benchmark command failed", metrics)
+    except Exception as exc:
+        metrics["error_type"] = type(exc).__name__
+        metrics["error_detail"] = safe_exception(exc)
+        return _invalid("evaluation failed", metrics)
+
+
+def main(argv: list[str]) -> int:
+    if len(argv) != 2:
+        print("Usage: evaluator.py SOLUTION_PATCH", file=sys.stderr)
+        return 2
+    score, score_unbounded, message, metrics = evaluate(argv[1])
+    print(
+        json.dumps(
+            {
+                "score": score,
+                "score_unbounded": score_unbounded,
+                "message": message,
+                "metrics": metrics,
+            },
+            indent=2,
+            sort_keys=True,
+        )
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main(sys.argv))
diff --git a/2.0/problems/duckdb_e2e_query_optimization/harbor/app/README.md b/2.0/problems/duckdb_e2e_query_optimization/harbor/app/README.md
new file mode 100644
index 00000000..1fbd114f
--- /dev/null
+++ b/2.0/problems/duckdb_e2e_query_optimization/harbor/app/README.md
@@ -0,0 +1,40 @@
+# DuckDB E2E Query Optimization Starter
+
+The workspace is expected to contain a pinned DuckDB checkout at:
+
+```text
+/app/duckdb
+```
+
+Modify DuckDB source code to improve end-to-end TPC-H performance while
+preserving correctness. Submit a patch against the clean checkout:
+
+```bash
+bash /app/make_submission.sh
+bash /app/submit.sh
+```
+
+Submit early. The judge queue is asynchronous, and a full local DuckDB build in
+the agent container can take long enough to exhaust the agent budget. It is
+better to enqueue a small buildable-looking patch first, then keep compiling,
+reviewing, or iterating while the judge scores that snapshot.
+
+The judge applies `/app/solution.patch` to a clean DuckDB tree, builds DuckDB
+with the TPC-H extension enabled, checks correctness, and scores relative
+speedup against vanilla DuckDB.
+
+The public workload family is DuckDB's built-in TPC-H q1 through q22 query set.
+In normal DuckDB development workflows these queries can be inspected or run
+through the TPC-H extension, including `CALL dbgen(...)` and `PRAGMA tpch(n)`.
+Use this as workload context, not as a license to hard-code table names, query
+numbers, or benchmark details.
+
+The Harbor budget is 8 vCPUs, 16 GiB memory, and 32 GiB storage. DuckDB query
+runs are single-threaded and use fixed judge settings: `memory_limit='6GB'`,
+`max_temp_directory_size='2GB'`, an isolated temp directory, and
+`preserve_insertion_order=false`. Build and query subprocesses run in a clean
+environment rather than inheriting terminal or Harbor environment variables.
+
+Only optimizer, join, filter, and narrowly related planning/execution changes
+are valid. New C++ files are allowed if the build-system diff only wires those
+files into existing DuckDB targets.
diff --git a/2.0/problems/duckdb_e2e_query_optimization/harbor/app/make_submission.sh b/2.0/problems/duckdb_e2e_query_optimization/harbor/app/make_submission.sh
new file mode 100755
index 00000000..fe2092ad
--- /dev/null
+++ b/2.0/problems/duckdb_e2e_query_optimization/harbor/app/make_submission.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+DUCKDB_DIR="${DUCKDB_DIR:-/app/duckdb}"
+OUT="${1:-/app/solution.patch}"
+
+if [[ ! -d "$DUCKDB_DIR/.git" ]]; then
+  echo "DuckDB checkout not found at $DUCKDB_DIR" >&2
+  exit 2
+fi
+
+git -C "$DUCKDB_DIR" diff --binary > "$OUT"
+bytes=$(wc -c < "$OUT" | tr -d ' ')
+echo "Wrote $OUT ($bytes bytes)"
diff --git a/2.0/problems/duckdb_e2e_query_optimization/harbor/app/solution.patch b/2.0/problems/duckdb_e2e_query_optimization/harbor/app/solution.patch
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/2.0/problems/duckdb_e2e_query_optimization/harbor/app/solution.patch
@@ -0,0 +1 @@
+
diff --git a/2.0/problems/duckdb_e2e_query_optimization/readme b/2.0/problems/duckdb_e2e_query_optimization/readme
new file mode 100644
index 00000000..d6d9cefa
--- /dev/null
+++ b/2.0/problems/duckdb_e2e_query_optimization/readme
@@ -0,0 +1,195 @@
+# DuckDB E2E Query Optimization
+
+## Problem
+
+This is an experimental systems task. You are given a pinned DuckDB checkout in
+the Harbor workspace and may modify DuckDB itself. Your goal is to improve
+end-to-end TPC-H style analytical query performance while preserving the
+correctness and generality of DuckDB's SQL execution.
+
+The intended optimization area is end-to-end analytical query optimization:
+join ordering, predicate transfer, join-side filtering, cardinality robustness,
+and closely related optimizer/execution wiring. Strong submissions should
+improve TPC-H geometric mean runtime without hard-coding TPC-H tables, queries,
+benchmark paths, or judge details.
+
+## Workload
+
+The public workload family is DuckDB's built-in TPC-H query set. The public
+scale factor is 1, and the experimental judge evaluates q1 through q22 with
+DuckDB's TPC-H extension, for example by generating data with `CALL dbgen(...)`
+and running queries through `PRAGMA tpch(n)`. DuckDB also exposes its TPC-H
+query text through the extension in normal DuckDB development workflows.
+
+Treat those queries as a representative analytical workload, not as strings to
+recognize. The judge may include additional non-public scale-factor groups, and
+it may vary query order, repetitions, and correctness coverage. Submissions
+should implement general optimizer and execution improvements rather than
+TPC-H-specific special cases.
+
+## Submission
+
+The submitted artifact is a patch file:
+
+```text
+/app/solution.patch
+```
+
+The agent workspace is expected to contain a DuckDB checkout at:
+
+```text
+/app/duckdb
+```
+
+After modifying DuckDB, generate and submit a patch:
+
+```bash
+bash /app/make_submission.sh
+bash /app/submit.sh
+```
+
+Submissions are asynchronous. For this task, submit an initial small, plausible
+patch as soon as it is generated, then continue local review or compilation
+while the judge works. A full local DuckDB build in the agent container can
+consume most of the agent budget and is not required before the first
+submission; the judge builds the submitted patch from a clean prebuilt source
+tree with fixed resource settings.
+
+The judge applies the patch to a clean pinned DuckDB source tree, builds DuckDB
+with the TPC-H extension enabled, and times the evaluated TPC-H queries from
+the judge side. Submitted binaries, build artifacts, generated benchmark files,
+and local timing logs are ignored.
+
+## Correctness
+
+Correctness is a gate. Patched DuckDB must produce the same SQL results as
+vanilla DuckDB on the evaluated workload. Build failures, patch failures,
+incorrect query results, crashes, timeouts, and out-of-memory failures are
+penalized before performance is considered.
+
+The experimental evaluator currently encodes the patch policy, timing
+orchestration, and vanilla-vs-patched TPC-H result comparison inside the custom
+judge image. DuckDB's broader SQLLogicTest/unit-test tooling is a natural future
+hardening step, but the current score path already gates on query results before
+timing is considered.
+
+## Scoring
+
+Valid submissions are scored by speedup relative to vanilla DuckDB on the same
+hardware, same benchmark list, same clean environment, and same resource limits.
+For each benchmark query:
+
+```text
+speedup = vanilla_time / patched_time
+```
+
+The primary objective is the geometric mean of per-query speedups. The bounded
+score is derived from that geometric mean so that a 1.0x result earns 0 points,
+regressions also earn 0 points, and broad speedups are preferred over a single
+large outlier. The raw geometric mean speedup is reported in evaluator metrics.
+
+## Patch Policy
+
+The evaluator validates the patch before building. The policy is intentionally
+stricter than normal DuckDB development because this task is graded by hidden
+benchmarks.
+
+Allowed optimizer and execution areas:
+
+```text
+src/optimizer/**
+src/include/duckdb/optimizer/**
+src/execution/operator/join/**
+src/include/duckdb/execution/operator/join/**
+src/execution/operator/filter/**
+src/include/duckdb/execution/operator/filter/**
+src/planner/operator/logical_join.cpp
+src/planner/operator/logical_comparison_join.cpp
+src/include/duckdb/planner/operator/logical_join.hpp
+src/include/duckdb/planner/operator/logical_comparison_join.hpp
+```
+
+Conditionally allowed narrow wiring areas:
+
+```text
+src/planner/**
+src/include/duckdb/planner/**
+src/execution/physical_plan/**
+src/include/duckdb/execution/physical_plan/**
+src/common/**
+src/include/duckdb/common/**
+```
+
+New C++ and header files are allowed in these areas. Build-system edits are
+allowed only to add newly introduced `.cpp` files to existing DuckDB build
+targets. Build changes that alter compiler flags, link flags, targets,
+generated code, install paths, external dependencies, benchmark behavior, or
+runtime paths are invalid.
+
+Allowed build-system files:
+
+```text
+CMakeLists.txt
+src/CMakeLists.txt
+src/optimizer/CMakeLists.txt
+src/optimizer/**/CMakeLists.txt
+src/planner/CMakeLists.txt
+src/planner/**/CMakeLists.txt
+src/planner/operator/CMakeLists.txt
+src/common/CMakeLists.txt
+src/common/**/CMakeLists.txt
+src/execution/CMakeLists.txt
+src/execution/operator/CMakeLists.txt
+src/execution/operator/join/CMakeLists.txt
+src/execution/operator/join/**/CMakeLists.txt
+src/execution/operator/filter/CMakeLists.txt
+src/execution/operator/filter/**/CMakeLists.txt
+src/execution/physical_plan/CMakeLists.txt
+src/execution/physical_plan/**/CMakeLists.txt
+extension_config.cmake
+```
+
+Forbidden areas include benchmark files, tests, scripts, TPC-H extension code,
+third-party code, parser/catalog/storage internals, DuckDB main/client context,
+documentation, examples, CI files, package manifests, and timing/scoring code.
+
+Patches may not add environment-variable reads or writes. The judge builds and
+runs both vanilla and patched DuckDB under a minimal fixed environment with no
+Frontier, Harbor, judge, secret, or submission-specific environment variables
+inherited by benchmark subprocesses.
+
+Patches also may not hard-code TPC-H table names, query numbers, benchmark
+names, or benchmark paths in optimizer/execution code.
+
+## Resource Budget
+
+The experimental Harbor budget is:
+
+```text
+vCPUs: 8
+memory: 16 GiB
+storage: 32 GiB
+build timeout: 7200 seconds
+query timeout: 300 seconds
+DuckDB child process address-space limit: 12288 MiB
+```
+
+The judge runs each build and benchmark step in a subprocess with a clean
+environment. DuckDB benchmark runs are configured with fixed thread, memory,
+and temporary-directory limits in the judge image to reduce the chance that a
+bad plan crashes the judge service.
+
+The evaluator runs both vanilla and patched DuckDB with the same SQL runtime
+settings before data generation, correctness checks, and timing:
+
+```sql
+SET threads = 1;
+SET memory_limit = '6GB';
+SET max_temp_directory_size = '2GB';
+SET temp_directory = '<judge temp directory>';
+SET preserve_insertion_order = false;
+```
+
+Subprocesses receive a minimal fixed environment with only basic execution
+paths, locale settings, an isolated home directory, an isolated temp directory,
+and an isolated ccache directory.
diff --git a/2.0/problems/duckdb_e2e_query_optimization/reference.patch b/2.0/problems/duckdb_e2e_query_optimization/reference.patch
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/2.0/problems/duckdb_e2e_query_optimization/reference.patch
@@ -0,0 +1 @@
+
diff --git a/2.0/problems/duckdb_e2e_query_optimization/reference.py b/2.0/problems/duckdb_e2e_query_optimization/reference.py
new file mode 100644
index 00000000..1db9ed65
--- /dev/null
+++ b/2.0/problems/duckdb_e2e_query_optimization/reference.py
@@ -0,0 +1,6 @@
+"""Reference placeholder for the experimental DuckDB E2E query optimization task.
+
+The Harbor task submits /app/solution.patch. This Python file exists so the
+Frontier-CS 2.0 task layout remains conventional; the valid baseline patch is
+stored in reference.patch.
+"""