Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions 2.0/problems/duckdb_e2e_query_optimization/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,12 @@ environment:
build_timeout_seconds: 7200
evaluation:
scale_factor: 1
agent_scale_factors:
- 1
benchmark_repetitions: 3
build_timeout_seconds: 7200
query_timeout_seconds: 300
sqllogictest_timeout_seconds: 600
duckdb_memory_limit: "6GB"
duckdb_temp_limit: "2GB"
child_memory_mb: 12288
Expand Down
78 changes: 68 additions & 10 deletions 2.0/problems/duckdb_e2e_query_optimization/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,11 @@ def _config_bool(env_name: str, config_name: str, default: bool) -> bool:

# Evaluator limits. Each value is resolved through a _config_* helper that is
# given an environment-variable name, an evaluation-config key, and a fallback
# default (presumably consulted in that order -- TODO confirm against the
# helper definitions, which are outside this view).
BUILD_TIMEOUT_SECONDS = _config_int("FRONTIER_DUCKDB_BUILD_TIMEOUT", "build_timeout_seconds", 3600)
QUERY_TIMEOUT_SECONDS = _config_int("FRONTIER_DUCKDB_QUERY_TIMEOUT", "query_timeout_seconds", 300)
# Timeout applied to each individual SQLLogicTest file run during the
# final-role smoke check (one runner invocation per file).
SQLLOGICTEST_TIMEOUT_SECONDS = _config_int(
"FRONTIER_DUCKDB_SQLLOGICTEST_TIMEOUT",
"sqllogictest_timeout_seconds",
600,
)
DUCKDB_MEMORY_LIMIT = _config_str("FRONTIER_DUCKDB_MEMORY_LIMIT", "duckdb_memory_limit", "6GB")
DUCKDB_TEMP_LIMIT = _config_str("FRONTIER_DUCKDB_TEMP_LIMIT", "duckdb_temp_limit", "2GB")
CHILD_MEMORY_LIMIT_MB = _config_int("FRONTIER_DUCKDB_CHILD_MEMORY_MB", "child_memory_mb", 12288)
Expand All @@ -76,7 +81,15 @@ def _config_bool(env_name: str, config_name: str, default: bool) -> bool:

# Locations of the built binaries, relative to a DuckDB source checkout.
BENCHMARK_RUNNER_REL = Path("build/release/benchmark/benchmark_runner")
DUCKDB_SHELL_REL = Path("build/release/duckdb")
# Unittest binary used to execute the SQLLogicTest smoke files below.
DUCKDB_UNITTEST_REL = Path("build/release/test/unittest")
# Query numbers 1 through 22 inclusive (presumably the TPC-H query set).
DEFAULT_CORRECTNESS_QUERIES = tuple(range(1, 23))
# SQLLogicTest files run one at a time for final-role submissions. Paths are
# relative to the source checkout, which is used as the runner's working
# directory.
FINAL_SQLLOGICTEST_FILES = (
"test/sql/join/inner/test_join.test",
"test/sql/join/inner/test_inner_join_filter_pushdown.test",
"test/sql/join/semianti/semijoin.test",
"test/sql/filter/test_transitive_filters.test",
"test/sql/aggregate/distinct/test_distinct.test",
)

STRONGLY_ALLOWED_PATTERNS = (
"src/optimizer/**",
Expand Down Expand Up @@ -480,12 +493,15 @@ def _parse_csv_or_list(raw: Any) -> tuple[str, ...]:
return ()


def evaluation_scale_factors() -> tuple[str, ...]:
configured = _parse_csv_or_list(EVALUATION_CONFIG.get("scale_factors"))
def evaluation_scale_factors(*, final_role: bool) -> tuple[str, ...]:
config_key = "scale_factors" if final_role else "agent_scale_factors"
configured = _parse_csv_or_list(EVALUATION_CONFIG.get(config_key))
if configured:
raw_values = configured
else:
elif final_role:
raw_values = (PUBLIC_SCALE_FACTOR, *DEFAULT_HIDDEN_SCALE_FACTORS)
else:
raw_values = (PUBLIC_SCALE_FACTOR,)
seen: set[str] = set()
result: list[str] = []
for value in raw_values:
Expand Down Expand Up @@ -522,16 +538,19 @@ def restore_prebuilt_source(source_dir: Path, env: dict[str, str]) -> bool:
return bool(status_before.strip()) or bool(clean.strip())


def build_duckdb(source_dir: Path, env: dict[str, str]) -> None:
def build_duckdb(source_dir: Path, env: dict[str, str], *, include_unittest: bool = False) -> None:
build_env = dict(env)
build_env["GEN"] = "ninja"
build_env["BUILD_BENCHMARK"] = "1"
build_env["BUILD_EXTENSIONS"] = "tpch"
build_env.setdefault("BUILD_UNITTESTS", "0")
build_env.setdefault("BUILD_UNITTESTS", "1" if include_unittest else "0")
build_env.setdefault("DISABLE_UNITY", "1")
build_env.setdefault("DISABLE_PARQUET", "1")
build_env.setdefault("BUILD_JEMALLOC", "0")
build_env.setdefault("CMAKE_BUILD_PARALLEL_LEVEL", "1")
targets = ["duckdb", "benchmark_runner"]
if include_unittest:
targets.append("unittest")
run_checked(
[
"cmake",
Expand All @@ -540,8 +559,7 @@ def build_duckdb(source_dir: Path, env: dict[str, str]) -> None:
"--config",
"Release",
"--target",
"duckdb",
"benchmark_runner",
*targets,
],
cwd=source_dir,
env=build_env,
Expand All @@ -563,6 +581,13 @@ def duckdb_shell(source_dir: Path) -> Path:
return shell


def duckdb_unittest(source_dir: Path) -> Path:
    """Return the path of the unittest binary built under *source_dir*.

    Raises:
        FileNotFoundError: if nothing exists at the expected location.
    """
    candidate = source_dir / DUCKDB_UNITTEST_REL
    if candidate.exists():
        return candidate
    raise FileNotFoundError(f"DuckDB unittest runner not found at {candidate}")


def benchmark_env(base_env: dict[str, str], tmp_root: Path) -> dict[str, str]:
env = dict(base_env)
env["DUCKDB_BENCHMARK_MEMORY_LIMIT"] = DUCKDB_MEMORY_LIMIT
Expand All @@ -584,6 +609,30 @@ def settings_sql(temp_dir: Path) -> str:
)


def is_final_submission_role() -> bool:
    """Report whether FRONTIER_SUBMISSION_ROLE selects the final role.

    An unset variable is treated as the "agent" role, so only the exact
    value "final" yields True.
    """
    role = os.environ.get("FRONTIER_SUBMISSION_ROLE", "agent")
    return role == "final"


def run_final_sqllogictest_smoke(
    source_dir: Path,
    *,
    env: dict[str, str],
    metrics: dict[str, Any],
) -> None:
    """Run every file in FINAL_SQLLOGICTEST_FILES through the unittest binary.

    Each file is executed as its own checked subprocess with *source_dir* as
    the working directory and SQLLOGICTEST_TIMEOUT_SECONDS as the per-file
    timeout. The three ``final_sqllogictest*`` metrics are written only after
    every file has run without ``run_checked`` raising; an exception from the
    runner lookup or from any run propagates and leaves *metrics* untouched.
    """
    unittest_binary = str(duckdb_unittest(source_dir))
    started_at = time.perf_counter()
    for sqllogictest_path in FINAL_SQLLOGICTEST_FILES:
        command = [unittest_binary, sqllogictest_path]
        run_checked(
            command,
            cwd=source_dir,
            env=env,
            timeout=SQLLOGICTEST_TIMEOUT_SECONDS,
        )
    metrics.update(
        {
            "final_sqllogictest": 1,
            "final_sqllogictest_count": len(FINAL_SQLLOGICTEST_FILES),
            "final_sqllogictest_seconds": round(time.perf_counter() - started_at, 3),
        }
    )


def run_duckdb_sql(
shell: Path,
database: Path,
Expand Down Expand Up @@ -865,6 +914,8 @@ def safe_failed_command(cmd: Any) -> str:
return " ".join(str(part) for part in cmd[:3])
if executable == "cmake":
return "cmake build"
if executable == "unittest":
return "duckdb sqllogictest"
if executable in {"duckdb", "benchmark_runner"}:
return executable
return executable or "subprocess"
Expand All @@ -877,7 +928,9 @@ def safe_exception(exc: Exception) -> str:


def full_evaluation(patch_path: Path, metrics: dict[str, Any]):
scale_factors = evaluation_scale_factors()
final_role = is_final_submission_role()
metrics["submission_role"] = "final" if final_role else "agent"
scale_factors = evaluation_scale_factors(final_role=final_role)
benchmark_queries = benchmark_query_numbers()
correctness_query_set = correctness_queries()
correctness_cases = shuffled(
Expand Down Expand Up @@ -907,11 +960,13 @@ def full_evaluation(patch_path: Path, metrics: dict[str, Any]):
metrics["used_prebuilt_empty_patch"] = 1
if restored_source:
metrics["rebuilt_after_source_restore"] = 1
build_duckdb(patched_source, env)
build_duckdb(patched_source, env, include_unittest=final_role)
elif final_role:
build_duckdb(patched_source, env, include_unittest=True)
else:
run_checked(["git", "apply", "--check", str(patch_path)], cwd=patched_source, env=env, timeout=60)
run_checked(["git", "apply", str(patch_path)], cwd=patched_source, env=env, timeout=60)
build_duckdb(patched_source, env)
build_duckdb(patched_source, env, include_unittest=final_role)

vanilla_source = DEFAULT_VANILLA_SOURCE if DEFAULT_VANILLA_SOURCE.exists() else DEFAULT_CLEAN_SOURCE
mismatches: dict[str, str] = {}
Expand All @@ -934,6 +989,9 @@ def full_evaluation(patch_path: Path, metrics: dict[str, Any]):
metrics["correctness_mismatch_count"] = len(mismatches)
return _invalid("patched DuckDB produced incorrect TPC-H results", metrics)

if final_role:
run_final_sqllogictest_smoke(patched_source, env=env, metrics=metrics)

if USE_BENCHMARK_RUNNER:
vanilla_runner = benchmark_runner(vanilla_source)
patched_runner = benchmark_runner(patched_source)
Expand Down
7 changes: 5 additions & 2 deletions 2.0/problems/duckdb_e2e_query_optimization/readme
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,11 @@ penalized before performance is considered.

The experimental evaluator currently encodes the patch policy, timing
orchestration, and vanilla-vs-patched TPC-H result comparison inside the custom
judge image. DuckDB's broader SQLLogicTest/unit-test tooling is a natural future
hardening step, but the current score path already gates on query results before
judge image. During iterative asynchronous submissions, the judge keeps feedback
focused on the public scale-factor TPC-H gate so agents can submit early and
continue working while evaluation runs. During final verification, the judge
uses the broader hidden scale-factor set and also runs a small DuckDB
SQLLogicTest smoke set covering join, filter, and aggregate behavior before
timing is considered.

## Scoring
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import traceback
import threading
import secrets
import uuid
from collections import deque
from datetime import datetime, timezone
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
Expand Down Expand Up @@ -291,6 +292,19 @@ def evaluate_archive(archive_b64: str, *, submission_role: str = "agent") -> dic
return evaluate_path(root, submission_role=submission_role)


def load_best_submission_payload(submission_uuid: str) -> tuple[dict[str, Any], str]:
    """Load the stored best-submission payload and re-tag it for a final run.

    Returns a copy of the stored JSON object with ``submission_role`` forced
    to "final" and ``submission_uuid`` replaced by *submission_uuid*, together
    with the payload's ``submission_kind`` as a string ("file" when absent).

    Raises:
        FileNotFoundError: if the payload file does not exist.
        ValueError: if the file's top-level JSON value is not an object.
    """
    if not BEST_SUBMISSION_PAYLOAD.exists():
        raise FileNotFoundError("best submission payload not found")
    stored = json.loads(BEST_SUBMISSION_PAYLOAD.read_text(encoding="utf-8"))
    if not isinstance(stored, dict):
        raise ValueError("best submission payload is invalid")
    final_payload = {
        **stored,
        "submission_role": "final",
        "submission_uuid": submission_uuid,
    }
    return final_payload, str(final_payload.get("submission_kind", "file"))


def validate_payload(payload: dict[str, Any], *, allow_final: bool, role_token: str = "") -> tuple[str, str, str]:
submission_uuid = str(payload.get("submission_uuid") or "")
if not submission_uuid:
Expand Down Expand Up @@ -432,6 +446,9 @@ def do_POST(self) -> None:
submission_uuid = parsed.path.removeprefix("/submission/").removesuffix("/cancel").strip("/")
self.handle_cancel(submission_uuid)
return
if parsed.path == "/evaluate_best":
self.handle_evaluate_best()
return
if parsed.path != "/evaluate":
self._write_json(404, {"status": "error", "error": "not found"})
return
Expand Down Expand Up @@ -513,6 +530,61 @@ def read_json_body(self) -> dict[str, Any]:
raise ValueError("request body must be a JSON object")
return payload

def handle_evaluate_best(self) -> None:
    """Re-evaluate the stored best submission under the final role.

    Replies 503 while the judge is not ready. Otherwise the request must be a
    JSON object with ``submission_role == "final"`` and must carry the final
    role token in the ``X-Frontier-CS-Role-Token`` header. The stored payload
    is loaded, run with the final role, logged, and returned with HTTP 200.

    Any failure during validation, loading, or evaluation is printed as a
    traceback and reported to the caller as a generic zero-score error result,
    still with HTTP 200, so no internal detail leaks into the response.

    The outcome is logged and written exactly once. Only validation and
    evaluation run inside the ``try``; a failure while logging or sending a
    successful result therefore no longer triggers a second, contradictory
    log record and a second response on the same connection.
    """
    if not READY:
        self._write_json(
            503,
            {
                "status": "error",
                "score": 0.0,
                "score_unbounded": 0.0,
                "message": "judge is not ready",
                "health": READY_PAYLOAD,
            },
        )
        return

    # Defaults used in the log record when the request fails before the real
    # values are known.
    submission_uuid = ""
    submission_kind = "file"
    try:
        request_payload = self.read_json_body()
        if request_payload.get("submission_role") != "final":
            raise PermissionError("best-submission rerun must use final role")
        # NOTE(review): if FINAL_ROLE_TOKEN can ever be an empty string, a
        # request without the header would pass this check -- confirm the
        # token is always non-empty where it is defined.
        if not secrets.compare_digest(
            self.headers.get("X-Frontier-CS-Role-Token", ""),
            FINAL_ROLE_TOKEN,
        ):
            raise PermissionError("final evaluation role is verifier-only")
        submission_uuid = str(request_payload.get("submission_uuid") or uuid.uuid4())
        payload, submission_kind = load_best_submission_payload(submission_uuid)
        result = run_payload(payload, submission_role="final")
        # A malformed result must take the error path here rather than fail
        # later while the log record is being assembled.
        if not isinstance(result, dict):
            raise TypeError("evaluation result must be a JSON object")
    except Exception:
        print(traceback.format_exc(), flush=True)
        result = {
            "status": "error",
            "score": 0.0,
            "score_unbounded": 0.0,
            "message": "best iterative evaluation failed",
        }

    log_submission(
        {
            "submission_uuid": submission_uuid,
            "submission_role": "final",
            "submission_kind": submission_kind,
            **result,
        }
    )
    self._write_json(200, result)

def handle_submit(self) -> None:
if not READY:
self._write_json(503, {"status": "error", "error": "judge is not ready", "health": READY_PAYLOAD})
Expand Down
Loading
Loading