FrontierCS · joyemang33 · Jun 5, 2026 · Jun 5, 2026
diff --git a/2.0/problems/duckdb_e2e_query_optimization/config.yaml b/2.0/problems/duckdb_e2e_query_optimization/config.yaml
@@ -36,9 +36,12 @@ environment:
   build_timeout_seconds: 7200
 evaluation:
   scale_factor: 1
+  agent_scale_factors:
+    - 1
   benchmark_repetitions: 3
   build_timeout_seconds: 7200
   query_timeout_seconds: 300
+  sqllogictest_timeout_seconds: 600
   duckdb_memory_limit: "6GB"
   duckdb_temp_limit: "2GB"
   child_memory_mb: 12288

diff --git a/2.0/problems/duckdb_e2e_query_optimization/evaluator.py b/2.0/problems/duckdb_e2e_query_optimization/evaluator.py
@@ -63,6 +63,11 @@ def _config_bool(env_name: str, config_name: str, default: bool) -> bool:
 
 BUILD_TIMEOUT_SECONDS = _config_int("FRONTIER_DUCKDB_BUILD_TIMEOUT", "build_timeout_seconds", 3600)
 QUERY_TIMEOUT_SECONDS = _config_int("FRONTIER_DUCKDB_QUERY_TIMEOUT", "query_timeout_seconds", 300)
+SQLLOGICTEST_TIMEOUT_SECONDS = _config_int(  # timeout for each sqllogictest runner invocation
+    "FRONTIER_DUCKDB_SQLLOGICTEST_TIMEOUT",
+    "sqllogictest_timeout_seconds",
+    600,
+)
 DUCKDB_MEMORY_LIMIT = _config_str("FRONTIER_DUCKDB_MEMORY_LIMIT", "duckdb_memory_limit", "6GB")
 DUCKDB_TEMP_LIMIT = _config_str("FRONTIER_DUCKDB_TEMP_LIMIT", "duckdb_temp_limit", "2GB")
 CHILD_MEMORY_LIMIT_MB = _config_int("FRONTIER_DUCKDB_CHILD_MEMORY_MB", "child_memory_mb", 12288)
@@ -76,7 +81,15 @@ def _config_bool(env_name: str, config_name: str, default: bool) -> bool:
 
 BENCHMARK_RUNNER_REL = Path("build/release/benchmark/benchmark_runner")
 DUCKDB_SHELL_REL = Path("build/release/duckdb")
+DUCKDB_UNITTEST_REL = Path("build/release/test/unittest")  # unittest binary, relative to the DuckDB source dir
 DEFAULT_CORRECTNESS_QUERIES = tuple(range(1, 23))
+FINAL_SQLLOGICTEST_FILES = (  # run one file per unittest invocation, with the source dir as cwd
+    "test/sql/join/inner/test_join.test",
+    "test/sql/join/inner/test_inner_join_filter_pushdown.test",
+    "test/sql/join/semianti/semijoin.test",
+    "test/sql/filter/test_transitive_filters.test",
+    "test/sql/aggregate/distinct/test_distinct.test",
+)
 
 STRONGLY_ALLOWED_PATTERNS = (
     "src/optimizer/**",
@@ -480,12 +493,15 @@ def _parse_csv_or_list(raw: Any) -> tuple[str, ...]:
     return ()
 
 
-def evaluation_scale_factors() -> tuple[str, ...]:
-    configured = _parse_csv_or_list(EVALUATION_CONFIG.get("scale_factors"))
+def evaluation_scale_factors(*, final_role: bool) -> tuple[str, ...]:
+    config_key = "scale_factors" if final_role else "agent_scale_factors"
+    configured = _parse_csv_or_list(EVALUATION_CONFIG.get(config_key))
     if configured:
         raw_values = configured
-    else:
+    elif final_role:
         raw_values = (PUBLIC_SCALE_FACTOR, *DEFAULT_HIDDEN_SCALE_FACTORS)
+    else:
+        raw_values = (PUBLIC_SCALE_FACTOR,)
     seen: set[str] = set()
     result: list[str] = []
     for value in raw_values:
@@ -522,16 +538,19 @@ def restore_prebuilt_source(source_dir: Path, env: dict[str, str]) -> bool:
     return bool(status_before.strip()) or bool(clean.strip())
 
 
-def build_duckdb(source_dir: Path, env: dict[str, str]) -> None:
+def build_duckdb(source_dir: Path, env: dict[str, str], *, include_unittest: bool = False) -> None:
     build_env = dict(env)
     build_env["GEN"] = "ninja"
     build_env["BUILD_BENCHMARK"] = "1"
     build_env["BUILD_EXTENSIONS"] = "tpch"
-    build_env.setdefault("BUILD_UNITTESTS", "0")
+    build_env.setdefault("BUILD_UNITTESTS", "1" if include_unittest else "0")
     build_env.setdefault("DISABLE_UNITY", "1")
     build_env.setdefault("DISABLE_PARQUET", "1")
     build_env.setdefault("BUILD_JEMALLOC", "0")
     build_env.setdefault("CMAKE_BUILD_PARALLEL_LEVEL", "1")
+    targets = ["duckdb", "benchmark_runner"]
+    if include_unittest:
+        targets.append("unittest")
     run_checked(
         [
             "cmake",
@@ -540,8 +559,7 @@ def build_duckdb(source_dir: Path, env: dict[str, str]) -> None:
             "--config",
             "Release",
             "--target",
-            "duckdb",
-            "benchmark_runner",
+            *targets,
         ],
         cwd=source_dir,
         env=build_env,
@@ -563,6 +581,13 @@ def duckdb_shell(source_dir: Path) -> Path:
     return shell
 
 
+def duckdb_unittest(source_dir: Path) -> Path:
+    """Return the built unittest binary under source_dir; raise FileNotFoundError if it is missing."""
+    if (unittest_path := source_dir / DUCKDB_UNITTEST_REL).exists():
+        return unittest_path
+    raise FileNotFoundError(f"DuckDB unittest runner not found at {unittest_path}")
+
+
 def benchmark_env(base_env: dict[str, str], tmp_root: Path) -> dict[str, str]:
     env = dict(base_env)
     env["DUCKDB_BENCHMARK_MEMORY_LIMIT"] = DUCKDB_MEMORY_LIMIT
@@ -584,6 +609,30 @@ def settings_sql(temp_dir: Path) -> str:
     )
 
 
+def is_final_submission_role() -> bool:  # True only when FRONTIER_SUBMISSION_ROLE == "final"; unset defaults to "agent"
+    return os.environ.get("FRONTIER_SUBMISSION_ROLE", "agent") == "final"
+
+
+def run_final_sqllogictest_smoke(
+    source_dir: Path,
+    *,
+    env: dict[str, str],
+    metrics: dict[str, Any],
+) -> None:
+    """Run every FINAL_SQLLOGICTEST_FILES entry through the unittest runner and record metrics.
+
+    The three final_sqllogictest* metrics are written only after the whole
+    loop has completed; elapsed time covers all runner invocations.
+    """
+    unittest_bin = str(duckdb_unittest(source_dir))
+    started = time.perf_counter()
+    for sqllogic_file in FINAL_SQLLOGICTEST_FILES:
+        run_checked([unittest_bin, sqllogic_file], cwd=source_dir, env=env, timeout=SQLLOGICTEST_TIMEOUT_SECONDS)
+    metrics["final_sqllogictest"] = 1
+    metrics["final_sqllogictest_count"] = len(FINAL_SQLLOGICTEST_FILES)
+    metrics["final_sqllogictest_seconds"] = round(time.perf_counter() - started, 3)
+
+
 def run_duckdb_sql(
     shell: Path,
     database: Path,
@@ -865,6 +914,8 @@ def safe_failed_command(cmd: Any) -> str:
         return " ".join(str(part) for part in cmd[:3])
     if executable == "cmake":
         return "cmake build"
+    if executable == "unittest":
+        return "duckdb sqllogictest"
     if executable in {"duckdb", "benchmark_runner"}:
         return executable
     return executable or "subprocess"
@@ -877,7 +928,9 @@ def safe_exception(exc: Exception) -> str:
 
 
 def full_evaluation(patch_path: Path, metrics: dict[str, Any]):
-    scale_factors = evaluation_scale_factors()
+    final_role = is_final_submission_role()
+    metrics["submission_role"] = "final" if final_role else "agent"
+    scale_factors = evaluation_scale_factors(final_role=final_role)
     benchmark_queries = benchmark_query_numbers()
     correctness_query_set = correctness_queries()
     correctness_cases = shuffled(
@@ -907,11 +960,13 @@ def full_evaluation(patch_path: Path, metrics: dict[str, Any]):
             metrics["used_prebuilt_empty_patch"] = 1
             if restored_source:
                 metrics["rebuilt_after_source_restore"] = 1
-                build_duckdb(patched_source, env)
+                build_duckdb(patched_source, env, include_unittest=final_role)
+            elif final_role:
+                build_duckdb(patched_source, env, include_unittest=True)
         else:
             run_checked(["git", "apply", "--check", str(patch_path)], cwd=patched_source, env=env, timeout=60)
             run_checked(["git", "apply", str(patch_path)], cwd=patched_source, env=env, timeout=60)
-            build_duckdb(patched_source, env)
+            build_duckdb(patched_source, env, include_unittest=final_role)
 
         vanilla_source = DEFAULT_VANILLA_SOURCE if DEFAULT_VANILLA_SOURCE.exists() else DEFAULT_CLEAN_SOURCE
         mismatches: dict[str, str] = {}
@@ -934,6 +989,9 @@ def full_evaluation(patch_path: Path, metrics: dict[str, Any]):
             metrics["correctness_mismatch_count"] = len(mismatches)
             return _invalid("patched DuckDB produced incorrect TPC-H results", metrics)
 
+        if final_role:
+            run_final_sqllogictest_smoke(patched_source, env=env, metrics=metrics)
+
         if USE_BENCHMARK_RUNNER:
             vanilla_runner = benchmark_runner(vanilla_source)
             patched_runner = benchmark_runner(patched_source)

diff --git a/2.0/problems/duckdb_e2e_query_optimization/readme b/2.0/problems/duckdb_e2e_query_optimization/readme
@@ -69,8 +69,11 @@ penalized before performance is considered.
 
 The experimental evaluator currently encodes the patch policy, timing
 orchestration, and vanilla-vs-patched TPC-H result comparison inside the custom
-judge image. DuckDB's broader SQLLogicTest/unit-test tooling is a natural future
-hardening step, but the current score path already gates on query results before
+judge image. During iterative asynchronous submissions, the judge keeps feedback
+focused on the public scale-factor TPC-H gate so agents can submit early and
+continue working while evaluation runs. During final verification, the judge
+evaluates the public plus hidden scale factors and also runs a small DuckDB
+SQLLogicTest smoke set covering join, filter, and aggregate behavior before
 timing is considered.
 
 ## Scoring

diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/judge_server.py b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/judge_server.py
@@ -15,6 +15,7 @@
 import traceback
 import threading
 import secrets
+import uuid
 from collections import deque
 from datetime import datetime, timezone
 from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
@@ -291,6 +292,19 @@ def evaluate_archive(archive_b64: str, *, submission_role: str = "agent") -> dic
         return evaluate_path(root, submission_role=submission_role)
 
 
+def load_best_submission_payload(submission_uuid: str) -> tuple[dict[str, Any], str]:
+    """Load the stored best-submission payload, stamped as a final-role request for submission_uuid."""
+    if not BEST_SUBMISSION_PAYLOAD.exists():
+        raise FileNotFoundError("best submission payload not found")
+    stored = json.loads(BEST_SUBMISSION_PAYLOAD.read_text(encoding="utf-8"))
+    if not isinstance(stored, dict):
+        raise ValueError("best submission payload is invalid")
+    # The overrides come last so they replace any role/uuid saved in the file.
+    payload = {**stored, "submission_role": "final", "submission_uuid": submission_uuid}
+    submission_kind = str(payload.get("submission_kind", "file"))
+    return payload, submission_kind
+
+
 def validate_payload(payload: dict[str, Any], *, allow_final: bool, role_token: str = "") -> tuple[str, str, str]:
     submission_uuid = str(payload.get("submission_uuid") or "")
     if not submission_uuid:
@@ -432,6 +446,9 @@ def do_POST(self) -> None:
             submission_uuid = parsed.path.removeprefix("/submission/").removesuffix("/cancel").strip("/")
             self.handle_cancel(submission_uuid)
             return
+        if parsed.path == "/evaluate_best":
+            self.handle_evaluate_best()
+            return
         if parsed.path != "/evaluate":
             self._write_json(404, {"status": "error", "error": "not found"})
             return
@@ -513,6 +530,61 @@ def read_json_body(self) -> dict[str, Any]:
             raise ValueError("request body must be a JSON object")
         return payload
 
+    def handle_evaluate_best(self) -> None:
+        """Re-run the stored best submission as a final-role evaluation.
+
+        Responds 503 while READY is false. Otherwise every outcome, including a
+        rejected role or token, is logged and answered with HTTP 200.
+        """
+        if not READY:
+            self._write_json(
+                503,
+                {
+                    "status": "error",
+                    "score": 0.0,
+                    "score_unbounded": 0.0,
+                    "message": "judge is not ready",
+                    "health": READY_PAYLOAD,
+                },
+            )
+            return
+
+        submission_uuid = ""
+        submission_kind = "file"
+
+        def record_and_reply(outcome: dict[str, Any]) -> None:
+            # submission_uuid / submission_kind are read when this runs, so an
+            # early failure is logged with the defaults assigned above.
+            entry = {
+                "submission_uuid": submission_uuid,
+                "submission_role": "final",
+                "submission_kind": submission_kind,
+                **outcome,
+            }
+            log_submission(entry)
+            self._write_json(200, outcome)
+
+        try:
+            body = self.read_json_body()
+            if body.get("submission_role") != "final":
+                raise PermissionError("best-submission rerun must use final role")
+            presented_token = self.headers.get("X-Frontier-CS-Role-Token", "")
+            if not secrets.compare_digest(presented_token, FINAL_ROLE_TOKEN):
+                raise PermissionError("final evaluation role is verifier-only")
+            submission_uuid = str(body.get("submission_uuid") or uuid.uuid4())
+            payload, submission_kind = load_best_submission_payload(submission_uuid)
+            record_and_reply(run_payload(payload, submission_role="final"))
+        except Exception:
+            print(traceback.format_exc(), flush=True)
+            record_and_reply(
+                {
+                    "status": "error",
+                    "score": 0.0,
+                    "score_unbounded": 0.0,
+                    "message": "best iterative evaluation failed",
+                }
+            )
+
     def handle_submit(self) -> None:
         if not READY:
             self._write_json(503, {"status": "error", "error": "judge is not ready", "health": READY_PAYLOAD})