Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 100 additions & 1 deletion src/programbench/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,53 @@

import logging
import subprocess
import tempfile
import time
import uuid
from pathlib import Path
from typing import Any
from typing import Any, Protocol

from programbench.constants import DOCKER_CP_TIMEOUT, DOCKER_RUN_TIMEOUT

log = logging.getLogger(__name__)


class Environment(Protocol):
"""The minimal surface ``Evaluator`` needs from a per-task container.

Implementations don't have to inherit from this — duck-typed protocols
are fine — but you do have to provide every method below. ``cwd`` and
``default_timeout`` are read directly by ``Evaluator`` so they need to
be regular attributes.
"""

cwd: str
default_timeout: int

def execute(self, command: str, *, timeout: int | None = None) -> dict[str, Any]: ...
def copy_in(self, local_path: Path, container_path: str) -> None: ...
def copy_in_tar(self, tar_path: Path, container_path: str) -> None: ...
def copy_out(self, container_path: str, *, timeout: int = 60) -> tuple[str, str]: ...
def commit(self, image_ref: str) -> str: ...
def cleanup(self) -> None: ...


class ContainerBackend(Protocol):
"""Factory that produces ``Environment`` instances for the Evaluator.

The Evaluator asks the backend for one ``Environment`` per phase
(compile, then one per test branch). The default implementation
(:class:`DockerBackend`) returns :class:`ContainerEnvironment` —
i.e., real Docker containers. Custom backends can swap in
non-Docker isolation (firecracker VMs, gVisor sandboxes, no
isolation at all if you trust the input, etc.).
"""

def new_env(self, image: str, *, cwd: str, timeout: int, cpus: int,
env: dict[str, str] | None, run_args: list[str] | None) -> Environment: ...
def remove_image(self, image_ref: str) -> None: ...


class ContainerEnvironment:
"""Manage a long-running container for command execution and file injection."""

Expand Down Expand Up @@ -116,6 +154,32 @@ def copy_in(self, local_path: Path, container_path: str) -> None:
if result.returncode != 0:
raise RuntimeError(f"docker cp failed: {result.stderr.strip()}")

def copy_out(self, container_path: str, *, timeout: int = 60) -> tuple[str, str]:
"""Read a file out of the container via ``docker cp``.

Returns ``(contents, command_string)``. The command string is
included so callers (notably :meth:`Evaluator._copy_file_from_container`)
can record it in their step logs without having to know the
backend's wire format. Raises ``RuntimeError`` with the
underlying stderr (or ``"... timed out after Ns"``) on failure.

Bypasses bash so login-shell stderr (``mesg: ttyname failed`` etc.)
can't pollute the bytes the way ``cat <file>`` would.
"""
host_tmp = Path(tempfile.mkstemp(suffix=Path(container_path).suffix or ".out")[1])
cmd_list = [self.executable, "cp", f"{self.container_id}:{container_path}", str(host_tmp)]
cmd_str = " ".join(cmd_list)
try:
try:
cp = subprocess.run(cmd_list, capture_output=True, text=True, timeout=timeout)
except subprocess.TimeoutExpired:
raise RuntimeError(f"docker cp timed out after {timeout}s")
if cp.returncode != 0:
raise RuntimeError((cp.stdout + cp.stderr).strip())
return host_tmp.read_text(), cmd_str
finally:
host_tmp.unlink(missing_ok=True)

def copy_in_tar(self, tar_path: Path, container_path: str) -> None:
"""Stream an on-disk tar(.gz) into the container via the container's tar.

Expand Down Expand Up @@ -206,3 +270,38 @@ def remove_image(image_ref: str, *, executable: str = "docker") -> None:
)
except Exception:
pass


class DockerBackend:
"""The default ``ContainerBackend``: spawns real Docker containers.

Drop-in for any callsite that previously instantiated
:class:`ContainerEnvironment` directly — same args, same behavior.
"""

def __init__(self, *, executable: str = "docker", run_args: list[str] | None = None):
self.executable = executable
self.default_run_args = list(run_args or [])

def new_env(
self,
image: str,
*,
cwd: str = "/",
timeout: int = 30,
cpus: int = 10,
env: dict[str, str] | None = None,
run_args: list[str] | None = None,
) -> ContainerEnvironment:
return ContainerEnvironment(
image=image,
cwd=cwd,
executable=self.executable,
timeout=timeout,
cpus=cpus,
env=env,
run_args=list(self.default_run_args) + list(run_args or []),
)

def remove_image(self, image_ref: str) -> None:
remove_image(image_ref, executable=self.executable)
83 changes: 34 additions & 49 deletions src/programbench/eval/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,6 @@
"""

import logging
import subprocess
import tempfile
import threading
import time
import uuid
Expand All @@ -40,7 +38,7 @@
DOCKER_RUN_ARGS,
WORKSPACE_DIR,
)
from programbench.container import ContainerEnvironment, remove_image
from programbench.container import ContainerBackend, ContainerEnvironment, DockerBackend
from programbench.exceptions import EmptyTestResultError, EvalStepError, XmlParseError

log = logging.getLogger(__name__)
Expand Down Expand Up @@ -295,6 +293,7 @@ def __init__(
docker_cpus: int = DOCKER_CPUS,
branch_workers: int = 1,
branch_retries: int = 1,
backend: ContainerBackend | None = None,
):
self.image_name = image_name
self.solution_branch = solution_branch
Expand All @@ -310,6 +309,7 @@ def __init__(
self.docker_cpus = docker_cpus
self.branch_workers = max(1, branch_workers)
self.branch_retries = max(0, branch_retries)
self.backend = backend or DockerBackend(executable=DOCKER_EXECUTABLE, run_args=list(DOCKER_RUN_ARGS))
self._has_rerunfailures = False
self._log_lock = threading.Lock()
self._from_existing = from_existing
Expand Down Expand Up @@ -394,52 +394,38 @@ def _copy_file_from_container(
step_name: str,
timeout: int = 60,
) -> str:
"""Copy a file out of the container via ``docker cp`` and return its contents.
"""Copy a file out of the container and return its contents.

Bypasses bash so login-shell stderr (``mesg: ttyname failed`` etc.) can't
pollute the bytes the way ``cat <file>`` would. Logs to ``log_buf`` with
the same shape as ``_run_step``; on success the entry's ``output`` holds
the file contents so ``from_existing`` replay keeps working.
Delegates to ``env.copy_out``; the backend decides how to fetch the
bytes (Docker's ``docker cp``, a non-Docker backend's filesystem
read, etc.). Logs to ``log_buf`` with the same shape as
``_run_step``; on success the entry's ``output`` holds the file
contents so ``from_existing`` replay keeps working.
"""
host_tmp = Path(tempfile.mkstemp(suffix=Path(container_path).suffix or ".out")[1])
cmd_list = [env.executable, "cp", f"{env.container_id}:{container_path}", str(host_tmp)]
cmd_str = " ".join(cmd_list)
log.debug("Running step: %s", cmd_str)
t0 = time.monotonic()
try:
try:
cp = subprocess.run(cmd_list, capture_output=True, text=True, timeout=timeout)
rc = cp.returncode
err = (cp.stdout + cp.stderr).strip()
except subprocess.TimeoutExpired:
rc, err = -1, f"docker cp timed out after {timeout}s"
contents, cmd_str = env.copy_out(container_path, timeout=timeout)
except RuntimeError as exc:
wall_time = time.monotonic() - t0
if rc != 0:
log_buf.append(
{
"step": step_name,
"command": cmd_str,
"wall_time": wall_time,
"output": err,
"returncode": rc,
"exception_info": "",
}
)
raise EvalStepError(f"{step_name}_failed", err)
contents = host_tmp.read_text()
log_buf.append(
{
"step": step_name,
"command": cmd_str,
"wall_time": wall_time,
"output": contents,
"returncode": 0,
"exception_info": "",
}
)
return contents
finally:
host_tmp.unlink(missing_ok=True)
log_buf.append({
"step": step_name,
"command": f"copy_out({container_path})",
"wall_time": wall_time,
"output": str(exc),
"returncode": -1,
"exception_info": "",
})
raise EvalStepError(f"{step_name}_failed", str(exc))
wall_time = time.monotonic() - t0
log_buf.append({
"step": step_name,
"command": cmd_str,
"wall_time": wall_time,
"output": contents,
"returncode": 0,
"exception_info": "",
})
return contents

def _new_env(self, image: str, *, serial_pytest: bool = False) -> ContainerEnvironment:
# Baseline xdist hardening that always works (xdist ships with
Expand All @@ -456,14 +442,13 @@ def _new_env(self, image: str, *, serial_pytest: bool = False) -> ContainerEnvir
env = {"PYTEST_ADDOPTS": addopts}
if serial_pytest:
env["PYTEST_XDIST_AUTO_NUM_WORKERS"] = "1"
return ContainerEnvironment(
image=image,
return self.backend.new_env(
image,
cwd=WORKSPACE_DIR,
executable=DOCKER_EXECUTABLE,
timeout=600,
cpus=self.docker_cpus,
env=env,
run_args=[*DOCKER_RUN_ARGS, "--init"],
run_args=["--init"],
)

def _remove_hashed_files(self, env: ContainerEnvironment, log_buf: list[dict]) -> None:
Expand Down Expand Up @@ -839,7 +824,7 @@ def run(self) -> EvaluationResult:
if compile_env is not None:
compile_env.cleanup()
if committed_image is not None:
remove_image(committed_image, executable=DOCKER_EXECUTABLE)
self.backend.remove_image(committed_image)


def parse_test_results(results_xml: str, branch: str = "") -> EvaluationResult:
Expand Down
119 changes: 119 additions & 0 deletions tests/test_pluggable_backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""Tests for the pluggable ContainerBackend / Environment abstraction.

The point of this PR is that downstream callers shouldn't have to monkey-
patch internals to run the eval against a non-Docker isolation primitive
(e.g. Daytona sandboxes, gVisor, firecracker, or just a bare host shell).
These tests exercise the substitution: build a fake in-memory backend,
pass it to the Evaluator, and verify the eval calls the fake's methods
instead of shelling Docker.
"""

from pathlib import Path

from programbench.container import ContainerBackend, DockerBackend
from programbench.eval.eval import Evaluator


class FakeEnv:
"""Records every method call so the test can assert the surface."""

def __init__(self, *, image, cwd, timeout, cpus, env, run_args):
self.image = image
self.cwd = cwd
self.default_timeout = timeout
self.cpus = cpus
self.env = env
self.run_args = run_args
self.calls: list[tuple] = []
self.canned_copy_out: dict[str, str] = {}

def execute(self, command, *, timeout=None):
self.calls.append(("execute", command))
return {"output": "", "returncode": 0, "exception_info": ""}

def copy_in(self, local_path, container_path):
self.calls.append(("copy_in", str(local_path), container_path))

def copy_in_tar(self, tar_path, container_path):
self.calls.append(("copy_in_tar", str(tar_path), container_path))

def copy_out(self, container_path, *, timeout=60):
self.calls.append(("copy_out", container_path))
contents = self.canned_copy_out.get(container_path, "")
return contents, f"fake-cp {container_path}"

def commit(self, image_ref):
self.calls.append(("commit", image_ref))
return image_ref

def cleanup(self):
self.calls.append(("cleanup",))


class FakeBackend:
"""Hands out FakeEnv instances and records image removals."""

def __init__(self):
self.envs: list[FakeEnv] = []
self.removed: list[str] = []

def new_env(self, image, *, cwd, timeout, cpus, env, run_args):
env_obj = FakeEnv(image=image, cwd=cwd, timeout=timeout,
cpus=cpus, env=env, run_args=run_args)
self.envs.append(env_obj)
return env_obj

def remove_image(self, image_ref):
self.removed.append(image_ref)


def test_evaluator_uses_injected_backend():
"""Constructing an Evaluator with a custom backend routes all container
operations through it — _new_env calls backend.new_env, and
_copy_file_from_container calls env.copy_out."""
backend = FakeBackend()
e = Evaluator(
tests_branches=["abc123"],
image_name="example/example.deadbeef",
backend=backend,
)

env = e._new_env("example/example.deadbeef:task")
assert isinstance(env, FakeEnv)
assert env.image == "example/example.deadbeef:task"
assert env.cwd # WORKSPACE_DIR
assert len(backend.envs) == 1

log_buf: list[dict] = []
env.canned_copy_out["/workspace/eval/results.xml"] = "<xml/>"
contents = e._copy_file_from_container(
env=env, log_buf=log_buf,
container_path="/workspace/eval/results.xml",
step_name="results_read",
)
assert contents == "<xml/>"
assert ("copy_out", "/workspace/eval/results.xml") in env.calls
# Step was logged with the env's reported command-string, not "docker cp".
assert log_buf[0]["step"] == "results_read"
assert log_buf[0]["command"].startswith("fake-cp ")


def test_evaluator_default_backend_is_docker():
"""No `backend=` kwarg → DockerBackend (preserves pre-PR behavior)."""
e = Evaluator(tests_branches=[], image_name="foo")
assert isinstance(e.backend, DockerBackend)


def test_docker_backend_satisfies_protocol():
"""DockerBackend implements the ContainerBackend protocol surface."""
backend: ContainerBackend = DockerBackend()
# Protocol membership is structural — this assignment compiles only
# because DockerBackend has the required methods. Smoke-touch them.
assert callable(backend.new_env)
assert callable(backend.remove_image)
Loading