facebookresearch · frmsaul · May 16, 2026
diff --git a/src/programbench/container.py b/src/programbench/container.py
@@ -6,15 +6,53 @@
 
 import logging
 import subprocess
+import tempfile
+import time
 import uuid
 from pathlib import Path
-from typing import Any
+from typing import Any, Protocol
 
 from programbench.constants import DOCKER_CP_TIMEOUT, DOCKER_RUN_TIMEOUT
 
 log = logging.getLogger(__name__)
 
 
+class Environment(Protocol):
+    """The minimal surface ``Evaluator`` needs from a per-task container.
+
+    Implementations don't have to inherit from this — duck-typed protocols
+    are fine — but you do have to provide every method below. ``cwd`` and
+    ``default_timeout`` are read directly by ``Evaluator`` so they need to
+    be regular attributes.
+    """
+
+    cwd: str
+    default_timeout: int
+
+    def execute(self, command: str, *, timeout: int | None = None) -> dict[str, Any]: ...
+    def copy_in(self, local_path: Path, container_path: str) -> None: ...
+    def copy_in_tar(self, tar_path: Path, container_path: str) -> None: ...
+    def copy_out(self, container_path: str, *, timeout: int = 60) -> tuple[str, str]: ...
+    def commit(self, image_ref: str) -> str: ...
+    def cleanup(self) -> None: ...
+
+
+class ContainerBackend(Protocol):
+    """Factory that produces ``Environment`` instances for the Evaluator.
+
+    The Evaluator asks the backend for one ``Environment`` per phase
+    (compile, then one per test branch). The default implementation
+    (:class:`DockerBackend`) returns :class:`ContainerEnvironment` —
+    i.e., real Docker containers. Custom backends can swap in
+    non-Docker isolation (firecracker VMs, gVisor sandboxes, no
+    isolation at all if you trust the input, etc.).
+    """
+
+    def new_env(self, image: str, *, cwd: str, timeout: int, cpus: int,
+                env: dict[str, str] | None, run_args: list[str] | None) -> Environment: ...
+    def remove_image(self, image_ref: str) -> None: ...
+
+
 class ContainerEnvironment:
     """Manage a long-running container for command execution and file injection."""
 
@@ -116,6 +154,32 @@ def copy_in(self, local_path: Path, container_path: str) -> None:
         if result.returncode != 0:
             raise RuntimeError(f"docker cp failed: {result.stderr.strip()}")
 
+    def copy_out(self, container_path: str, *, timeout: int = 60) -> tuple[str, str]:
+        """Read a file out of the container via ``docker cp``.
+
+        Returns ``(contents, command_string)``. The command string is
+        included so callers (notably :meth:`Evaluator._copy_file_from_container`)
+        can record it in their step logs without having to know the
+        backend's wire format. Raises ``RuntimeError`` with the
+        underlying stderr (or ``"... timed out after Ns"``) on failure.
+
+        Bypasses bash so login-shell stderr (``mesg: ttyname failed`` etc.)
+        can't pollute the bytes the way ``cat <file>`` would.
+        """
+        host_tmp = Path(tempfile.mkstemp(suffix=Path(container_path).suffix or ".out")[1])
+        cmd_list = [self.executable, "cp", f"{self.container_id}:{container_path}", str(host_tmp)]
+        cmd_str = " ".join(cmd_list)
+        try:
+            try:
+                cp = subprocess.run(cmd_list, capture_output=True, text=True, timeout=timeout)
+            except subprocess.TimeoutExpired:
+                raise RuntimeError(f"docker cp timed out after {timeout}s")
+            if cp.returncode != 0:
+                raise RuntimeError((cp.stdout + cp.stderr).strip())
+            return host_tmp.read_text(), cmd_str
+        finally:
+            host_tmp.unlink(missing_ok=True)
+
     def copy_in_tar(self, tar_path: Path, container_path: str) -> None:
         """Stream an on-disk tar(.gz) into the container via the container's tar.
 
@@ -206,3 +270,38 @@ def remove_image(image_ref: str, *, executable: str = "docker") -> None:
         )
     except Exception:
         pass
+
+
+class DockerBackend:
+    """The default ``ContainerBackend``: spawns real Docker containers.
+
+    Drop-in for any callsite that previously instantiated
+    :class:`ContainerEnvironment` directly — same args, same behavior.
+    """
+
+    def __init__(self, *, executable: str = "docker", run_args: list[str] | None = None):
+        self.executable = executable
+        self.default_run_args = list(run_args or [])
+
+    def new_env(
+        self,
+        image: str,
+        *,
+        cwd: str = "/",
+        timeout: int = 30,
+        cpus: int = 10,
+        env: dict[str, str] | None = None,
+        run_args: list[str] | None = None,
+    ) -> ContainerEnvironment:
+        return ContainerEnvironment(
+            image=image,
+            cwd=cwd,
+            executable=self.executable,
+            timeout=timeout,
+            cpus=cpus,
+            env=env,
+            run_args=list(self.default_run_args) + list(run_args or []),
+        )
+
+    def remove_image(self, image_ref: str) -> None:
+        remove_image(image_ref, executable=self.executable)
diff --git a/src/programbench/eval/eval.py b/src/programbench/eval/eval.py
@@ -20,8 +20,6 @@
 """
 
 import logging
-import subprocess
-import tempfile
 import threading
 import time
 import uuid
@@ -40,7 +38,7 @@
     DOCKER_RUN_ARGS,
     WORKSPACE_DIR,
 )
-from programbench.container import ContainerEnvironment, remove_image
+from programbench.container import ContainerBackend, ContainerEnvironment, DockerBackend
 from programbench.exceptions import EmptyTestResultError, EvalStepError, XmlParseError
 
 log = logging.getLogger(__name__)
@@ -295,6 +293,7 @@ def __init__(
         docker_cpus: int = DOCKER_CPUS,
         branch_workers: int = 1,
         branch_retries: int = 1,
+        backend: ContainerBackend | None = None,
     ):
         self.image_name = image_name
         self.solution_branch = solution_branch
@@ -310,6 +309,7 @@ def __init__(
         self.docker_cpus = docker_cpus
         self.branch_workers = max(1, branch_workers)
         self.branch_retries = max(0, branch_retries)
+        self.backend = backend or DockerBackend(executable=DOCKER_EXECUTABLE, run_args=list(DOCKER_RUN_ARGS))
         self._has_rerunfailures = False
         self._log_lock = threading.Lock()
         self._from_existing = from_existing
@@ -394,52 +394,38 @@ def _copy_file_from_container(
         step_name: str,
         timeout: int = 60,
     ) -> str:
-        """Copy a file out of the container via ``docker cp`` and return its contents.
+        """Copy a file out of the container and return its contents.
 
-        Bypasses bash so login-shell stderr (``mesg: ttyname failed`` etc.) can't
-        pollute the bytes the way ``cat <file>`` would. Logs to ``log_buf`` with
-        the same shape as ``_run_step``; on success the entry's ``output`` holds
-        the file contents so ``from_existing`` replay keeps working.
+        Delegates to ``env.copy_out``; the backend decides how to fetch the
+        bytes (Docker's ``docker cp``, a non-Docker backend's filesystem
+        read, etc.). Logs to ``log_buf`` with the same shape as
+        ``_run_step``; on success the entry's ``output`` holds the file
+        contents so ``from_existing`` replay keeps working.
         """
-        host_tmp = Path(tempfile.mkstemp(suffix=Path(container_path).suffix or ".out")[1])
-        cmd_list = [env.executable, "cp", f"{env.container_id}:{container_path}", str(host_tmp)]
-        cmd_str = " ".join(cmd_list)
-        log.debug("Running step: %s", cmd_str)
         t0 = time.monotonic()
         try:
-            try:
-                cp = subprocess.run(cmd_list, capture_output=True, text=True, timeout=timeout)
-                rc = cp.returncode
-                err = (cp.stdout + cp.stderr).strip()
-            except subprocess.TimeoutExpired:
-                rc, err = -1, f"docker cp timed out after {timeout}s"
+            contents, cmd_str = env.copy_out(container_path, timeout=timeout)
+        except RuntimeError as exc:
             wall_time = time.monotonic() - t0
-            if rc != 0:
-                log_buf.append(
-                    {
-                        "step": step_name,
-                        "command": cmd_str,
-                        "wall_time": wall_time,
-                        "output": err,
-                        "returncode": rc,
-                        "exception_info": "",
-                    }
-                )
-                raise EvalStepError(f"{step_name}_failed", err)
-            contents = host_tmp.read_text()
-            log_buf.append(
-                {
-                    "step": step_name,
-                    "command": cmd_str,
-                    "wall_time": wall_time,
-                    "output": contents,
-                    "returncode": 0,
-                    "exception_info": "",
-                }
-            )
-            return contents
-        finally:
-            host_tmp.unlink(missing_ok=True)
+            log_buf.append({
+                "step": step_name,
+                "command": f"copy_out({container_path})",
+                "wall_time": wall_time,
+                "output": str(exc),
+                "returncode": -1,
+                "exception_info": "",
+            })
+            raise EvalStepError(f"{step_name}_failed", str(exc))
+        wall_time = time.monotonic() - t0
+        log_buf.append({
+            "step": step_name,
+            "command": cmd_str,
+            "wall_time": wall_time,
+            "output": contents,
+            "returncode": 0,
+            "exception_info": "",
+        })
+        return contents
 
     def _new_env(self, image: str, *, serial_pytest: bool = False) -> ContainerEnvironment:
         # Baseline xdist hardening that always works (xdist ships with
@@ -456,14 +442,13 @@ def _new_env(self, image: str, *, serial_pytest: bool = False) -> ContainerEnvir
         env = {"PYTEST_ADDOPTS": addopts}
         if serial_pytest:
             env["PYTEST_XDIST_AUTO_NUM_WORKERS"] = "1"
-        return ContainerEnvironment(
-            image=image,
+        return self.backend.new_env(
+            image,
             cwd=WORKSPACE_DIR,
-            executable=DOCKER_EXECUTABLE,
             timeout=600,
             cpus=self.docker_cpus,
             env=env,
-            run_args=[*DOCKER_RUN_ARGS, "--init"],
+            run_args=["--init"],
         )
 
     def _remove_hashed_files(self, env: ContainerEnvironment, log_buf: list[dict]) -> None:
@@ -839,7 +824,7 @@ def run(self) -> EvaluationResult:
             if compile_env is not None:
                 compile_env.cleanup()
             if committed_image is not None:
-                remove_image(committed_image, executable=DOCKER_EXECUTABLE)
+                self.backend.remove_image(committed_image)
 
 
 def parse_test_results(results_xml: str, branch: str = "") -> EvaluationResult:

diff --git a/tests/test_pluggable_backend.py b/tests/test_pluggable_backend.py
@@ -0,0 +1,119 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Tests for the pluggable ContainerBackend / Environment abstraction.
+
+The point of this PR is that downstream callers shouldn't have to monkey-
+patch internals to run the eval against a non-Docker isolation primitive
+(e.g. Daytona sandboxes, gVisor, firecracker, or just a bare host shell).
+These tests exercise the substitution: build a fake in-memory backend,
+pass it to the Evaluator, and verify the eval calls the fake's methods
+instead of shelling Docker.
+"""
+
+from pathlib import Path
+
+from programbench.container import ContainerBackend, DockerBackend
+from programbench.eval.eval import Evaluator
+
+
+class FakeEnv:
+    """Records every method call so the test can assert the surface."""
+
+    def __init__(self, *, image, cwd, timeout, cpus, env, run_args):
+        self.image = image
+        self.cwd = cwd
+        self.default_timeout = timeout
+        self.cpus = cpus
+        self.env = env
+        self.run_args = run_args
+        self.calls: list[tuple] = []
+        self.canned_copy_out: dict[str, str] = {}
+
+    def execute(self, command, *, timeout=None):
+        self.calls.append(("execute", command))
+        return {"output": "", "returncode": 0, "exception_info": ""}
+
+    def copy_in(self, local_path, container_path):
+        self.calls.append(("copy_in", str(local_path), container_path))
+
+    def copy_in_tar(self, tar_path, container_path):
+        self.calls.append(("copy_in_tar", str(tar_path), container_path))
+
+    def copy_out(self, container_path, *, timeout=60):
+        self.calls.append(("copy_out", container_path))
+        contents = self.canned_copy_out.get(container_path, "")
+        return contents, f"fake-cp {container_path}"
+
+    def commit(self, image_ref):
+        self.calls.append(("commit", image_ref))
+        return image_ref
+
+    def cleanup(self):
+        self.calls.append(("cleanup",))
+
+
+class FakeBackend:
+    """Hands out FakeEnv instances and records image removals."""
+
+    def __init__(self):
+        self.envs: list[FakeEnv] = []
+        self.removed: list[str] = []
+
+    def new_env(self, image, *, cwd, timeout, cpus, env, run_args):
+        env_obj = FakeEnv(image=image, cwd=cwd, timeout=timeout,
+                          cpus=cpus, env=env, run_args=run_args)
+        self.envs.append(env_obj)
+        return env_obj
+
+    def remove_image(self, image_ref):
+        self.removed.append(image_ref)
+
+
+def test_evaluator_uses_injected_backend():
+    """Constructing an Evaluator with a custom backend routes all container
+    operations through it — _new_env calls backend.new_env, and
+    _copy_file_from_container calls env.copy_out."""
+    backend = FakeBackend()
+    e = Evaluator(
+        tests_branches=["abc123"],
+        image_name="example/example.deadbeef",
+        backend=backend,
+    )
+
+    env = e._new_env("example/example.deadbeef:task")
+    assert isinstance(env, FakeEnv)
+    assert env.image == "example/example.deadbeef:task"
+    assert env.cwd  # WORKSPACE_DIR
+    assert len(backend.envs) == 1
+
+    log_buf: list[dict] = []
+    env.canned_copy_out["/workspace/eval/results.xml"] = "<xml/>"
+    contents = e._copy_file_from_container(
+        env=env, log_buf=log_buf,
+        container_path="/workspace/eval/results.xml",
+        step_name="results_read",
+    )
+    assert contents == "<xml/>"
+    assert ("copy_out", "/workspace/eval/results.xml") in env.calls
+    # Step was logged with the env's reported command-string, not "docker cp".
+    assert log_buf[0]["step"] == "results_read"
+    assert log_buf[0]["command"].startswith("fake-cp ")
+
+
+def test_evaluator_default_backend_is_docker():
+    """No `backend=` kwarg → DockerBackend (preserves pre-PR behavior)."""
+    e = Evaluator(tests_branches=[], image_name="foo")
+    assert isinstance(e.backend, DockerBackend)
+
+
+def test_docker_backend_satisfies_protocol():
+    """DockerBackend implements the ContainerBackend protocol surface."""
+    backend: ContainerBackend = DockerBackend()
+    # Protocol membership is structural — this assignment compiles only
+    # because DockerBackend has the required methods. Smoke-touch them.
+    assert callable(backend.new_env)
+    assert callable(backend.remove_image)