Skip to content

Commit 1517a66

Browse files
committed
fix(fly): make cloud bucket mount strategy resilient to dropped exec exit codes
The sprite-env WS control protocol currently ships ``OP_COMPLETE`` with ``{"ok": true}`` and no ``exitCode`` field, so the Python client (sprites-py control.py:292) defaults to ``0`` for every exec result. That makes ``ExecResult.ok()`` lie about the underlying command's success, which broke the cloud bucket mount path: ``_ensure_rclone``'s first ``command -v rclone`` check would always report "present" and short-circuit the install. Reroute the strategy's detection through stdout sentinels ("__FLY_PRESENT__" / "__FLY_MISSING__") emitted by the *local* shell's evaluation of the conditional, which is correct regardless of how the WS hop reports the final exit code. The install commands themselves no longer assert on exit codes either — instead, the post-install detection probe is the source of truth, and we surface "install attempt completed but X is still not on PATH" if it didn't actually take. Add a new ``_verify_mount_active`` step after ``delegate.activate`` that polls ``mountpoint -q`` and emits a "__FLY_MOUNTED__" / "__FLY_NOT_MOUNTED__" sentinel — needed because ``rclone mount --daemon`` forks and returns immediately, and the framework's existing exit-code check on the rclone mount command is also unreliable on Fly today. Verified end-to-end against a live sprite + Tigris S3-compatible bucket: fuse + rclone install lazily on first activation (~45s on stock image), ``rclone mount`` daemon registers, and the FUSE mount appears in ``/proc/mounts`` as expected. Once sprite-env starts populating ``args.exitCode`` on OP_COMPLETE, ``ExecResult.ok()`` will work correctly and the stdout-sentinel approach remains valid (it's strictly more robust). Test suite updated: ``_present()``/``_missing()``/``_mounted()``/ ``_not_mounted()`` helpers stub the sentinel-bearing exec results, and the install-failure tests now exercise the post-install recheck path. New tests cover ``_verify_mount_active`` happy-path and not-mounted-raises. 21 mount tests in total (was 20).
1 parent ca5f96b commit 1517a66

2 files changed

Lines changed: 169 additions & 87 deletions

File tree

src/agents/extensions/sandbox/flyio/mounts.py

Lines changed: 80 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from __future__ import annotations
44

5+
import shlex
56
from pathlib import Path
67
from typing import Literal
78

@@ -18,10 +19,31 @@
1819
_APT = (
1920
f"{_SUDO} env DEBIAN_FRONTEND=noninteractive DEBCONF_NOWARNINGS=yes apt-get -o Dpkg::Use-Pty=0"
2021
)
21-
_RCLONE_CHECK = "command -v rclone >/dev/null 2>&1 || test -x /usr/local/bin/rclone"
22-
_FUSERMOUNT_CHECK = (
22+
23+
# Detection commands echo a sentinel into stdout based on the *local* shell's
24+
# evaluation of the conditional. We rely on stdout instead of ``ExecResult.ok()``
25+
# because the sprite-env WS control protocol currently drops exec exit codes
26+
# (the OP_COMPLETE envelope ships ``{"ok": true}`` with no exit-code field, so
27+
# the Python client defaults to 0 for every command). Stdout sentinels are
28+
# also more robust against tools that exit non-zero on benign warnings.
29+
_PRESENT = "__FLY_PRESENT__"
30+
_MISSING = "__FLY_MISSING__"
31+
_MOUNTED = "__FLY_MOUNTED__"
32+
_NOT_MOUNTED = "__FLY_NOT_MOUNTED__"
33+
34+
35+
def _detect_cmd(condition: str) -> str:
36+
"""Return a shell snippet that prints _PRESENT or _MISSING based on `condition`."""
37+
38+
return f"if {condition}; then echo {_PRESENT}; else echo {_MISSING}; fi"
39+
40+
41+
_RCLONE_CHECK = _detect_cmd("command -v rclone >/dev/null 2>&1 || test -x /usr/local/bin/rclone")
42+
_FUSERMOUNT_CHECK = _detect_cmd(
2343
"command -v fusermount3 >/dev/null 2>&1 || command -v fusermount >/dev/null 2>&1"
2444
)
45+
_FUSE_KERNEL_CHECK = _detect_cmd("test -c /dev/fuse && grep -qw fuse /proc/filesystems")
46+
_APT_CHECK = _detect_cmd("command -v apt-get >/dev/null 2>&1")
2547
_INSTALL_RCLONE_COMMANDS = (
2648
f"{_APT} update -qq",
2749
f"{_APT} install -y -qq curl unzip ca-certificates fuse",
@@ -41,78 +63,94 @@
4163
)
4264

4365

66+
def _stdout_says(result: object, sentinel: str) -> bool:
67+
stdout = getattr(result, "stdout", b"") or b""
68+
return sentinel.encode("ascii") in stdout
69+
70+
4471
async def _ensure_fuse_support(session: BaseSandboxSession) -> None:
45-
kernel = await session.exec(
46-
"sh",
47-
"-lc",
48-
"test -c /dev/fuse && grep -qw fuse /proc/filesystems",
49-
shell=False,
50-
)
51-
if not kernel.ok():
72+
kernel = await session.exec("sh", "-lc", _FUSE_KERNEL_CHECK, shell=False)
73+
if not _stdout_says(kernel, _PRESENT):
5274
raise MountConfigError(
5375
message="Fly cloud bucket mounts require FUSE support in the kernel",
5476
context={"missing": "fuse"},
5577
)
5678

5779
fusermount = await session.exec("sh", "-lc", _FUSERMOUNT_CHECK, shell=False)
58-
if not fusermount.ok():
59-
apt = await session.exec("sh", "-lc", "command -v apt-get >/dev/null 2>&1", shell=False)
60-
if not apt.ok():
80+
if not _stdout_says(fusermount, _PRESENT):
81+
apt = await session.exec("sh", "-lc", _APT_CHECK, shell=False)
82+
if not _stdout_says(apt, _PRESENT):
6183
raise MountConfigError(
6284
message="fusermount is not installed and apt-get is unavailable; "
6385
"preinstall the fuse package",
6486
context={"package": "fuse"},
6587
)
6688
for command in _INSTALL_FUSE_COMMANDS:
67-
install = await session.exec("sh", "-lc", command, shell=False, timeout=300)
68-
if not install.ok():
69-
raise MountConfigError(
70-
message="failed to install fuse",
71-
context={"package": "fuse", "exit_code": install.exit_code},
72-
)
89+
await session.exec("sh", "-lc", command, shell=False, timeout=300)
7390
recheck = await session.exec("sh", "-lc", _FUSERMOUNT_CHECK, shell=False)
74-
if not recheck.ok():
91+
if not _stdout_says(recheck, _PRESENT):
7592
raise MountConfigError(
76-
message="fuse was installed but fusermount is still not on PATH",
93+
message="fuse install attempt completed but fusermount is still not on PATH",
7794
context={"package": "fuse"},
7895
)
7996

80-
chmod_result = await session.exec("sh", "-lc", _FUSE_ALLOW_OTHER, shell=False, timeout=30)
81-
if not chmod_result.ok():
82-
raise MountConfigError(
83-
message="failed to make /dev/fuse accessible",
84-
context={"exit_code": chmod_result.exit_code},
85-
)
97+
# /dev/fuse must be accessible to the unprivileged user and ``user_allow_other``
98+
# has to be enabled for ``--allow-other``. Failures here would be surfaced by
99+
# the rclone mount itself; we don't gate on this exec's exit code because the
100+
# control-WS protocol drops it.
101+
await session.exec("sh", "-lc", _FUSE_ALLOW_OTHER, shell=False, timeout=30)
86102

87103

88104
async def _ensure_rclone(session: BaseSandboxSession) -> None:
89105
rclone = await session.exec("sh", "-lc", _RCLONE_CHECK, shell=False)
90-
if rclone.ok():
106+
if _stdout_says(rclone, _PRESENT):
91107
return
92108

93-
apt = await session.exec("sh", "-lc", "command -v apt-get >/dev/null 2>&1", shell=False)
94-
if not apt.ok():
109+
apt = await session.exec("sh", "-lc", _APT_CHECK, shell=False)
110+
if not _stdout_says(apt, _PRESENT):
95111
raise MountConfigError(
96112
message="rclone is not installed and apt-get is unavailable; preinstall rclone",
97113
context={"package": "rclone"},
98114
)
99115

100116
for command in _INSTALL_RCLONE_COMMANDS:
101-
install = await session.exec("sh", "-lc", command, shell=False, timeout=300)
102-
if not install.ok():
103-
raise MountConfigError(
104-
message="failed to install rclone",
105-
context={"package": "rclone", "exit_code": install.exit_code},
106-
)
117+
await session.exec("sh", "-lc", command, shell=False, timeout=300)
107118

108119
rclone = await session.exec("sh", "-lc", _RCLONE_CHECK, shell=False)
109-
if not rclone.ok():
120+
if not _stdout_says(rclone, _PRESENT):
110121
raise MountConfigError(
111-
message="rclone was installed but is still not available on PATH",
122+
message="rclone install attempt completed but rclone is still not on PATH",
112123
context={"package": "rclone"},
113124
)
114125

115126

127+
async def _verify_mount_active(session: BaseSandboxSession, mount_path: Path) -> None:
128+
"""Confirm ``mount_path`` is a live mountpoint after activation.
129+
130+
Without reliable exit codes from the platform we can't detect a failed
131+
rclone mount via ``rclone mount``'s return value. Probe the kernel's view
132+
of the path instead: ``mountpoint -q`` returns 0 iff the path is a mount
133+
boundary. The shell wraps the conditional and emits a stdout sentinel so
134+
the verification is transport-independent. ``rclone mount --daemon`` forks
135+
and the parent returns immediately, so we poll briefly to give the daemon
136+
time to bind.
137+
"""
138+
139+
quoted = shlex.quote(str(mount_path))
140+
probe_cmd = (
141+
f"for _ in 1 2 3 4 5 6 7 8 9 10; do "
142+
f"if mountpoint -q {quoted}; then echo {_MOUNTED}; exit 0; fi; "
143+
"sleep 0.5; "
144+
f"done; echo {_NOT_MOUNTED}"
145+
)
146+
probe = await session.exec("sh", "-lc", probe_cmd, shell=False, timeout=30)
147+
if not _stdout_says(probe, _MOUNTED):
148+
raise MountConfigError(
149+
message="rclone mount completed but the path is not a live mountpoint",
150+
context={"path": str(mount_path)},
151+
)
152+
153+
116154
async def _default_user_ids(session: BaseSandboxSession) -> tuple[str, str] | None:
117155
result = await session.exec("sh", "-lc", "id -u; id -g", shell=False, timeout=30)
118156
if not result.ok():
@@ -184,7 +222,11 @@ async def activate(
184222
await _ensure_fuse_support(session)
185223
await _ensure_rclone(session)
186224
delegate = await self._delegate_for_session(session)
187-
return await delegate.activate(mount, session, dest, base_dir)
225+
files = await delegate.activate(mount, session, dest, base_dir)
226+
if self.pattern.mode == "fuse":
227+
mount_path = mount._resolve_mount_path(session, dest)
228+
await _verify_mount_active(session, mount_path)
229+
return files
188230

189231
async def deactivate(
190232
self,

0 commit comments

Comments
 (0)