Skip to content

Commit fe957f1

Browse files
committed
Fix WS handshake over-reading and dropping the next protocol frame
Root cause of the long-running CI flake on Windows + Python 3.13 in test_ws_viewer_authenticates_and_receives_frames / test_ws_host_announces_host_id and friends: _read_http_message used recv(1024). On loopback the kernel often coalesces the host's "101 Switching Protocols" reply and the AUTH_CHALLENGE WS frame that follows it into a single TCP segment. The bulk recv consumed both, the function returned only the HTTP text, and the trailing frame bytes were silently dropped. The client's next recv() then blocked the full 60 s auth budget waiting for a frame the kernel had already delivered. This pattern is load-dependent (server has to be fast enough to flush both back-to- back), which is exactly why the failure looked random. Switch to byte-by-byte reads up to "\r\n\r\n" so any post-header bytes stay in the kernel buffer for the next recv. The extra syscalls cost ~1 ms on loopback — well below the WS upgrade itself. Also drop the @pytest.mark.flaky reruns added in 0929195: those were masking this exact bug, and the underlying handshake is now deterministic. Add a regression test that fuses the 101 response and a WS BINARY frame into one sendall and verifies recv_message still returns the frame after client_handshake.
1 parent 9a0924b commit fe957f1

2 files changed

Lines changed: 69 additions & 6 deletions

File tree

je_auto_control/utils/remote_desktop/ws_protocol.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,9 +102,18 @@ def client_handshake(sock: socket.socket, host: str, port: int,
102102

103103

104104
def _read_http_message(sock: socket.socket) -> str:
105+
# Byte-by-byte read until "\r\n\r\n". A bulk recv(1024) would
106+
# over-read into the next message: when the peer packs the HTTP
107+
# response and the first protocol frame into a single TCP segment
108+
# (common on loopback under load), the post-header bytes end up in
109+
# this buffer and are dropped on return — the next recv() then
110+
# blocks forever on bytes that already arrived. Loopback syscalls
111+
# are microseconds; ~150 of them per handshake is well below the
112+
# noise floor of the WS upgrade itself.
105113
buf = bytearray()
106-
while b"\r\n\r\n" not in buf:
107-
chunk = sock.recv(1024)
114+
terminator = b"\r\n\r\n"
115+
while not buf.endswith(terminator):
116+
chunk = sock.recv(1)
108117
if not chunk:
109118
raise WsProtocolError("connection closed during handshake")
110119
buf.extend(chunk)

test/unit_test/headless/test_remote_desktop_websocket.py

Lines changed: 58 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,64 @@ def test_recv_handles_extended_payload_length():
8686
client.close()
8787

8888

89+
def test_handshake_does_not_over_read_into_next_frame():
90+
"""Regression: server pack of 101 + first WS frame in one segment.
91+
92+
When the host sends the HTTP upgrade response and the AUTH_CHALLENGE
93+
frame back-to-back, the kernel coalesces them on loopback. A bulk
94+
``recv(1024)`` inside ``client_handshake`` would consume both, then
95+
discard the post-header bytes — the next ``recv_message`` then
96+
blocks forever on data that already arrived. Verify the handshake
97+
leaves any trailing bytes in the kernel buffer.
98+
"""
99+
server, client = _make_socketpair()
100+
try:
101+
import threading
102+
103+
# Drive client_handshake on a thread; on the server side we
104+
# send 101 and a WS BINARY frame in a single sendall to mimic
105+
# the production race that flaked CI.
106+
done = threading.Event()
107+
108+
def client_side():
109+
client_handshake(client, "127.0.0.1", 1234, path="/")
110+
done.set()
111+
112+
thread = threading.Thread(target=client_side, daemon=True)
113+
thread.start()
114+
# Read the GET request, then send 101 + a tiny WS frame fused.
115+
request = b""
116+
while b"\r\n\r\n" not in request:
117+
request += server.recv(1024)
118+
sec_key = ""
119+
for line in request.decode("iso-8859-1").split("\r\n"):
120+
if line.lower().startswith("sec-websocket-key:"):
121+
sec_key = line.split(":", 1)[1].strip()
122+
import base64
123+
import hashlib
124+
accept = base64.b64encode(hashlib.sha1( # nosec B324
125+
(sec_key + "258EAFA5-E914-47DA-95CA-C5AB0DC85B11").encode("ascii"),
126+
usedforsecurity=False,
127+
).digest()).decode("ascii")
128+
response = (
129+
"HTTP/1.1 101 Switching Protocols\r\n"
130+
"Upgrade: websocket\r\n"
131+
"Connection: Upgrade\r\n"
132+
f"Sec-WebSocket-Accept: {accept}\r\n"
133+
"\r\n"
134+
).encode("ascii")
135+
# WS BINARY frame: FIN=1, opcode=2, len=3, payload=b"hi!"
136+
ws_frame = b"\x82\x03hi!"
137+
server.sendall(response + ws_frame)
138+
assert done.wait(timeout=2.0)
139+
# The frame must still be readable — i.e. handshake didn't
140+
# swallow the bytes that followed "\r\n\r\n".
141+
assert recv_message(client) == b"hi!"
142+
finally:
143+
server.close()
144+
client.close()
145+
146+
89147
def test_handshake_rejects_non_websocket_request():
90148
server, client = _make_socketpair()
91149
try:
@@ -114,7 +172,6 @@ def _start_ws_host(token: str = "tok",
114172
return host
115173

116174

117-
@pytest.mark.flaky(reruns=2, reruns_delay=1)
118175
def test_ws_viewer_authenticates_and_receives_frames():
119176
host = _start_ws_host()
120177
try:
@@ -131,7 +188,6 @@ def test_ws_viewer_authenticates_and_receives_frames():
131188
host.stop(timeout=1.0)
132189

133190

134-
@pytest.mark.flaky(reruns=2, reruns_delay=1)
135191
def test_ws_viewer_with_wrong_token_is_rejected():
136192
host = _start_ws_host(token="right")
137193
try:
@@ -145,7 +201,6 @@ def test_ws_viewer_with_wrong_token_is_rejected():
145201
host.stop(timeout=1.0)
146202

147203

148-
@pytest.mark.flaky(reruns=2, reruns_delay=1)
149204
def test_ws_viewer_input_reaches_host_dispatcher():
150205
host = _start_ws_host()
151206
try:
@@ -166,7 +221,6 @@ def test_ws_viewer_input_reaches_host_dispatcher():
166221
host.stop(timeout=1.0)
167222

168223

169-
@pytest.mark.flaky(reruns=2, reruns_delay=1)
170224
def test_ws_host_announces_host_id():
171225
host = _start_ws_host(host_id="700800900")
172226
try:

0 commit comments

Comments
 (0)