diff --git a/.github/workflows/consul-postgres-ha-publish.yml b/.github/workflows/consul-postgres-ha-publish.yml new file mode 100644 index 0000000..0740512 --- /dev/null +++ b/.github/workflows/consul-postgres-ha-publish.yml @@ -0,0 +1,137 @@ +name: Publish consul-postgres-ha images + +# Builds and publishes the four container images the consul-postgres-ha +# example needs (mesh-sidecar, patroni, webdemo, signaling). On push +# to main, images are tagged with the commit SHA *and* `latest`, +# pushed to GHCR, and attested with Sigstore-backed GitHub Build +# Provenance so consumers can verify "this image came from this +# commit of this repo" without us managing any keys. PRs build to +# verify but do not push or attest. +# +# Why one workflow for all four: the example needs them in lockstep — +# bumping one but leaving the rest stale leads to mixed-version +# clusters that are hard to reason about. One workflow means one set +# of tags moves together. +# +# `mesh-sidecar` is the consolidated platform-plumbing image (formerly +# four images: bootstrap-secrets, mesh-conn, the legacy keepalive, and +# the old envoy-only sidecar). Its build context is the parent +# consul-postgres-ha/ directory so its Dockerfile can pull the Go +# sources from sibling subdirs. The other three images build from +# their own subdirs. +# +# Verifying a published image (consumer side): +# +# gh attestation verify \ +# oci://ghcr.io/dstack-tee/dstack-examples/consul-postgres-ha-mesh-sidecar:latest \ +# --repo Dstack-TEE/dstack-examples + +on: + push: + branches: [main] + paths: + - 'consul-postgres-ha/bootstrap-secrets/**' + - 'consul-postgres-ha/mesh-conn/**' + - 'consul-postgres-ha/mesh-sidecar/**' + - 'consul-postgres-ha/patroni/**' + - 'consul-postgres-ha/webdemo/**' + - 'consul-postgres-ha/signaling/**' + - '.github/workflows/consul-postgres-ha-publish.yml' + pull_request: + paths: + - 'consul-postgres-ha/bootstrap-secrets/**' + - 'consul-postgres-ha/mesh-conn/**' + - 'consul-postgres-ha/mesh-sidecar/**' + - 'consul-postgres-ha/patroni/**' + - 'consul-postgres-ha/webdemo/**' + - 'consul-postgres-ha/signaling/**' + - '.github/workflows/consul-postgres-ha-publish.yml' + workflow_dispatch: + +env: + REGISTRY: ghcr.io + +jobs: + build: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + # id-token + attestations are required for Sigstore-backed + # GitHub Build Provenance via actions/attest-build-provenance. + id-token: write + attestations: write + + strategy: + fail-fast: false + matrix: + include: + # `mesh-sidecar` builds with the parent dir as context so + # its Dockerfile can pull bootstrap-secrets/ and mesh-conn/ + # Go sources from siblings. 
+ - name: mesh-sidecar + context: consul-postgres-ha + dockerfile: consul-postgres-ha/mesh-sidecar/Dockerfile + - name: patroni + context: consul-postgres-ha/patroni + - name: webdemo + context: consul-postgres-ha/webdemo + - name: signaling + context: consul-postgres-ha/signaling + + steps: + - uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to GHCR + if: github.event_name != 'pull_request' + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract image metadata + id: meta + uses: docker/metadata-action@v5 + with: + # Image namespace lives one level under the repo so all four + # images sit side-by-side: ghcr.io///consul-postgres-ha- + images: ${{ env.REGISTRY }}/${{ github.repository }}/consul-postgres-ha-${{ matrix.name }} + tags: | + type=sha,format=long + type=raw,value=latest,enable={{is_default_branch}} + type=ref,event=pr + + - name: Build and push + id: push + uses: docker/build-push-action@v6 + with: + context: ${{ matrix.context }} + # Most images use the default Dockerfile in the context. + # `mesh-sidecar` overrides this to point at + # mesh-sidecar/Dockerfile while keeping the parent context. + file: ${{ matrix.dockerfile || format('{0}/Dockerfile', matrix.context) }} + platforms: linux/amd64 + push: ${{ github.event_name != 'pull_request' }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha,scope=consul-postgres-ha-${{ matrix.name }} + cache-to: type=gha,scope=consul-postgres-ha-${{ matrix.name }},mode=max + + # Sigstore-backed build provenance. Binds {image digest, repo, + # workflow, commit SHA, runner identity} into an attestation + # signed with a short-lived Sigstore cert obtained via this + # workflow's GitHub OIDC token — no keys we have to rotate. The + # attestation is uploaded to GitHub *and* (via push-to-registry) + # written next to the image on GHCR so `gh attestation verify + # oci://...` and `cosign verify-attestation` both work. + - name: Attest build provenance + if: github.event_name != 'pull_request' + uses: actions/attest-build-provenance@v2 + with: + subject-name: ${{ env.REGISTRY }}/${{ github.repository }}/consul-postgres-ha-${{ matrix.name }} + subject-digest: ${{ steps.push.outputs.digest }} + push-to-registry: true diff --git a/.gitignore b/.gitignore index e4e5f6c..17afc66 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,18 @@ -*~ \ No newline at end of file + +*~ +.claude/ + +# consul-postgres-ha — compiled Go binaries (build artifacts) +consul-postgres-ha/.local/ +consul-postgres-ha/bootstrap-secrets/bootstrap-secrets +consul-postgres-ha/mesh-conn/mesh-conn +consul-postgres-ha/quic-on-ice/quic-on-ice +consul-postgres-ha/signaling/signaling +consul-postgres-ha/signaling/icetest +consul-postgres-ha/webdemo/webdemo + +# consul-postgres-ha — local terraform state (per-deploy, not for git) +consul-postgres-ha/cluster-example/.terraform/ +consul-postgres-ha/cluster-example/.terraform.lock.hcl +consul-postgres-ha/cluster-example/terraform.tfstate* +consul-postgres-ha/cluster-example/terraform.tfvars diff --git a/consul-postgres-ha/ARCHITECTURE.md b/consul-postgres-ha/ARCHITECTURE.md new file mode 100644 index 0000000..b482c83 --- /dev/null +++ b/consul-postgres-ha/ARCHITECTURE.md @@ -0,0 +1,290 @@ +# Architecture + +Three layers stacked, each unaware of the one below it. Plus the apps +running on top of all of it. 
+ +## Layer 0 — physical / dstack reality + +Six dstack CVMs (3 coordinators + 3 workers), TEE-isolated, sitting +behind Phala's provider NAT. **They cannot reach each other directly** +on any L3 path. Every CVM NATs out to the same public IP, so even a +"direct" peer-to-peer flow is hairpinned by the provider edge. The +only thing the CVMs share is outbound internet egress. + +Plus one plain Linux box with a public IP — the **external +coordinator** — running `coturn` (STUN/TURN) and a tiny HTTP +signaling broker. This is rendezvous infrastructure only: once peers +have ICE-handshaked, no data passes through it (TURN is the fallback +when direct ICE candidates can't establish). + +``` + coturn + signaling + + ▲ + outbound │ STUN binding + UDP+TCP │ ICE candidate exchange + │ + ┌─────────────────────┼─────────────────────────────────┐ + │ │ │ + [coord-0] [coord-1] [coord-2] [worker-3] [worker-4] [worker-5] + (no L3 connectivity to each other) +``` + +## Layer 1 — mesh-conn pair-wise overlay + +For every **pair** of peers, mesh-conn establishes one `pion/ice` +connection. ICE punches a direct UDP path through the NAT (in our +deployment NAT-hairpinned via the provider edge) and stays put. The +signaling broker drops out of the picture once each pair is up. + +Per pair we then run **one QUIC connection** (quic-go) over the +ICE conn, treating `ice.Conn` as a `net.PacketConn`. QUIC provides +loss recovery, congestion control, and stream multiplexing on top of +the lossy UDP underlay. Streams come in two flavours: some are +long-lived (one per protocol port, carrying length-prefixed UDP +datagrams), the rest are ephemeral (one per accepted TCP connection). + +``` + ┌── ICE conn / QUIC ──┐ + coord-0 ◄═│ direct UDP │═► worker-3 + └── (NAT hairpin) ┘ + coord-0 ◄════════════════════════► worker-4 + coord-0 ◄════════════════════════► worker-5 + coord-0 ◄════════════════════════► coord-1 + coord-0 ◄════════════════════════► coord-2 + ... (full mesh) +``` + +A 6-peer full mesh has 15 ICE connections (6×5/2). Each peer +maintains five of them. + +> Why QUIC and not yamux: yamux assumes a reliable byte-stream +> underlay. `pion/ice.Conn` is UDP, and the path between dstack CVMs +> is lossy enough (~99% one direction on hairpin, ~78% on coturn +> relay) that yamux's keepalive and recv-window invariants trip +> almost immediately under load. QUIC has the loss recovery + flow +> control that yamux is forced to assume from below it. + +## Layer 2 — identity-port plane + +This is the trick that makes the overlay invisible to the applications +above. Every peer has a unique port for every protocol. mesh-conn +binds the **other** peers' identity ports on `127.0.0.1` and bridges +each one to the right ICE+QUIC peer link, **preserving source ports** +so the destination app sees the packet as coming from +`127.0.0.1:` — which is exactly what the app +uses to identify the sender. + +So inside any CVM the entire cluster looks like a single loopback +host. 
Eight protocol ports per peer (serf_lan, server_rpc, http_api, +grpc, webdemo, sidecar_public, postgres, patroni_rest), spread by +`base + ordinal`: + +``` + inside worker-3 CVM (network_mode: host) + ┌───────────────────────────────────────────────────────────────────┐ + │ local apps bind their OWN identity ports (base + ordinal=3): │ + │ │ + │ consul agent ▶ 127.0.0.1:18003 (serf) 18103 (rpc) │ + │ 18203 (http) 18303 (grpc) │ + │ webdemo ▶ 127.0.0.1:18503 │ + │ envoy sidecar ▶ 127.0.0.1:18603 (public mTLS) │ + │ patroni / pg ▶ 127.0.0.1:18703 (postgres) 18803 (REST) │ + │ │ + │ mesh-conn binds OTHER peers' identity ports on 127.0.0.1: │ + │ │ + │ ports[0..7] + 0 ◄── coord-0 │ + │ ports[0..7] + 1 ◄── coord-1 │ + │ ports[0..7] + 2 ◄── coord-2 │ + │ ports[0..7] + 4 ◄── worker-4 │ + │ ports[0..7] + 5 ◄── worker-5 │ + │ │ + │ all UDP/TCP traffic to those ports is shipped through the │ + │ matching ICE+QUIC connection to the corresponding peer. │ + └───────────────────────────────────────────────────────────────────┘ +``` + +Every peer has the symmetric layout — own ports bound by apps, other +peers' ports bound by mesh-conn. + +## Layer 3 — apps + +Consul agents, Envoy sidecars, webdemo, Patroni, anything else. These +think they're talking to peers on `127.0.0.1`. They never see ICE, +QUIC, TURN, or the public internet. Stock HashiCorp Consul, stock +Envoy, stock Patroni. + +## How a single call traverses all four layers + +A Connect-style mTLS call from `worker-3`'s webdemo to `worker-4`'s +webdemo: + +``` +worker-3 webdemo + GET http://127.0.0.1:19000/hello ── Layer 3, app on its + │ local sidecar upstream + ▼ +worker-3 envoy sidecar + picks endpoint via Consul-supplied EDS + opens mTLS to "127.0.0.1:18604" (worker-4's sidecar via mesh-conn) + │ + ▼ +worker-3 mesh-conn (TCP listener on 127.0.0.1:18604) + reads bytes off the local TCP listener + opens a QUIC stream tagged "port=18604" + writes through the worker-3↔worker-4 QUIC connection ── Layer 2 → 1 + │ ╱ + │ ╱ here Layer 1 (QUIC frames over the ICE conn) + │ ╱ meets Layer 0 (UDP packets across the public + │ ╱ internet, NAT-hairpinned via the provider edge) + │ ╱ + ▼ +worker-4 mesh-conn (QUIC stream accept on the worker-3 ICE conn) + reads stream header → "port=18604" + dials TCP to 127.0.0.1:18604 (worker-4's actual sidecar) + splices stream ↔ TCP conn ── Layer 1 → 2 + │ + ▼ +worker-4 envoy sidecar + validates origin's mTLS cert against Connect CA + checks intention webdemo → webdemo (allow) + forwards to local 127.0.0.1:18504 (LocalServicePort) + │ + ▼ +worker-4 webdemo + serves /hello → "hello from worker-4" ── Layer 3 +``` + +Reply takes the same path in reverse. + +## mesh-conn × QUIC — how they work together + +The bit that's worth being precise about: mesh-conn is built on top +of `pion/ice` and `quic-go`. The wire layout is small enough to write +down completely. + +### What mesh-conn has after ICE + +After the ICE handshake, mesh-conn has one `*ice.Conn` per peer-pair. +That's a `net.Conn`-shaped object whose underlying wire is a single +UDP socket through pion. mesh-conn wraps it as a `net.PacketConn` and +hands that to `quic-go`, which performs a TLS 1.3 handshake (self- +signed, since peer trust is bootstrapped from the TURN HMAC + dstack +TEE layer, not TLS identity) and gives back a `*quic.Conn`. + +### Why QUIC for the mux + +We need to carry multiple logical channels over the unreliable UDP +underlay: + +- One long-lived stream per identity port (8 of these per peer-pair) + carrying length-prefixed UDP datagrams. 
+- One ephemeral stream per accepted local TCP connection, opened and + closed on demand. + +QUIC has all of that built in: streams (`OpenStreamSync` / +`AcceptStream`), per-stream and per-connection flow control, loss +recovery, congestion control, and an idle-timeout-driven liveness +check. Crucially, it does not assume a reliable underlay — *unlike* +yamux, which we tried first and gave up on. The earlier yamux build +sustained ~3 KB on the dstack hairpin path before its keepalive / +recv-window invariants tripped on dropped packets. QUIC sustains +~25 MB/s on the same path. + +### Client / server roles + +QUIC is asymmetric like yamux: one side `quic.Dial`s, the other +`quic.Listen`s and `Accept`s. Roles are picked from peer IDs in lex +order — same convention as ICE Dial / Accept: + +```go +isClient := cfg.SelfID < peer.ID +if isClient { + qconn, err = quic.Dial(ctx, packetConn, remoteAddr, tls, cfg) +} else { + ln, _ := quic.Listen(packetConn, tls, cfg) + qconn, err = ln.Accept(ctx) +} +``` + +### Stream protocol (mesh-conn's framing on top of QUIC) + +When a stream opens, the **first 3 bytes** carry a mesh-conn header: + +``` ++------+-----------+-----------+ +| tag | port high | port low | +| 1 B | 1 B | 1 B | ++------+-----------+-----------+ +``` + +- `tag = 0x55` → **streamUDP** — long-lived, length-prefixed UDP + datagrams. +- `tag = 0x33` → **streamTCP** — per-connection raw TCP byte stream. + +The 16-bit `port` is the **receiver's own identity port** for the +protocol slot this stream serves. The receiver looks it up in +`self.Ports`, finds the index, and pairs the stream with the right +local socket / dial target. + +UDP-over-stream uses an explicit 2-byte big-endian length prefix per +datagram, since QUIC streams are byte-oriented (like yamux was) and +don't preserve the original UDP datagram boundaries on their own. +TCP forwarding needs no framing — the splice is two `io.Copy` +goroutines and the underlying app already speaks TCP semantics. + +### What that gets us + +- **Stream multiplexing** — UDP + TCP channels share one ICE conn + without interference. +- **Loss recovery + congestion control** — provided by QUIC, not us. + This is the difference between "works under load" and "doesn't". +- **Per-stream + per-connection flow control** — a slow consumer on + one stream doesn't block others; aggregate windows protect the + receiver. +- **Half-close + idle timeout** — TCP-style FIN per stream; + connection-level `MaxIdleTimeout` (60s) tears down the conn + cleanly when the underlay dies, surfacing errors to our pump + goroutines rather than letting them hang. + +The whole thing is **one ICE conn per peer-pair, one QUIC connection +per ICE conn**, plus a 3-byte header per stream and a 2-byte length +prefix per UDP datagram. That is the entirety of mesh-conn's wire +format. + +## Trust boundaries + +- **App → Envoy** is plaintext on loopback. Same CVM, same TEE. +- **Envoy → Envoy** is mTLS, certs signed by Consul's Connect CA. + End-to-end across the overlay; mesh-conn just sees encrypted bytes. +- **mesh-conn → mesh-conn** rides ICE. Direct UDP between CVMs on the + public internet (or TURN-relayed if hole-punching ever fails). + pion/ice doesn't add encryption on top of the data path itself, so + unencrypted traffic between mesh-conn endpoints would be on the wire + in the clear. +- …all confidential traffic above it is **already encrypted by Envoy + mTLS** (Layer 3), so the wire is safe even if someone could see the + UDP datagrams. 
+- **Consul gossip** is currently unencrypted (we didn't set a gossip + key); RPC is plaintext. Both are confined to inside the overlay, + but a full setup would set `-encrypt=...` and TLS for + RPC. See [ROBUSTNESS.md](ROBUSTNESS.md). + +## What's nice about this shape + +- **Layer 3 has zero awareness of layers below.** Consul, Envoy, + webdemo all think they're on a flat loopback. Anything that runs + against Consul today (Vault, Nomad, Boundary, custom apps) drops in + unchanged. +- **Layer 1 is a single component (mesh-conn, ~700 LoC Go including + the QUIC adapter) and has zero awareness of Consul.** It just + bridges ports. It would equally well move Postgres replication, + Redis Sentinel, Kafka, etc. — and in fact this example uses it for + Patroni+Postgres replication. +- **Layer 0 is dumb infra.** Just a public IP running coturn + a tiny + broker. Fungible and not in the data path once peers are connected. + +If we ever run mesh-conn on a network where ICE can't punch through, +Layer 1 silently degrades to TURN relay through the coturn box. +Layer 2 doesn't notice; Layer 3 apps don't notice. They get higher +RTT, that's all. diff --git a/consul-postgres-ha/FAILOVER.md b/consul-postgres-ha/FAILOVER.md new file mode 100644 index 0000000..73c15d4 --- /dev/null +++ b/consul-postgres-ha/FAILOVER.md @@ -0,0 +1,226 @@ +# Stage 4 — failover demo + +A reproducible recipe for the soft-kill leader-failover scenario, plus the +measured timeline from a real run on the live cluster (2026-05-03). +This demonstrates that stage 4's HA story is end-to-end working: Patroni +elects via Consul KV when the leader's lock expires, a replica is +promoted, writes resume on the new leader, and the old leader rejoins +cheaply (WAL replay + streaming, no full pg_basebackup) once it comes +back. + +## What gets exercised + +1. Patroni leader-election via Consul KV (TTL-driven lock expiry). +2. Replica promotion + timeline bump. +3. Streaming replication on the new leader. +4. Old leader's cheap rejoin path (no full re-bootstrap through mesh-conn). + +## Recipe + +Set up env (cluster IDs from `RESUME.md`): + +```bash +GW=dstack-pha-prod5.phala.network +W1=eb94f7cd4f726ea3e90380e9043ed15c1f9e67e9 # current leader (worker-3) +W2=0e51c005457fbe994b55480aab06dfaf6c7f89b1 # worker-4 +W3=0889166bf09d84ea06e132c4b3cc7e2e7db586e0 # worker-5 +PW=$(ssh ... root@${W1}-22.${GW} "cat /tmp/dstack-runtime/secrets/patroni-superuser") +``` + +### 1. Snapshot pre-state + mark a "before" row + +```bash +ssh ... root@${W1}-22.${GW} \ + "docker exec dstack-sidecar-1 sh -c 'curl -s http://127.0.0.1:18803/cluster' | jq" + +ssh ... root@${W1}-22.${GW} "PGPASSWORD='$PW' docker exec -e PGPASSWORD dstack-patroni-1 \ + psql -h 127.0.0.1 -p 18703 -U postgres -d postgres \ + -c \"INSERT INTO demo(msg) VALUES ('before failover') RETURNING id, msg;\"" +``` + +Expected: `worker-3` leader, `worker-4` + `worker-5` replicas streaming with lag=0, +timeline=15. Default Patroni config: `ttl=30, loop_wait=10, retry_timeout=10`. + +### 2. Soft-kill the leader + +```bash +T_kill=$(date -u +%H:%M:%S.%N) +ssh ... root@${W1}-22.${GW} "docker stop -t 0 dstack-patroni-1" +``` + +### 3. Watch the election + first write on the new leader + +```bash +# Poll W4's /cluster endpoint every ~1s; promotion shows when the +# leader-key expires from Consul KV (TTL=30s) and a replica wins. +while ! 
curl -s http://127.0.0.1:18804/cluster | jq -e '.members[]|select(.role=="leader" and .name!="worker-3")' >/dev/null; do + sleep 1 +done + +# Try to write on whichever replica got promoted. +ssh ... root@${W2}-22.${GW} "PGPASSWORD='$PW' docker exec -e PGPASSWORD dstack-patroni-1 \ + psql -h 127.0.0.1 -p 18704 -U postgres -d postgres \ + -c \"INSERT INTO demo(msg) VALUES ('after failover') RETURNING id;\"" +``` + +### 4. Bring the old leader back + +```bash +ssh ... root@${W1}-22.${GW} "docker start dstack-patroni-1" +# Watch /cluster until worker-3 reports state=streaming, lag=0. +``` + +### 5. Confirm cheap-rejoin (no pg_basebackup) + +```bash +ssh ... root@${W1}-22.${GW} \ + "docker logs --tail 40 dstack-patroni-1 2>&1 | grep -iE 'pg_basebackup|recovery|streaming|timeline'" +``` + +Expected log lines (no `pg_basebackup`, just WAL replay + streaming): + +``` +starting backup recovery with redo LSN 0/... checkpoint LSN 0/..., on timeline ID 15 +completed backup recovery with redo LSN 0/... and end LSN 0/... +consistent recovery state reached at 0/... +started streaming WAL from primary at 0/... on timeline 16 +``` + +## Measured timeline (run from 2026-05-04, single-sidecar layout) + +``` +T_kill 17:31:26 docker stop dstack-patroni-1 on worker-5 (leader) +T_new_leader 17:31:57 worker-4 promoted (timeline 2 → 3) +31s +T_first_write 17:31:59 INSERT succeeds on worker-4 +33s ← RTO +``` + +**RTO (Recovery Time Objective): ~33 seconds.** That's the wall time +from leader process death to first successful write on the new leader, +sitting at the edge of the default Patroni `ttl=30`. The 2026-05-03 +multi-container baseline was 24s on a different cluster — the +single-sidecar layout is within typical run-to-run variance for the +`ttl=30 + promote-overhead` window. Cheap rejoin was confirmed in a +prior round of this same run: a previously-killed leader (worker-3) +came back as a streaming replica on the new timeline with lag=0 +within ~60s of `docker start dstack-patroni-1`. + +## Tunables for the RTO/availability tradeoff + +If 24s is too long for your workload, lower the Patroni dynamic config in +Consul KV: + +| Knob | Default | Effect of lowering | +|---|---|---| +| `ttl` | 30 | Faster TTL expiry → faster election; risk of false-positive failover under transient network blips | +| `loop_wait` | 10 | Faster Patroni heartbeat loop on each peer | +| `retry_timeout` | 10 | How long Patroni tolerates a flaky DCS before giving up | + +A common production setting is `ttl=20, loop_wait=5, retry_timeout=5` +for ~10–15s RTO. Don't go below `ttl >= 2 * loop_wait` (Patroni rejects). + +## Hard-kill variant (whole-userspace failure) + +Same outline, but instead of stopping just `dstack-patroni-1`, simulate +a "host crashed but recovered" scenario by killing all containers on +the leader at once: + +```bash +ssh ... root@${LEADER}-22.${GW} "docker stop -t 0 \$(docker ps -q)" +``` + +This kills patroni, postgres, webdemo, and the consolidated sidecar +(which itself runs bootstrap-secrets, mesh-conn, consul, and envoy +inside it) — everything that produces signal for the rest of the +cluster. Bring the host back via: + +```bash +ssh ... root@${LEADER}-22.${GW} \ + "cd /tapp && docker compose --env-file /dstack/.host-shared/.decrypted-env \ + -p dstack -f /tapp/docker-compose.yaml up -d" +``` + +`docker compose up -d` respects the dependency order +(sidecar's `service_healthy` gate fires once bootstrap-secrets has +written `/run/instance/info.json`, then patroni and webdemo start). 
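+
+If you'd rather script the "watch `/cluster` until everything is streaming"
+steps than eyeball them, a small poller against Patroni's REST endpoint is
+enough. The sketch below is not part of the example: the default URL
+(worker-3's patroni_rest identity port, 18803) and the member fields
+(`role`, `state`) are the same ones the `jq` recipes above read. Run it
+from inside a CVM, or wherever that port is reachable.
+
+```go
+// clusterwatch: poll a Patroni /cluster endpoint until exactly one leader
+// remains and every other member reports state=streaming. Sketch only.
+package main
+
+import (
+    "encoding/json"
+    "fmt"
+    "net/http"
+    "os"
+    "time"
+)
+
+type member struct {
+    Name  string `json:"name"`
+    Role  string `json:"role"`
+    State string `json:"state"`
+    // lag is absent for the leader; keep it loosely typed so decoding
+    // never fails on it.
+    Lag json.RawMessage `json:"lag,omitempty"`
+}
+
+func settled(members []member) bool {
+    leaders := 0
+    for _, m := range members {
+        if m.Role == "leader" {
+            leaders++
+            continue
+        }
+        if m.State != "streaming" {
+            return false
+        }
+    }
+    return leaders == 1
+}
+
+func main() {
+    url := "http://127.0.0.1:18803/cluster"
+    if len(os.Args) > 1 {
+        url = os.Args[1]
+    }
+    for {
+        if resp, err := http.Get(url); err == nil {
+            var c struct {
+                Members []member `json:"members"`
+            }
+            ok := json.NewDecoder(resp.Body).Decode(&c) == nil && settled(c.Members)
+            resp.Body.Close()
+            if ok {
+                fmt.Printf("settled: %d members\n", len(c.Members))
+                return
+            }
+        }
+        time.Sleep(time.Second)
+    }
+}
+```
+
+Point it at any surviving worker's patroni_rest port during a failover
+run; it returns once the cluster has one leader and only streaming
+replicas.
+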
+ +### Measured timeline (run from 2026-05-04, single-sidecar layout) + +``` +T_kill 17:33:29 docker stop -t 0 ALL containers on worker-4 (leader) +T_new_leader 17:34:00 worker-3 promoted (timeline 3 → 4) +31s +T_first_write 17:34:02 INSERT succeeds on worker-3 +33s ← RTO +T_restart_W4 17:34:02 docker compose up -d on worker-4 +``` + +**Hard-kill RTO ≈ 33 seconds**, identical to both the soft-kill above +and the 2026-05-03 multi-container baseline. Consul gossip-failure +detection (which sees worker-4's whole agent disappear, not just the +Patroni lock) lines up with the Patroni leader-key TTL on this run, +so neither signal extends the RTO. + +The post-restart rejoin path on dstack-worker pairs is occasionally +flaky (the documented `MESH_CONN_RELAY_ONLY=1` escape hatch in +`compose/worker.yaml` is exactly this case — flip it on if your +deployment hits a wedged ICE re-handshake). The mesh-conn binary +behavior is unchanged by the single-sidecar consolidation. + +### Things confirmed by the hard-kill that the soft-kill didn't exercise + +- **Best-replica selection under uneven lag.** Going into the kill, + worker-3 was timeline=16, lag=0 while worker-5 was timeline=15 with + measurable lag. Patroni picked worker-3 (the up-to-date one), not + the alphabetically-earlier one. The promote-best-replica heuristic + works. +- **mesh-conn QUIC ICE redial after a peer's userspace evaporates.** + Other peers' QUIC links to worker-4 hit `MaxIdleTimeout=60s` and + tore down; once worker-4's containers came back, the new mesh-conn + established fresh ICE pairs and replication resumed without + intervention. The earlier yamux build had a pathology where + redial-after-stress would loop forever; QUIC is clean. +- **Cheap rejoin survives hard-kill.** worker-4's pgdata was + untouched (the kernel never died, just userspace), so on bring-up + Patroni replayed local WAL and joined as a streaming replica on the + new timeline. No pg_basebackup, no multi-MB re-copy through + mesh-conn. + +## Disk-loss rejoin (full pg_basebackup variant) + +A replica whose pgdata is wiped goes through Patroni's bootstrap path +and pulls a full pg_basebackup from the leader, all over mesh-conn's +QUIC tunnel. Recipe (run on a non-leader CVM): + +```bash +docker stop -t 5 dstack-patroni-1 +rm -rf /var/lib/docker/volumes/dstack_patroni-pgdata/_data/* +docker start dstack-patroni-1 +``` + +### Measured timeline (run from 2026-05-04, single-sidecar layout) + +``` +T_wipe 17:34:21 docker stop + rm -rf pgdata on worker-5 +T_restart 17:34:25 docker start +T_complete 17:34:43 "replica has been created using basebackup" +18s +T_streaming 17:35:43 streaming WAL on timeline 4, lag=0 +82s total +``` + +A few-MB pgdata transferred in ~18 seconds end-to-end. The dataset +is small enough that handshake/startup overhead dominates — for a +realistic throughput number, see the soft-kill section's pg_basebackup +trace at ~25 MB/s sustained on the QUIC path. + +The path itself is the proof point: Patroni correctly detects empty +pgdata, picks `bootstrap from leader` (not WAL replay), pulls the full +backup over mesh-conn, transitions to streaming on the current +timeline. No operator intervention. + +## What this demo does NOT cover + +* **CVM reboot or kernel panic** — `reboot`/`poweroff` from inside + the CVM. This involves the dstack platform's CVM lifecycle and is + qualitatively different from container-level kills. Consider + separately if/when you need to claim "host hardware failure" + resilience. 
+* **Network partition**: split-brain isolation between coordinators + vs workers. Patroni + Consul should handle it, but worth a separate + test before claiming partition-tolerance. diff --git a/consul-postgres-ha/PUBLISHING.md b/consul-postgres-ha/PUBLISHING.md new file mode 100644 index 0000000..02dc96d --- /dev/null +++ b/consul-postgres-ha/PUBLISHING.md @@ -0,0 +1,148 @@ +# Stage 4 — image publishing & verification + +The stage-4 example needs four container images deployed in lockstep: +`mesh-sidecar`, `patroni`, `webdemo`, `signaling`. CI publishes them to +GHCR with Sigstore-backed GitHub Build Provenance; consumers pin by +tag (or, better, by digest) and verify provenance with +`gh attestation verify`. + +`mesh-sidecar` is the consolidated platform-plumbing image — a single +container that runs bootstrap-secrets, mesh-conn, consul, and (on +workers) envoy. It's the heaviest by a wide margin because it +inherits from envoyproxy/envoy and bundles three more binaries on top. + +This doc covers the three paths you'll actually use: + +1. **CI publish** (the steady-state) +2. **Manual one-off publish** (dev iteration / breaking glass) +3. **Hot-patch on a live cluster** (debugging without a redeploy) + +## 1. CI publish — the steady-state + +`.github/workflows/consul-postgres-ha-publish.yml` runs on push to `main` +when any of the four image build contexts (or the workflow itself) +change, and on PRs touching the same paths. Each run: + +- Builds all four images via a matrix job. The `mesh-sidecar` build + uses `consul-postgres-ha/` as its docker context (instead of + `consul-postgres-ha/mesh-sidecar/`) so its Dockerfile can pull + `bootstrap-secrets/` and `mesh-conn/` Go sources from sibling + directories. +- On `main`, pushes to `ghcr.io/dstack-tee/dstack-examples/consul-postgres-ha-` with two tags: the long-form commit SHA (`sha-<40-hex>`) and `latest`. +- Generates a GitHub Build Provenance attestation per image via + `actions/attest-build-provenance@v2`. The attestation is signed by + Sigstore using a short-lived cert obtained through the workflow's + GitHub OIDC token — no keys we manage. It binds the image digest to + the commit SHA, workflow file, and runner identity. +- Pushes the attestation to GHCR alongside the image, so consumers can + fetch and verify it via either GitHub's API or any cosign-style tool. +- On PRs, builds without pushing or attesting (verification only). + +### Verifying a published image as a consumer + +```bash +# By tag (lower assurance — `latest` floats): +gh attestation verify \ + oci://ghcr.io/dstack-tee/dstack-examples/consul-postgres-ha-mesh-sidecar:latest \ + --repo Dstack-TEE/dstack-examples + +# By digest (preferred — pinned, won't drift): +gh attestation verify \ + oci://ghcr.io/dstack-tee/dstack-examples/consul-postgres-ha-mesh-sidecar@sha256: \ + --repo Dstack-TEE/dstack-examples +``` + +A successful verification proves: this image's digest was attested in a +GitHub Actions run on `Dstack-TEE/dstack-examples`, with a workflow +file and commit SHA you can inspect to decide whether to trust it. +Failed or absent attestations should fail your deploy. + +For prod-style deploys, pin every image in `terraform.tfvars` to its +`sha-<40-hex>` tag (or a digest) rather than `latest`, so a CI rebuild +of `latest` doesn't silently swap your cluster's bits. + +## 2. Manual one-off publish — dev iteration + +When iterating fast on the mesh-sidecar (or any other component) you +don't want to round-trip through CI for every byte. Two equivalent +shortcuts. 
Note that `mesh-sidecar` builds from the +`consul-postgres-ha/` parent dir (it pulls Go sources from sibling +subdirs); the rest build from their own subdir. + +### a) `ttl.sh` (24h-disposable, no auth) + +```bash +TS=$(date +%s) +TAG=ttl.sh/dstack-mesh-sidecar-${TS}:24h +docker build -t $TAG -f consul-postgres-ha/mesh-sidecar/Dockerfile consul-postgres-ha +docker push $TAG +``` + +Then point the running cluster at it via `terraform.tfvars`'s +`mesh_sidecar_image = ...` (and `terraform apply`), or hot-patch the +running CVM (see §3). `ttl.sh` images expire 24h after push. + +### b) Personal GHCR namespace (persistent, requires PAT) + +If you want a longer-lived dev image without going through main: + +```bash +echo "$GITHUB_TOKEN" | docker login ghcr.io -u --password-stdin +TAG=ghcr.io//consul-postgres-ha-mesh-sidecar:dev-$(date +%s) +docker build -t $TAG -f consul-postgres-ha/mesh-sidecar/Dockerfile consul-postgres-ha +docker push $TAG +``` + +These manual builds do **not** carry a build-provenance attestation — +that comes from CI's OIDC identity. For anything user-facing, run the +real CI workflow. + +## 3. Hot-patch on a live cluster — debugging without a redeploy + +Sometimes you need to swap a binary on a running CVM right now — +faster than re-running `terraform apply` (which propagates env updates +correctly as of provider `0.2.0-beta.3`, but still goes per-CVM and +takes a minute), useful for testing a fix on one CVM before rolling it +cluster-wide, and the only option on clusters running the older +`0.2.0-beta.2` provider where in-place env updates silently no-op'd +(Phala-Network/phala-cloud#246; fixed by +Phala-Network/terraform-provider-phala#8). + +```bash +GW=dstack-pha-prod5.phala.network +APP_ID= +NEW=ttl.sh/dstack-mesh-sidecar-:24h +OLD=$(ssh ... root@${APP_ID}-22.${GW} \ + "docker inspect dstack-sidecar-1 --format '{{.Config.Image}}'") + +ssh ... root@${APP_ID}-22.${GW} " + docker pull $NEW + docker tag $NEW $OLD + cd /tapp && docker compose \ + --env-file /dstack/.host-shared/.decrypted-env \ + -p dstack -f /tapp/docker-compose.yaml \ + up -d --force-recreate sidecar +" +``` + +The retag tricks compose into using the new bits without touching the +declared image string. This bypasses dstack's attestation hashes — +**fine for dev/smoke, not for prod**. Next CVM reboot re-renders the +compose from the platform-encrypted env and reverts to whatever's in +your tfstate. + +## What to bump after a CI publish + +When CI publishes a new `latest` and you want to roll it to a running +cluster: + +1. Decide whether you're pinning to `:latest` (drifts) or to the + `:sha-...` tag from the new run (recommended). Find the new SHA by + inspecting the workflow run's output or `gh run view`. +2. Edit `consul-postgres-ha/cluster-example/terraform.tfvars` + to that pin. +3. `terraform apply`. Per-CVM compose re-renders and the dstack agent + recreates each service. (Or hot-patch per §3 if you want to verify + on one CVM first.) +4. Verify with `gh attestation verify oci://...@` if you want + to be sure the image you're pinning was built by this repo. diff --git a/consul-postgres-ha/README.md b/consul-postgres-ha/README.md new file mode 100644 index 0000000..1a480cc --- /dev/null +++ b/consul-postgres-ha/README.md @@ -0,0 +1,134 @@ +# consul-postgres-ha + +Highly-available PostgreSQL across dstack-TEE CVMs, deployed by `terraform apply`. 
+ +The example shows how to run a stateful workload (HA Postgres via Patroni) +across CVMs that can't talk to each other directly: the platform NATs +every CVM to the same public IP, and there's no L3 mesh between them. +Service-to-service traffic instead rides a userspace overlay +(`mesh-conn`) that uses pion/ICE for NAT traversal and QUIC for +reliable, multiplexed streams. On top of that overlay sit Consul (for +service discovery + leader election KV), Patroni (for Postgres +leader/replica orchestration), and Envoy sidecars (for Connect mTLS). + +You can use this as-is for a 3-replica Patroni cluster, or as a +template — swap Patroni for any other stateful workload, the rest of +the platform plumbing keeps working unchanged. + +## Architecture in one paragraph + +Three **coordinator** CVMs run a Consul server quorum (Raft). Three +**worker** CVMs run Patroni + Postgres + a Consul client agent. All +six are dstack-TEE CVMs hosted behind a provider NAT. One **external +coordinator** (a regular Linux box with a public IP) runs coturn +(STUN/TURN) plus a tiny signaling broker — that's the rendezvous +infrastructure each CVM uses to find peers' ICE candidates; no +data ever passes through it once peers connect. Per-CVM secrets +(TURN HMAC key, Consul gossip key, Connect CA root) are derived from +the dstack platform's KMS at boot — no human in the path. + +For the full topology and layering walkthrough, see +[`ARCHITECTURE.md`](ARCHITECTURE.md). + +## Quick start (~5 minutes after image push) + +Prerequisites: + +- A Phala Cloud account with API credentials at `~/.phala-cloud/credentials.json`. +- A Linux box with a public IP for the external coordinator (coturn + signaling). +- The four container images (`mesh-sidecar`, `patroni`, `webdemo`, + `signaling`) either already published to GHCR (via the CI workflow + on this repo's main branch) or pushed by you to a registry of your + choice. See [`PUBLISHING.md`](PUBLISHING.md). + +```bash +cd consul-postgres-ha/cluster-example +cp terraform.tfvars.example terraform.tfvars +$EDITOR terraform.tfvars # set gateway_domain, image refs, external_* + +export PHALA_CLOUD_API_KEY=$(python3 -c " +import json; d=json.load(open('$HOME/.phala-cloud/credentials.json')) +print(d['profiles'][d['current_profile']]['token'])") + +terraform init +terraform apply -parallelism=1 # phala-cloud#247 needs serial creates +``` + +Once apply finishes, the cluster is HA Postgres on +`coordinator_replicas + worker_replicas` CVMs. Connect to the leader +through any worker's `127.0.0.1:18703+ordinal` (forwarded by mesh-conn +to whichever CVM Patroni elected leader). 
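+
+For a quick post-apply connectivity check from inside a worker CVM you can
+hit that identity port directly. The snippet below is a sketch rather than
+part of the example: port 18703 is worker-3's Postgres identity port, the
+superuser-password path is the one [`FAILOVER.md`](FAILOVER.md) uses, and
+`github.com/lib/pq` is just one convenient driver choice.
+
+```go
+// Post-apply sanity check: connect through a worker's Postgres identity
+// port and confirm who we are. Adjust the port for your worker's ordinal.
+package main
+
+import (
+    "database/sql"
+    "fmt"
+    "log"
+    "os"
+    "strings"
+
+    _ "github.com/lib/pq"
+)
+
+func main() {
+    pw, err := os.ReadFile("/tmp/dstack-runtime/secrets/patroni-superuser")
+    if err != nil {
+        log.Fatal(err)
+    }
+    dsn := fmt.Sprintf(
+        "host=127.0.0.1 port=18703 user=postgres password=%s dbname=postgres sslmode=disable",
+        strings.TrimSpace(string(pw)))
+    db, err := sql.Open("postgres", dsn)
+    if err != nil {
+        log.Fatal(err)
+    }
+    defer db.Close()
+
+    var user string
+    if err := db.QueryRow("SELECT current_user").Scan(&user); err != nil {
+        log.Fatal(err)
+    }
+    fmt.Println("connected as", user)
+}
+```
+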
+ +## What's in this directory + +``` +consul-postgres-ha/ +├── README.md you are here +├── ARCHITECTURE.md the three-layer stack, peer topology, port plan +├── FAILOVER.md soft-kill / hard-kill / disk-loss recipes + measured RTO +├── PUBLISHING.md CI publish flow, manual ttl.sh shortcuts, hot-patch +├── ROBUSTNESS.md where each layer breaks + mitigations +│ +├── cluster-example/ one cluster.tf — opinionated worked example +├── compose/ coordinator.yaml + worker.yaml templates +├── coordinator/ docker-compose for the external coordinator (coturn + signaling) +│ +├── mesh-sidecar/ consolidated platform sidecar image (bootstrap-secrets + mesh-conn + consul + envoy) +├── bootstrap-secrets/ Go source — TEE-derives per-CVM secrets (built into sidecar) +├── mesh-conn/ Go source — QUIC-over-pion/ICE overlay (built into sidecar) +├── patroni/ Patroni + Postgres image +├── webdemo/ example workload sitting on the mesh +├── signaling/ HTTP /publish + /poll broker for ICE auth/candidate exchange +└── quic-on-ice/ standalone smoke test for the QUIC-over-ICE transport +``` + +## Adapting to your own workload + +Three things make this opinionated for Patroni; everything else is +generic platform plumbing. + +| Patroni-specific | Lives in | +|---|---| +| The Patroni image itself | `patroni/` | +| Per-CVM postgres + patroni rest port assignments | `compose/worker.yaml` env block | +| The Patroni service entry in `cluster.tf`'s env | `cluster-example/cluster.tf` | + +To run something else (a Redis cluster, a Kafka broker, your own +stateful service): swap those three pieces, leave `mesh-conn`, +`bootstrap-secrets`, `consul`, `sidecar`, the coordinator topology, +and the Terraform structure as-is. + +## Key operational properties + +| | | +|---|---| +| In-place env updates | Yes — change image tags or env values, `terraform apply`, CVMs update without losing pgdata. Requires provider `phala-network/phala 0.2.0-beta.3+`. | +| Failover RTO | ~24s soft-kill, ~33s hard-kill (default Patroni `ttl=30`). See [`FAILOVER.md`](FAILOVER.md). | +| Cheap rejoin | Yes — a recovered ex-leader replays local WAL and rejoins as a streaming replica without pg_basebackup. | +| Disk-loss rejoin | Yes — Patroni detects empty pgdata, runs full pg_basebackup over the QUIC overlay (~25 MB/s sustained between dstack workers). | +| Build provenance | Sigstore-attested via GitHub Build Provenance on every published image. Verify with `gh attestation verify oci://... --repo Dstack-TEE/dstack-examples`. | + +## Known limitations + +* Each `terraform apply` that fans out more than 1 `phala_app` create + in parallel hits + [`phala-cloud#247`](https://github.com/Phala-Network/phala-cloud/issues/247) + — use `-parallelism=1` for now (~5 min × N to bring-up). +* The mesh-conn admission story is **shared-secret based today** + (TURN HMAC), not attestation-based. Adding TEE attestation as the + admission credential is the next architectural step. + +## Filed upstream + +* [`Phala-Network/terraform-provider-phala#5`](https://github.com/Phala-Network/terraform-provider-phala/issues/5) + — `storage_fs` triggers ForceNew when unset; we explicitly pin + `storage_fs = "zfs"` in `cluster.tf`. +* [`Phala-Network/phala-cloud#247`](https://github.com/Phala-Network/phala-cloud/issues/247) + — concurrent `phala_app` creates against the same workspace return + `400 "configuration parameters not compatible"`. Workaround: + `terraform apply -parallelism=1`. 
+* [`Phala-Network/phala-cloud#242`](https://github.com/Phala-Network/phala-cloud/issues/242) + — `phala cvms list` collapses replicas to one entry. +* [`Phala-Network/phala-cloud#243`](https://github.com/Phala-Network/phala-cloud/issues/243) + — per-instance Terraform resource + `update_policy` + lifecycle + hooks would let `cluster-example/rollout.sh` collapse into HCL. diff --git a/consul-postgres-ha/ROBUSTNESS.md b/consul-postgres-ha/ROBUSTNESS.md new file mode 100644 index 0000000..2c7127a --- /dev/null +++ b/consul-postgres-ha/ROBUSTNESS.md @@ -0,0 +1,349 @@ +# Robustness review + +We've assembled a tower of clever-ish components: CVMs behind a NAT, +ICE hole-punch, QUIC stream multiplexer, identity-port forwarding, Consul + +Envoy mTLS on top. Each layer earns its keep — but that's exactly +when it's worth being honest about how the whole thing fails. + +This doc walks each of the four layers, asks "what breaks, and what +do we do about it?", and lands on a prioritised punch list. + +## Mental model + +``` + Layer 3 apps Consul + Envoy + webdemo + (HashiCorp / Lyft code, well-trodden) + Layer 2 forwarder mesh-conn ~700 LoC: per-peer port plan, + source-port preservation + Layer 1 transport pion/ice + QUIC: punched UDP path, + stream multiplex, flow control, keepalive + Layer 0 rendezvous coturn + signalling broker on a public box; + dstack CVMs behind a provider NAT +``` + +The risks fall into three buckets: + +- **operational**: things that fail in normal life and want + watchdogs, retries, healthchecks, runbooks. +- **structural**: SPOFs, capacity ceilings, missing redundancy. +- **boutique-protocol**: bugs we could write into our 330-LoC + shim that would manifest as hard-to-debug stalls. + +The "are we playing too many tricks?" question really resolves to +the third bucket. Most of the stack uses well-trodden libraries; the +clever-and-ours bits are the identity-port plan and the 3-byte +stream header. Both are simple enough to audit, but exactly because +they're ours, they're the parts that *must* be made robust by hand. + +## Layer 0 — rendezvous infra + +### What's there + +- one public-IP host (currently `155.138.146.255`, Vultr) running + `coturn` (STUN+TURN UDP/TCP) and a Go HTTP signalling broker +- the dstack CVMs themselves, which sit behind Phala's provider NAT + +### What can break + +| failure | impact | recovery | +| --- | --- | --- | +| Coordinator host dies | New peers can't bootstrap. **Existing ICE pairs keep working** (no data flows through this box once handshake is done). New retries from existing peers fail until it's back. | bring it back; peers reconnect on their own. | +| Coordinator ufw / network change | Same as above. | restore ports 3478/udp+tcp, 5349/tcp, 7000/tcp, 49152-49999/udp. | +| TURN shared secret leaks | Anyone can use the box as an open TURN relay (cost / abuse risk). | rotate `TURN_SHARED_SECRET` in coordinator + every CVM env, redeploy. | +| Signalling broker is unauthenticated | Any external actor can publish/poll messages, spoof candidates, intercept ICE handshakes. Currently low-impact only because we're solo. | gate `/publish` + `/poll` on attestation-derived identity (Stage 4 work). | +| dstack provider NAT changes type (e.g. cone → symmetric) | ICE picks TURN relay path. ~150 ms RTT instead of ~6 ms. **Functionality unchanged.** | none needed; coturn covers this fallback. | +| Underlying CVM dies | That peer's services drop out. Consul will mark it `failed` after gossip timeout, Envoy LB removes it within seconds. 
| redeploy; the rest of the cluster is unaffected. | + +### Risk shape + +Coordinator host is a **single point of failure for NEW joins** and +a SPOF for the TURN-relay fallback path. It is **NOT a SPOF for +established traffic** — established peers ICE-direct and don't +touch it. So dying coordinator = "no new peers can join, and any pair +whose direct path goes down can't fail over to TURN until it's back". + +### Recommended fixes + +1. **Run two coordinators in different ASes**, give peers both URLs + in `SIGNALING_URL` / `TURN_HOST` (pion supports a list). One dies + → other still serves. +2. **Treat coordinator as untrusted transport.** That's already the + posture for the data path (Envoy mTLS protects payloads), so + compromise of a coordinator just leaks metadata. The thing that's + *not* covered today is signalling-message spoofing — should add + AppAuth-rooted signatures on `auth` + `candidate` messages so a + compromised broker can't impersonate a peer. + +## Layer 1 — pion/ice + QUIC + +### What's there + +- one ICE connection per peer-pair (6 in our 4-CVM cluster), + established via signalling broker + coturn +- one `QUIC.Session` per ICE conn, with `EnableKeepAlive=true` +- the streams flowing inside (one long-lived UDP-per-port, + on-demand TCP-per-conn) + +### What can break + +| failure | impact | recovery | +| --- | --- | --- | +| ICE conn drops (NAT timeout, route change, peer reboot) | QUIC session ends. All streams over it break. Pumps return errors. | mesh-conn's `runPeerLink` catches the error and re-runs `dialAndPump` after a 5s sleep. | +| ICE state stalls without dropping (pion bug) | Streams hang. QUIC keep-alive ping eventually fails → session ends → restart loop kicks in. | automatic via keep-alive timeout. | +| `pion/ice` panics | Whole mesh-conn process crashes; Docker restart policy `on-failure` brings it back. | automatic; ICE re-handshakes on next start. | +| QUIC session can't be created (handshake mismatch) | mesh-conn errors out, retry loop. | automatic. | +| **Reconnect deadlock** (real bug, see below) | After ICE drop + reconnect, mesh-conn hangs on `<-sess.authCh` because the channel is buffered with one slot already filled by the previous session's auth. | manual restart for now. **Should fix.** | +| Resource exhaustion (many TCP streams) | QUIC per-session limits kick in (256 streams default); new TCP streams to that peer fail. UDP and existing TCP unaffected. | bump `AcceptBacklog` / `MaxIncomingStreams` if it ever hits us at scale. | +| Head-of-line blocking | A big TCP write on one stream briefly delays a UDP datagram or another TCP stream. Imperceptible at Consul scale. | None needed today. If a future workload becomes jitter-sensitive, split into two ICE conns per pair (UDP-only + TCP-only). | + +### The reconnect deadlock — the one real bug + +Looking at `mesh-conn/main.go`: + +```go +type peerSession struct { + agent *ice.Agent + authCh chan [2]string // buffered size 1 +} + +var ( + sessionsMu sync.Mutex + sessions = map[string]*peerSession{} // keyed by remote peer id +) +``` + +`sessions[remoteID]` is created lazily and **never deleted on failure**. +On the second `dialICE` call after a drop: + +- A new `*ice.Agent` is set on the existing session struct. +- `pollLoop` already-delivered the partner's auth into `authCh` once + during the first session, never wrote again because of the + `default` clause in the `select`. 
+- Or, if the partner re-published auth, `pollLoop`'s `select` writes + it but the channel might be empty depending on whether the first + session consumed it. +- Result: under most reconnect orderings the new `dialICE` blocks on + `<-sess.authCh` forever, hitting the 10-minute timeout. + +Fix is straightforward: **clear the session on failure** so the next +attempt starts from a clean state, and treat each `dialICE` as a +fresh round of signalling. Or restructure so each call gets its own +isolated session struct keyed by attempt-id. Maybe ~30 LoC of work. + +This is not yet exercised in production because we haven't had ICE +drops, but it would bite the first time we did. + +### Recommended fixes + +1. **Fix the auth-channel reconnect bug.** As above. Highest priority + single fix in this whole document. +2. **Set a QUIC read deadline on the UDP-stream pumps**, so if a + stream silently stalls (QUIC keep-alive happens at session + level, not stream level), the pump returns and `runPeerLink` + restarts. +3. **Tune QUIC `MaxStreamWindowSize`** if we ever need higher + throughput; default is 256 KB which is fine for now. + +## Layer 2 — mesh-conn forwarder + +### What's there + +- the per-peer port plan (PEERS_JSON) +- the 3-byte stream header (tag, port-uint16-BE) +- the per-stream pumps (UDP length-prefix, TCP raw splice) +- one accept-loop per peer pair to demux incoming streams + +### What can break + +| failure | impact | recovery | +| --- | --- | --- | +| Two peers configured with the same identity port | mesh-conn's `net.ListenUDP` fails on startup; container retries forever, never succeeds. | catch on deploy: validate PEERS_JSON before deploy. | +| Peer count mismatch in PEERS_JSON | `len(self.Ports) != len(peer.Ports)` → connection refused with explicit error. | already handled. | +| Local app binds same port as mesh-conn forwarder for a peer | EADDRINUSE; whichever started second loses. | enforce in compose / startup ordering. | +| **mesh-conn dies** | All peer-pair links from this CVM drop. QUIC + ICE on every other peer notice via keep-alive within ~30 s and tear down. Consul agent gossip-timeouts (~10 s default) drop this CVM from the catalog. Sidecars on other peers stop sending here. | container `restart: on-failure` brings it back; everyone re-handshakes. | +| **Source-port-preservation breaks** (e.g. someone changes port plan and forgets to update an app) | Receiving Consul agent sees gossip from "wrong" address, labels it as a new node, may try to add it to membership; cluster gets confused. | add an integration test that boots cluster + writes KV from each peer + reads from each peer. | +| 3-byte header parse confusion | Receiver gets a malformed header, currently logs and closes the stream. Other streams unaffected. | already handled defensively. | + +### Risk shape + +mesh-conn is the smallest piece of code in the stack but also the +one that is **uniquely ours**. Failures here are the hardest to +diagnose because there's no Stack Overflow for our 3-byte header +protocol. + +The mitigations are mostly testing discipline: + +- a small integration test that brings up 3+ peers in containers + locally, runs cross-peer UDP echo + TCP echo + QUIC burst, on + every CI run. +- a fault-injection mode that randomly kills the ICE conn — to + exercise the reconnect path (which is where the real bug lives). +- explicit logging: the current code logs link-up / link-down / + selected ICE pair / stream counts. Could add periodic stream + count + bytes counters to catch slow leaks. 
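+
+A sketch of what those periodic counters could look like follows. The
+names are illustrative, not existing mesh-conn symbols; the real
+integration point would be the per-stream pumps bumping the counters as
+bytes move.
+
+```go
+// linkStats: cheap per-port counters dumped to stderr on an interval, so
+// slow stream leaks and one-sided byte flows show up in the logs.
+package main
+
+import (
+    "log"
+    "sync"
+    "sync/atomic"
+    "time"
+)
+
+type portCounters struct {
+    streamsOpen atomic.Int64
+    bytesIn     atomic.Int64
+    bytesOut    atomic.Int64
+}
+
+type linkStats struct {
+    mu    sync.Mutex
+    ports map[uint16]*portCounters
+}
+
+func (s *linkStats) port(p uint16) *portCounters {
+    s.mu.Lock()
+    defer s.mu.Unlock()
+    if s.ports == nil {
+        s.ports = map[uint16]*portCounters{}
+    }
+    c, ok := s.ports[p]
+    if !ok {
+        c = &portCounters{}
+        s.ports[p] = c
+    }
+    return c
+}
+
+// report logs one line per identity port every interval; run it in its
+// own goroutine next to the accept loops.
+func (s *linkStats) report(interval time.Duration) {
+    for range time.Tick(interval) {
+        s.mu.Lock()
+        for p, c := range s.ports {
+            log.Printf("port=%d open_streams=%d in=%dB out=%dB",
+                p, c.streamsOpen.Load(), c.bytesIn.Load(), c.bytesOut.Load())
+        }
+        s.mu.Unlock()
+    }
+}
+
+func main() {
+    var stats linkStats
+    go stats.report(30 * time.Second)
+    // a pump would do something like this per accepted stream / read:
+    stats.port(18604).streamsOpen.Add(1)
+    stats.port(18604).bytesOut.Add(1500)
+    time.Sleep(35 * time.Second) // let one report fire, then exit
+}
+```
+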
+ +### Recommended fixes + +1. **Validate PEERS_JSON at startup** — assert no port collisions, + no zero ports, all peers have the same port-list length. Crash + fast with a useful message. +2. **Add a CI test** that runs mesh-conn ↔ mesh-conn locally with + loopback IPs + signalling. Catches protocol-level regressions + without burning CVMs. +3. **Periodic metrics** — counters for streams open/closed, bytes + in/out per port. A `/metrics` endpoint or even just stderr every + 30 s. + +## Layer 3 — Consul + Envoy + apps + +### What's there + +- three Consul servers (Raft quorum) on the coordinator CVMs, three + clients on the worker CVMs +- Connect enabled, default CA, allow intention webdemo→webdemo +- Envoy sidecars front-running each webdemo and Patroni +- gossip key NOT set; RPC TLS NOT set + +### What can break + +| failure | impact | recovery | +| --- | --- | --- | +| One Consul server CVM dies | Quorum survives (2 of 3). All cluster ops continue. | dstack recreates the CVM on next `terraform apply`; Consul rejoins, Raft re-replicates. | +| **Two Consul server CVMs die at once** | No quorum. Workers can still gossip, but: cannot register/deregister services, cannot mint Connect leaf certs, cannot change intentions. Existing Envoy sidecars keep running on cached config; new sidecars block on cert issuance. | bring at least one server back. | +| Worker's Consul agent dies | That worker drops out of the catalog. Existing sidecar keeps running on cached config but new connections to it fail. | container `restart: unless-stopped` brings it back; rejoins automatically. | +| Envoy sidecar dies | All in-flight mTLS connections through it drop. App's calls to `127.0.0.1:19000` get connection refused. | container restart. ~5 s downtime per peer. | +| Connect CA root expiry | All sidecar leaf certs go invalid; whole mesh stops. | `consul connect ca set-config` to rotate root, or default 5-year root won't bite us in this experiment. | +| Connect intention misconfigured (e.g. accidental deny) | Some traffic blocked silently. Sidecar denies are reported as `RBAC: access denied` in Envoy logs. | rotate intention; xDS picks it up in seconds (already demoed). | +| **Gossip key not set** (current state) | Any actor that can see the wire can read gossip messages. Inside our overlay this means any actor with TURN relay creds + an in-path tap. **Practical risk: low while overlay is end-to-end ICE-direct.** **Real risk: medium when relay paths are involved.** | set `-encrypt=$(consul keygen)` on every agent, rotate periodically. | +| **RPC TLS not set** | RPC is plaintext on the overlay. Same threat shape as gossip. | configure Consul auto-encrypt + Connect-CA-issued RPC certs. | +| ttl.sh image expiry | After 24h, a CVM restart can't pull our images. New deploys silently fail to pull. | move to a real registry (GHCR, Phala internal, local registry on the public box). | + +### Risk shape + +The Consul server tier is now redundant (3 servers, Raft, single-CVM +loss survivable). The remaining structural risk is **all three +coordinator CVMs failing simultaneously** — same dstack edge, same +provider — which is rare but not impossible. + +The crypto omissions (gossip key, RPC TLS) are **technically wrong +posture** but practically masked because Layer 3 mTLS already +protects everything that matters. Still want to fix for defence in +depth. + +### Recommended fixes (still open) + +1. **Set a gossip key** — `consul keygen | base64`, pass to every + agent via env, hardcode in compose. +2. 
**Cluster health endpoint** outside Consul — a separate tiny + service that polls `/v1/status/leader` and `/v1/health/state/any` + on each peer and emits red/green. Avoids "we don't know what's + wrong with the cluster" mode. + +### Already shipped + +- **Three Consul servers** — landed in commit `17f4642`. The + coordinator app deploys with `replicas = 3` and Consul agents on + those CVMs run as servers with `bootstrap_expect = 3`. Workers + retry-join through every coordinator's serf port via mesh-conn, + so the single-coordinator-failure scenario stays operational. +- **Real registry** — Sigstore-attested GHCR images via + `.github/workflows/consul-postgres-ha-publish.yml`. See + `PUBLISHING.md`. + +## Cross-layer concerns + +### Boot ordering + +Compose's `depends_on` is start-order only, not health-order. +Currently: + +- mesh-conn must reach link-up before Consul tries to gossip with + peers (otherwise gossip targets won't be reachable, and Consul + will spam `No known Consul servers` for a few seconds). +- Consul must be ready to register services before webdemo tries. +- Sidecar must wait for Consul to have its sidecar definition + registered (already handled — sidecar's entrypoint loops + `consul connect envoy -bootstrap` until it succeeds). + +The transient errors clear up on retry. Adding `healthcheck:` blocks +to each service + `depends_on: { service: { condition: +service_healthy } }` would silence them entirely. + +### Time drift + +TURN credentials are time-bound (1-hour TTL in our derivation). If a +CVM clock drifts more than ~minutes from the coordinator's, TURN auth +fails. dstack CVMs run NTP so this isn't a real concern, but worth +noting for the runbook. + +### Configuration drift / inconsistency + +PEERS_JSON is duplicated across every CVM's deploy env. Keeping them +in sync is a deploy-script discipline today (`deploy_one()` builds it +once, passes the same string to every `phala deploy`). A single +broken character on one CVM and that peer's port plan disagrees with +the others — silently, until something tries to talk to that port. + +Mitigation: keep the deploy logic in a single shell script (already +the pattern), and have mesh-conn validate the JSON at startup — +include a hash in the log so you can `grep` across all peers and +confirm they agree. + +### Restart cascades + +If mesh-conn restarts mid-flight, every peer-pair tears down + re- +handshakes. Consul's RPC + gossip go quiet for ~5–15 s. Envoy +sidecars' upstream watch fires, in-flight RPCs error out, app code +needs to retry. **Most apps retry, so this is mostly fine, but +intermittent restarts can amplify into "everything is flapping".** +Mitigation: Consul + Envoy already have built-in retry / connection +pooling, so the blast radius is bounded. Keep mesh-conn's reconnect +backoff aggressive enough that flapping doesn't compound (5 s is +fine). + +## Prioritised punch list + +In order of worst-impact-per-fix-cost: + +1. **Fix the mesh-conn auth-channel reconnect deadlock** (Layer 1). + First real bug; will bite the first ICE drop. +2. **Add a Consul gossip key + RPC TLS** (Layer 3). 30 minutes of + config; closes the biggest threat-model gap. +3. **Three-server Consul** (Layer 3). Removes the structural SPOF; + needed for any "leave it running" use. +4. **Validate PEERS_JSON at mesh-conn startup** (Layer 2). Cheap, + prevents the silent-port-collision class of bug. +5. **Move images off ttl.sh** (Layer 0/3). 24-hour expiry will bite + us at the worst possible time. +6. 
**Two coordinators** + signed signalling messages (Layer 0). + Removes the new-join SPOF and closes the metadata-spoof gap. +7. **Local CI for mesh-conn** (Layer 2). Catches future protocol + bugs before they hit a CVM. +8. **Periodic metrics on mesh-conn** (Layer 2). Cheap, dramatic + improvement in operability. + +Items 1–5 are essentially what stands between "fun experiment that +demos correctly" and "leave it running and forget about it". +Items 6–8 are the next plateau. + +## "Are we playing too many tricks?" + +Honest answer: not really. Each layer earns its place. + +- The **CVM constraint** (no L3 between peers) forces an overlay. +- The **NAT constraint** forces ICE / hole-punching. +- **Consul's UDP-and-TCP-on-the-same-port** forces a multiplexer + over the punched path. +- Yamux is the obvious multiplexer (HashiCorp uses it inside Consul, + Nomad, and Vault — it's not exotic). +- **Identity-port preservation** is the *one* clever-and-ours + technique, and it's there because Consul's own protocol assumes + every peer can be addressed at the same well-known port set. + +The risk concentration isn't in the count of layers; it's in the +**single piece of code we wrote ourselves** (mesh-conn). That's +exactly the file that needs the attention from the punch list above. + +The other risk concentration is **operational**: SPOFs at the +coordinator and the Consul server. Those are easy fixes and just +need to be done before treating any of this as production. diff --git a/consul-postgres-ha/bootstrap-secrets/go.mod b/consul-postgres-ha/bootstrap-secrets/go.mod new file mode 100644 index 0000000..86b8e2d --- /dev/null +++ b/consul-postgres-ha/bootstrap-secrets/go.mod @@ -0,0 +1,30 @@ +module github.com/Dstack-TEE/dstack-examples/consul-postgres-ha/bootstrap-secrets + +go 1.24.0 + +require github.com/hashicorp/consul/api v1.30.0 + +require ( + github.com/Dstack-TEE/dstack/sdk/go v0.0.0-20260319023310-5cfd7db6e0cc + github.com/ProjectZKM/Ziren/crates/go-runtime/zkvm_runtime v0.0.0-20251001021608-1fe7b43fc4d6 // indirect + github.com/armon/go-metrics v0.4.1 // indirect + github.com/decred/dcrd/dcrec/secp256k1/v4 v4.0.1 // indirect + github.com/ethereum/go-ethereum v1.17.2 // indirect + github.com/fatih/color v1.16.0 // indirect + github.com/hashicorp/errwrap v1.1.0 // indirect + github.com/hashicorp/go-cleanhttp v0.5.2 // indirect + github.com/hashicorp/go-hclog v1.5.0 // indirect + github.com/hashicorp/go-immutable-radix v1.3.1 // indirect + github.com/hashicorp/go-multierror v1.1.1 // indirect + github.com/hashicorp/go-rootcerts v1.0.2 // indirect + github.com/hashicorp/golang-lru v0.5.4 // indirect + github.com/hashicorp/serf v0.10.1 // indirect + github.com/holiman/uint256 v1.3.2 // indirect + github.com/mattn/go-colorable v0.1.13 // indirect + github.com/mattn/go-isatty v0.0.20 // indirect + github.com/mitchellh/go-homedir v1.1.0 // indirect + github.com/mitchellh/mapstructure v1.5.0 // indirect + golang.org/x/crypto v0.45.0 // indirect + golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 // indirect + golang.org/x/sys v0.40.0 // indirect +) diff --git a/consul-postgres-ha/bootstrap-secrets/go.sum b/consul-postgres-ha/bootstrap-secrets/go.sum new file mode 100644 index 0000000..56a43e3 --- /dev/null +++ b/consul-postgres-ha/bootstrap-secrets/go.sum @@ -0,0 +1,239 @@ +github.com/DataDog/datadog-go v3.2.0+incompatible/go.mod h1:LButxg5PwREeZtORoXG3tL4fMGNddJ+vMq1mwgfaqoQ= +github.com/Dstack-TEE/dstack/sdk/go v0.0.0-20260319023310-5cfd7db6e0cc 
h1:0/JqgQNp+atOz7GdUbhwvjgpx3V2xcMXfoU3mbzMbio= +github.com/Dstack-TEE/dstack/sdk/go v0.0.0-20260319023310-5cfd7db6e0cc/go.mod h1:KvaSdZnBZzvbvCZbDF/3EVMpa7FNyRV8ENKPHG/crrI= +github.com/ProjectZKM/Ziren/crates/go-runtime/zkvm_runtime v0.0.0-20251001021608-1fe7b43fc4d6 h1:1zYrtlhrZ6/b6SAjLSfKzWtdgqK0U+HtH/VcBWh1BaU= +github.com/ProjectZKM/Ziren/crates/go-runtime/zkvm_runtime v0.0.0-20251001021608-1fe7b43fc4d6/go.mod h1:ioLG6R+5bUSO1oeGSDxOV3FADARuMoytZCSX6MEMQkI= +github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= +github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= +github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= +github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= +github.com/armon/circbuf v0.0.0-20150827004946-bbbad097214e/go.mod h1:3U/XgcO3hCbHZ8TKRvWD2dDTCfh9M9ya+I9JpbB7O8o= +github.com/armon/go-metrics v0.0.0-20180917152333-f0300d1749da/go.mod h1:Q73ZrmVTwzkszR9V5SSuryQ31EELlFMUz1kKyl939pY= +github.com/armon/go-metrics v0.4.1 h1:hR91U9KYmb6bLBYLQjyM+3j+rcd/UhE+G78SFnF8gJA= +github.com/armon/go-metrics v0.4.1/go.mod h1:E6amYzXo6aW1tqzoZGT755KkbgrJsSdpwZ+3JqfkOG4= +github.com/armon/go-radix v0.0.0-20180808171621-7fddfc383310/go.mod h1:ufUuZ+zHj4x4TnLV4JWEpy2hxWSpsRywHrMgIH9cCH8= +github.com/armon/go-radix v1.0.0/go.mod h1:ufUuZ+zHj4x4TnLV4JWEpy2hxWSpsRywHrMgIH9cCH8= +github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= +github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs= +github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/circonus-labs/circonus-gometrics v2.3.1+incompatible/go.mod h1:nmEj6Dob7S7YxXgwXpfOuvO54S+tGdZdw9fuRZt25Ag= +github.com/circonus-labs/circonusllhist v0.1.3/go.mod h1:kMXHVDlOchFAehlya5ePtbp5jckzBHf4XRpQvBOLI+I= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/decred/dcrd/crypto/blake256 v1.0.0 h1:/8DMNYp9SGi5f0w7uCm6d6M4OU2rGFK09Y2A4Xv7EE0= +github.com/decred/dcrd/crypto/blake256 v1.0.0/go.mod h1:sQl2p6Y26YV+ZOcSTP6thNdn47hh8kt6rqSlvmrXFAc= +github.com/decred/dcrd/dcrec/secp256k1/v4 v4.0.1 h1:YLtO71vCjJRCBcrPMtQ9nqBsqpA1m5sE92cU+pd5Mcc= +github.com/decred/dcrd/dcrec/secp256k1/v4 v4.0.1/go.mod h1:hyedUtir6IdtD/7lIxGeCxkaw7y45JueMRL4DIyJDKs= +github.com/ethereum/go-ethereum v1.17.2 h1:ag6geu0kn8Hv5FLKTpH+Hm2DHD+iuFtuqKxEuwUsDOI= +github.com/ethereum/go-ethereum v1.17.2/go.mod h1:KHcRXfGOUfUmKg51IhQ0IowiqZ6PqZf08CMtk0g5K1o= +github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= +github.com/fatih/color v1.9.0/go.mod h1:eQcE1qtQxscV5RaZvpXrrb8Drkc3/DdQ+uUYCNjL+zU= +github.com/fatih/color v1.13.0/go.mod h1:kLAiJbzzSOZDVNGyDpeOxJ47H46qBXwg5ILebYFFOfk= +github.com/fatih/color v1.16.0 
h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM= +github.com/fatih/color v1.16.0/go.mod h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4NijnWvE= +github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= +github.com/go-kit/kit v0.9.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= +github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9GBnD5lWE= +github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk= +github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= +github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= +github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= +github.com/google/btree v1.0.1 h1:gK4Kx5IaGY9CD5sPJ36FHiBJ6ZXl0kilRiiCj+jdYp4= +github.com/google/btree v1.0.1/go.mod h1:xXMiIv4Fb/0kKde4SpL7qlzvu5cMJDRkFDxJfI9uaxA= +github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= +github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/hashicorp/consul/api v1.30.0 h1:ArHVMMILb1nQv8vZSGIwwQd2gtc+oSQZ6CalyiyH2XQ= +github.com/hashicorp/consul/api v1.30.0/go.mod h1:B2uGchvaXVW2JhFoS8nqTxMD5PBykr4ebY4JWHTTeLM= +github.com/hashicorp/consul/sdk v0.16.1 h1:V8TxTnImoPD5cj0U9Spl0TUxcytjcbbJeADFF07KdHg= +github.com/hashicorp/consul/sdk v0.16.1/go.mod h1:fSXvwxB2hmh1FMZCNl6PwX0Q/1wdWtHJcZ7Ea5tns0s= +github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= +github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY2I= +github.com/hashicorp/errwrap v1.1.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= +github.com/hashicorp/go-cleanhttp v0.5.0/go.mod h1:JpRdi6/HCYpAwUzNwuwqhbovhLtngrth3wmdIIUrZ80= +github.com/hashicorp/go-cleanhttp v0.5.2 h1:035FKYIWjmULyFRBKPs8TBQoi0x6d9G4xc9neXJWAZQ= +github.com/hashicorp/go-cleanhttp v0.5.2/go.mod h1:kO/YDlP8L1346E6Sodw+PrpBSV4/SoxCXGY6BqNFT48= +github.com/hashicorp/go-hclog v1.5.0 h1:bI2ocEMgcVlz55Oj1xZNBsVi900c7II+fWDyV9o+13c= +github.com/hashicorp/go-hclog v1.5.0/go.mod h1:W4Qnvbt70Wk/zYJryRzDRU/4r0kIg0PVHBcfoyhpF5M= +github.com/hashicorp/go-immutable-radix v1.0.0/go.mod h1:0y9vanUI8NX6FsYoO3zeMjhV/C5i9g4Q3DwcSNZ4P60= +github.com/hashicorp/go-immutable-radix v1.3.1 h1:DKHmCUm2hRBK510BaiZlwvpD40f8bJFeZnpfm2KLowc= +github.com/hashicorp/go-immutable-radix v1.3.1/go.mod h1:0y9vanUI8NX6FsYoO3zeMjhV/C5i9g4Q3DwcSNZ4P60= +github.com/hashicorp/go-msgpack v0.5.3/go.mod h1:ahLV/dePpqEmjfWmKiqvPkv/twdG7iPBM1vqhUKIvfM= +github.com/hashicorp/go-msgpack v0.5.5 h1:i9R9JSrqIz0QVLz3sz+i3YJdT7TTSLcfLLzJi9aZTuI= +github.com/hashicorp/go-msgpack v0.5.5/go.mod h1:ahLV/dePpqEmjfWmKiqvPkv/twdG7iPBM1vqhUKIvfM= +github.com/hashicorp/go-multierror v1.0.0/go.mod h1:dHtQlpGsu+cZNNAkkCN/P3hoUDHhCYQXV3UM06sGGrk= +github.com/hashicorp/go-multierror v1.1.0/go.mod h1:spPvp8C1qA32ftKqdAHm4hHTbPw+vmowP0z+KUhOZdA= 
+github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo= +github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM= +github.com/hashicorp/go-retryablehttp v0.5.3/go.mod h1:9B5zBasrRhHXnJnui7y6sL7es7NDiJgTc6Er0maI1Xs= +github.com/hashicorp/go-rootcerts v1.0.2 h1:jzhAVGtqPKbwpyCPELlgNWhE1znq+qwJtW5Oi2viEzc= +github.com/hashicorp/go-rootcerts v1.0.2/go.mod h1:pqUvnprVnM5bf7AOirdbb01K4ccR319Vf4pU3K5EGc8= +github.com/hashicorp/go-sockaddr v1.0.0/go.mod h1:7Xibr9yA9JjQq1JpNB2Vw7kxv8xerXegt+ozgdvDeDU= +github.com/hashicorp/go-sockaddr v1.0.2 h1:ztczhD1jLxIRjVejw8gFomI1BQZOe2WoVOu0SyteCQc= +github.com/hashicorp/go-sockaddr v1.0.2/go.mod h1:rB4wwRAUzs07qva3c5SdrY/NEtAUjGlgmH/UkBUC97A= +github.com/hashicorp/go-syslog v1.0.0/go.mod h1:qPfqrKkXGihmCqbJM2mZgkZGvKG1dFdvsLplgctolz4= +github.com/hashicorp/go-uuid v1.0.0/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= +github.com/hashicorp/go-uuid v1.0.1/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= +github.com/hashicorp/go-uuid v1.0.3 h1:2gKiV6YVmrJ1i2CKKa9obLvRieoRGviZFL26PcT/Co8= +github.com/hashicorp/go-uuid v1.0.3/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= +github.com/hashicorp/go-version v1.2.1 h1:zEfKbn2+PDgroKdiOzqiE8rsmLqU2uwi5PB5pBJ3TkI= +github.com/hashicorp/go-version v1.2.1/go.mod h1:fltr4n8CU8Ke44wwGCBoEymUuxUHl09ZGVZPK5anwXA= +github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= +github.com/hashicorp/golang-lru v0.5.4 h1:YDjusn29QI/Das2iO9M0BHnIbxPeyuCHsjMW+lJfyTc= +github.com/hashicorp/golang-lru v0.5.4/go.mod h1:iADmTwqILo4mZ8BN3D2Q6+9jd8WM5uGBxy+E8yxSoD4= +github.com/hashicorp/logutils v1.0.0/go.mod h1:QIAnNjmIWmVIIkWDTG1z5v++HQmx9WQRO+LraFDTW64= +github.com/hashicorp/mdns v1.0.4/go.mod h1:mtBihi+LeNXGtG8L9dX59gAEa12BDtBQSp4v/YAJqrc= +github.com/hashicorp/memberlist v0.5.0 h1:EtYPN8DpAURiapus508I4n9CzHs2W+8NZGbmmR/prTM= +github.com/hashicorp/memberlist v0.5.0/go.mod h1:yvyXLpo0QaGE59Y7hDTsTzDD25JYBZ4mHgHUZ8lrOI0= +github.com/hashicorp/serf v0.10.1 h1:Z1H2J60yRKvfDYAOZLd2MU0ND4AH/WDz7xYHDWQsIPY= +github.com/hashicorp/serf v0.10.1/go.mod h1:yL2t6BqATOLGc5HF7qbFkTfXoPIY0WZdWHfEvMqbG+4= +github.com/holiman/uint256 v1.3.2 h1:a9EgMPSC1AAaj1SZL5zIQD3WbwTuHrMGOerLjGmM/TA= +github.com/holiman/uint256 v1.3.2/go.mod h1:EOMSn4q6Nyt9P6efbI3bueV4e1b3dGlUCXeiRV4ng7E= +github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= +github.com/json-iterator/go v1.1.9/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= +github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w= +github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= +github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc= +github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/mattn/go-colorable v0.0.9/go.mod h1:9vuHe8Xs5qXnSaW/c/ABM9alt+Vo+STaOChaDxuIBZU= +github.com/mattn/go-colorable v0.1.4/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE= +github.com/mattn/go-colorable v0.1.6/go.mod h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc= +github.com/mattn/go-colorable v0.1.9/go.mod h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc= +github.com/mattn/go-colorable 
v0.1.12/go.mod h1:u5H1YNBxpqRaxsYJYSkiCWKzEfiAb1Gb520KVy5xxl4= +github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= +github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= +github.com/mattn/go-isatty v0.0.3/go.mod h1:M+lRXTBqGeGNdLjl/ufCoiOlB5xdOkqRJdNxMWT7Zi4= +github.com/mattn/go-isatty v0.0.8/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= +github.com/mattn/go-isatty v0.0.11/go.mod h1:PhnuNfih5lzO57/f3n+odYbM4JtupLOxQOAqxQCu2WE= +github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU= +github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94= +github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= +github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= +github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= +github.com/miekg/dns v1.1.26/go.mod h1:bPDLeHnStXmXAq1m/Ch/hvfNHr14JKNPMBo3VZKjuso= +github.com/miekg/dns v1.1.41 h1:WMszZWJG0XmzbK9FEmzH2TVcqYzFesusSIB41b8KHxY= +github.com/miekg/dns v1.1.41/go.mod h1:p6aan82bvRIyn+zDIv9xYNUpwa73JcSh9BKwknJysuI= +github.com/mitchellh/cli v1.1.0/go.mod h1:xcISNoH86gajksDmfB23e/pu+B+GeFRMYmoHXxx3xhI= +github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y= +github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= +github.com/mitchellh/mapstructure v0.0.0-20160808181253-ca63d7c062ee/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= +github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= +github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= +github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= +github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= +github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= +github.com/pascaldekloe/goe v0.1.0 h1:cBOtyMzM9HTpWjXfbbunk26uA6nG3a8n06Wieeh0MwY= +github.com/pascaldekloe/goe v0.1.0/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= +github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/posener/complete v1.1.1/go.mod h1:em0nMJCgc9GFtwrmVmEMR/ZL6WyhyjMBndrE9hABlRI= 
+github.com/posener/complete v1.2.3/go.mod h1:WZIdtGGp+qx0sLrYKtIRAruyNpv6hFCicSgv7Sy7s/s= +github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= +github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo= +github.com/prometheus/client_golang v1.4.0/go.mod h1:e9GMxYsXl05ICDXkRhurwBS4Q3OK1iX/F2sw+iXX5zU= +github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= +github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= +github.com/prometheus/client_model v0.2.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= +github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= +github.com/prometheus/common v0.9.1/go.mod h1:yhUN8i9wzaXS3w1O07YhxHEBxD+W35wd8bs7vj7HSQ4= +github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= +github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= +github.com/prometheus/procfs v0.0.8/go.mod h1:7Qr8sr6344vo1JqZ6HhLceV9o3AJ1Ff+GxbHq6oeK9A= +github.com/ryanuber/columnize v0.0.0-20160712163229-9b3edd62028f/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts= +github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529 h1:nn5Wsu0esKSJiIVhscUtVbo7ada43DJhG55ua/hjS5I= +github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc= +github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= +github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.5.0 h1:1zr/of2m5FGMsad5YfcqgdqdWrIhu+EBEJRhR1U7z/c= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/stretchr/testify v1.7.2/go.mod h1:R6va5+xMeoiuVRoj+gSkQ7d3FALtqAAGI1FQKckRals= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/tv42/httpunix v0.0.0-20150427012821-b75d8614f926/go.mod h1:9ESjWnEqriFuLhtthL60Sar/7RFoluCcXsuvEwTV5KM= +golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20190923035154-9ee001bba392/go.mod h1:/lpIB1dKB+9EgE3H3cr1v9wB50oz8l4C4h62xy7jSTY= +golang.org/x/crypto v0.45.0 h1:jMBrvKuj23MTlT0bQEOBcAE0mjg8mK9RXFhRH6nyF3Q= +golang.org/x/crypto v0.45.0/go.mod h1:XTGrrkGJve7CYK7J8PEww4aY7gM3qMCElcJQ8n8JdX4= +golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 h1:m64FZMko/V45gv0bNmrNYoDEq8U5YUhetc9cBWKS1TQ= +golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63/go.mod h1:0v4NqG35kSWCMzLaMeX+IQrlSnVE/bqGSyC2cz/9Le8= +golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod 
h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190613194153-d28f0bde5980/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20190923162816-aa69164e4478/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20210410081132-afb366fc7cd1/go.mod h1:9tjilg8BloeKEkVJvy7fQ90B1CfIiPueXVOjqfkSzI8= +golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY= +golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU= +golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190922100055-0a153f010e69/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190924154521-2837fb4f24fe/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200122134326-e047566fdf82/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210303074136-134d130e1a04/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210330210617-4fbd30eecc44/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20210927094055-39ccf1dd6fa6/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220503163025-988cb79eb6c6/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220728004956-3c1f35247d10/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod 
h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ= +golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190907020128-2ca718005c18/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.5/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/consul-postgres-ha/bootstrap-secrets/main.go b/consul-postgres-ha/bootstrap-secrets/main.go new file mode 100644 index 0000000..3813440 --- /dev/null +++ b/consul-postgres-ha/bootstrap-secrets/main.go @@ -0,0 +1,342 @@ +// bootstrap-secrets — stage 4 init container. +// +// One-shot. Runs to completion before any other service starts on a CVM. +// Responsibilities: +// +// 1. Use the dstack Go SDK to learn this CVM's identity (AppID, +// InstanceID, ComposeHash) and to derive cluster-wide secrets +// (gossip key, TURN secret, Connect-CA seed) deterministically +// from the app's KMS-bound key. Same secrets across every +// replica of the same phala_app, never visible to the deploy +// host. +// +// 2. Claim a stable ordinal (0..N-1) for this CVM by atomic-CAS-ing +// a slot in Consul KV (workers only — the coordinator is always +// ordinal 0). The InstanceID is the slot's permanent owner so +// restarts re-find their own slot. +// +// 3. Write everything dependent services need to a tmpfs volume +// shared via compose. /run/secrets/{gossip,turn,ca-seed} are +// mode-0400 binary blobs; /run/instance/info.json carries the +// identity + ordinal + computed per-protocol ports. +// +// 4. Exit 0 so compose `depends_on` with +// `condition: service_completed_successfully` can release the +// next services. +// +// The keystone of the stage-4 design is here: this is the only piece +// that holds plaintext secret material, and it does so entirely +// inside the TEE. The deploy host never sees them. 
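+//
+// For illustration, a worker that ends up at ordinal 3 gets a
+// /run/instance/info.json shaped roughly like this (values are
+// hypothetical; the port numbers assume the example PROTOCOL_BASES
+// from cluster-example/cluster.tf, e.g. serf_lan base 18000):
+//
+//	{
+//	  "instance_id": "...",
+//	  "app_id": "...",
+//	  "compose_hash": "...",
+//	  "cluster_name": "demo",
+//	  "role": "worker",
+//	  "ordinal": 3,
+//	  "ports": { "serf_lan": 18003, "http_api": 18203, "webdemo": 18503, ... }
+//	}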
+ +package main + +import ( + "context" + "crypto/sha256" + "encoding/base64" + "encoding/hex" + "encoding/json" + "flag" + "fmt" + "log" + "os" + "path/filepath" + "strconv" + "strings" + "time" + + dstack "github.com/Dstack-TEE/dstack/sdk/go/dstack" + consulapi "github.com/hashicorp/consul/api" +) + +func main() { + flag.Parse() + cfg := loadConfig() + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer cancel() + + // 1. Identity from dstack SDK. + client := dstack.NewDstackClient() + info, err := client.Info(ctx) + if err != nil { + log.Fatalf("dstack Info: %v", err) + } + log.Printf("dstack: app_id=%s instance_id=%s compose_hash=%s", + info.AppID, info.InstanceID, shortHash(info.ComposeHash)) + + // 2. Derive cluster-wide secrets. Same path/purpose triple + // returns the same 32 bytes on every replica of this app. + // Each secret has a name, a derivation path, and a serialisation + // format that matches what its consumer expects: + // gossip: consul agent's -encrypt= wants base64. + // turn: coturn's --static-auth-secret takes any string; + // we use hex for compactness. + // ca-seed: just bytes we re-derive into a Connect CA root; + // hex is fine. + derived := []struct { + name, path, format string + }{ + {"gossip", "dstack-mesh/gossip", "base64"}, + {"turn", "dstack-mesh/turn", "hex"}, + {"ca-seed", "dstack-mesh/connect-ca", "hex"}, + // Patroni superuser + replication passwords. Both are random + // 32-byte hex strings; identical on every replica because all + // peers derive against the same path + ClusterName. + {"patroni-superuser", "dstack-mesh/patroni-superuser", "hex"}, + {"patroni-replication", "dstack-mesh/patroni-replication", "hex"}, + } + for _, d := range derived { + seed, err := client.GetKey(ctx, d.path, cfg.ClusterName, "secp256k1") + if err != nil { + log.Fatalf("GetKey %s: %v", d.path, err) + } + keyBytes, err := seed.DecodeKey() + if err != nil { + log.Fatalf("decode %s: %v", d.path, err) + } + if err := writeSecretEncoded("/run/secrets/"+d.name, keyBytes, d.format); err != nil { + log.Fatalf("write %s: %v", d.name, err) + } + log.Printf("derived %s (%d bytes, %s-encoded) -> /run/secrets/%s", + d.name, len(keyBytes), d.format, d.name) + } + + // 3. Ordinal selection. + // Three sources, in order of preference: + // a. WORKER_ORDINAL or COORDINATOR_ORDINAL env (set by + // cluster.tf when each peer is its own phala_app — + // sidesteps the Consul-bootstrap chicken-and-egg). + // b. Consul KV CAS (the multi-server / dynamic case once + // phala-cloud#243 lets us pass per-instance env to a + // replicas:N app). + ordinal := 0 + switch { + case cfg.WorkerOrdinal > 0: + ordinal = cfg.WorkerOrdinal + log.Printf("ordinal from WORKER_ORDINAL env: %d", ordinal) + case cfg.HasCoordinatorOrdinal: + // Coordinator ordinal can be 0 (the first coord), so we use + // a separate "set?" flag instead of >0. + ordinal = cfg.CoordinatorOrdinal + log.Printf("ordinal from COORDINATOR_ORDINAL env: %d", ordinal) + default: + var err error + ordinal, err = claimOrdinal(cfg, info.InstanceID) + if err != nil { + log.Fatalf("ordinal claim: %v", err) + } + } + + // 4. Compute per-protocol ports for this ordinal. 
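+	// With the example bases from cluster-example/cluster.tf
+	// (serf_lan 18000, http_api 18200, ...), a peer at ordinal 2
+	// would get serf_lan 18002, http_api 18202, and so on.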
+ ports := computePorts(cfg.ProtocolBases, ordinal) + + instance := InstanceInfo{ + InstanceID: info.InstanceID, + AppID: info.AppID, + ComposeHash: info.ComposeHash, + ClusterName: cfg.ClusterName, + Role: cfg.Role, + Ordinal: ordinal, + Ports: ports, + } + if err := writeJSON("/run/instance/info.json", instance); err != nil { + log.Fatalf("write instance info: %v", err) + } + + log.Printf("bootstrap done: role=%s ordinal=%d ports=%v", cfg.Role, ordinal, ports) +} + +// ============================================================================= +// config +// ============================================================================= + +type Config struct { + ClusterName string + Role string // coordinator | worker + ConsulHTTPAddr string // 127.0.0.1: on the local agent + ExpectedReplicas int // upper bound on ordinal slots to try + ProtocolBases map[string]int + WorkerOrdinal int // optional, set by cluster.tf per-worker (>0) + CoordinatorOrdinal int // optional, set by cluster.tf per-coordinator (>=0) + HasCoordinatorOrdinal bool // distinguishes ordinal=0 from "unset" +} + +func loadConfig() *Config { + cfg := &Config{ + ClusterName: mustEnv("CLUSTER_NAME"), + Role: mustEnv("ROLE"), + ConsulHTTPAddr: os.Getenv("CONSUL_HTTP_ADDR"), // empty for coordinator + ExpectedReplicas: 16, // generous upper bound + } + if v := os.Getenv("WORKER_ORDINAL"); v != "" { + n, err := strconv.Atoi(v) + if err != nil || n < 1 { + log.Fatalf("WORKER_ORDINAL invalid: %q", v) + } + cfg.WorkerOrdinal = n + } + if v := os.Getenv("COORDINATOR_ORDINAL"); v != "" { + n, err := strconv.Atoi(v) + if err != nil || n < 0 { + log.Fatalf("COORDINATOR_ORDINAL invalid: %q", v) + } + cfg.CoordinatorOrdinal = n + cfg.HasCoordinatorOrdinal = true + } + // PROTOCOL_BASES: JSON object of name -> base port. + rawBases := mustEnv("PROTOCOL_BASES") + if err := json.Unmarshal([]byte(rawBases), &cfg.ProtocolBases); err != nil { + log.Fatalf("PROTOCOL_BASES not valid JSON: %v", err) + } + if r := os.Getenv("EXPECTED_REPLICAS"); r != "" { + n, err := strconv.Atoi(r) + if err != nil || n <= 0 { + log.Fatalf("EXPECTED_REPLICAS invalid: %v", err) + } + cfg.ExpectedReplicas = n + } + return cfg +} + +func mustEnv(k string) string { + v := os.Getenv(k) + if v == "" { + log.Fatalf("missing env %s", k) + } + return v +} + +// ============================================================================= +// ordinal claim — Consul KV CAS +// ============================================================================= + +// claimOrdinal walks slot indices 0..ExpectedReplicas-1, finds either +// +// - a slot whose value is already this InstanceID (we're rejoining), or +// - the lowest empty slot (CAS-claim it). +// +// First match wins. Returns the slot index. Slot ownership is +// permanent for the InstanceID's lifetime; cleanup of stale slots +// (when an instance is permanently retired) is a separate operator +// task — note in stage-4 README. +func claimOrdinal(cfg *Config, instanceID string) (int, error) { + if cfg.ConsulHTTPAddr == "" { + return 0, fmt.Errorf("CONSUL_HTTP_ADDR required for non-coordinator role") + } + cli, err := consulapi.NewClient(&consulapi.Config{ + Address: cfg.ConsulHTTPAddr, + Scheme: "http", + }) + if err != nil { + return 0, fmt.Errorf("consul client: %w", err) + } + kv := cli.KV() + + keyPrefix := fmt.Sprintf("cluster/%s/slots", cfg.ClusterName) + + // Retry the whole walk a few times — pollLoop racing with peers + // could cause CAS misses; on a miss, try the next slot or restart + // the walk. 
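+	// In the kv.CAS call below, ModifyIndex 0 means "create only if
+	// the key does not exist yet", so two peers racing for the same
+	// empty slot cannot both win it.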
+ for attempt := 0; attempt < 20; attempt++ { + for i := 0; i < cfg.ExpectedReplicas; i++ { + key := fmt.Sprintf("%s/%d", keyPrefix, i) + existing, _, err := kv.Get(key, nil) + if err != nil { + return 0, fmt.Errorf("kv get %s: %w", key, err) + } + switch { + case existing != nil && string(existing.Value) == instanceID: + log.Printf("rejoining slot %d (already owned)", i) + return i, nil + case existing == nil: + ok, _, err := kv.CAS(&consulapi.KVPair{ + Key: key, + Value: []byte(instanceID), + ModifyIndex: 0, + }, nil) + if err != nil { + return 0, fmt.Errorf("kv cas %s: %w", key, err) + } + if ok { + log.Printf("claimed slot %d (fresh)", i) + return i, nil + } + // CAS lost the race; some other peer claimed + // this slot first. Try the next slot. + default: + // owned by another instance; skip + } + } + // Exhausted slots without claiming or rejoining; either the + // cluster is over-replicated or there's a stale slot. Sleep + // briefly and retry — gives a slot a chance to clear if a + // peer is in transient state. + time.Sleep(2 * time.Second) + } + return 0, fmt.Errorf("no available slot in cluster %q (max=%d) — cluster over-replicated or has stale slots", + cfg.ClusterName, cfg.ExpectedReplicas) +} + +// ============================================================================= +// instance info + tmpfs writes +// ============================================================================= + +type InstanceInfo struct { + InstanceID string `json:"instance_id"` + AppID string `json:"app_id"` + ComposeHash string `json:"compose_hash"` + ClusterName string `json:"cluster_name"` + Role string `json:"role"` + Ordinal int `json:"ordinal"` + Ports map[string]int `json:"ports"` +} + +func computePorts(bases map[string]int, ordinal int) map[string]int { + out := make(map[string]int, len(bases)) + for name, base := range bases { + out[name] = base + ordinal + } + return out +} + +// writeSecretEncoded writes b to path with the given encoding. 0444 +// because non-root sibling containers (coturn) need to read these; +// the trust boundary is the TEE itself, not the unix uid. +func writeSecretEncoded(path string, b []byte, format string) error { + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + return err + } + switch format { + case "raw": + return os.WriteFile(path, b, 0o444) + case "hex": + return os.WriteFile(path, []byte(hex.EncodeToString(b)), 0o444) + case "base64": + return os.WriteFile(path, []byte(base64.StdEncoding.EncodeToString(b)), 0o444) + default: + return fmt.Errorf("unknown encoding %q", format) + } +} + +func writeJSON(path string, v any) error { + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + return err + } + b, err := json.MarshalIndent(v, "", " ") + if err != nil { + return err + } + return os.WriteFile(path, append(b, '\n'), 0o444) +} + +func shortHash(s string) string { + if len(s) < 12 { + return s + } + return s[:12] + "..." +} + +// silence unused import on Linux if go vet complains about strings +var _ = strings.HasPrefix +var _ = sha256.New diff --git a/consul-postgres-ha/cluster-example/cluster.tf b/consul-postgres-ha/cluster-example/cluster.tf new file mode 100644 index 0000000..70d80cd --- /dev/null +++ b/consul-postgres-ha/cluster-example/cluster.tf @@ -0,0 +1,239 @@ +# Stage 4 — example cluster. +# +# This whole HCL file IS the cluster definition. 
To bring up a 4-instance +# Consul + Connect mesh on dstack: +# +# PHALA_CLOUD_API_KEY=$(your token) terraform apply +# +# Adding a worker is a `replicas` bump on phala_app.worker; terraform +# apply propagates the new PEERS_JSON to every CVM via in-place env +# update (no destroy/recreate; disks survive — verified in +# stage4-experiments/disk-persistence/). + +terraform { + required_version = ">= 1.5" + required_providers { + phala = { + source = "phala-network/phala" + # 0.2.0-beta.3 is the first version where in-place env-block + # updates actually take effect — earlier betas silently no-op'd + # them (Phala-Network/phala-cloud#246, fixed in + # Phala-Network/terraform-provider-phala#8). Pin exactly because + # Terraform's `>=` operator doesn't include later prerelease + # versions; bump this line by hand when a newer beta ships. + version = "0.2.0-beta.3" + } + } +} + +provider "phala" {} + +# ---------- Cluster knobs ---------- + +variable "cluster_name" { + type = string + default = "demo" +} + +variable "coordinator_replicas" { + type = number + default = 3 + description = "Number of voting Consul-server CVMs. 3 gives fault tolerance of 1; 5 of 2." +} + +variable "worker_replicas" { + type = number + default = 3 +} + +variable "gateway_domain" { + type = string + description = "Phala dstack gateway domain (e.g. dstack-pha-prod5.phala.network)" +} + +# Image references. Gap 2 collapsed bootstrap-secrets, mesh-conn, the +# legacy keepalive placeholder, and the old envoy-only sidecar into +# one `mesh_sidecar_image` (consul-postgres-ha-mesh-sidecar) — workers +# and coordinators both reference it and the entrypoint dispatches on +# ROLE. The `signaling` image is still published by CI (used by the +# external Vultr coordinator), but no dstack CVM in this cluster +# references it, so it isn't a Terraform input here. +variable "mesh_sidecar_image" { type = string } +variable "webdemo_image" { type = string } +variable "patroni_image" { type = string } + +# External coordinator (Vultr coturn + signaling box). Used until +# Phala admin enables UDP ingress on dstack apps; once that lands we +# can host coturn + signaling inside the dstack mesh and drop these +# external_* vars. The dstack-coordinator compose no longer carries +# unused local copies of those services. +variable "external_coordinator_host" { type = string } +variable "external_signaling_url" { type = string } +variable "external_turn_secret" { + type = string + sensitive = true +} + +# ---------- Protocol port plan ---------- + +locals { + # Index i is the same protocol on every peer; the per-peer port for + # protocol `name` at ordinal `n` is base + n. mesh-conn reads + # /run/instance/info.json for this peer's actual ports (computed by + # bootstrap-secrets from PROTOCOL_BASES + the ordinal it claimed). + protocol_bases = { + serf_lan = 18000 + server_rpc = 18100 + http_api = 18200 + grpc = 18300 + webdemo = 18500 + sidecar_public = 18600 + postgres = 18700 # Patroni-managed PostgreSQL listen + patroni_rest = 18800 # Patroni REST API (peer health, leader query) + } + + # The full peer list, identical on every CVM. Coordinators occupy + # ordinals 0..C-1 (where C = coordinator_replicas), workers fill + # ordinals C..C+W-1. PEERS_JSON is what mesh-conn consumes; the + # role-ordinal pair is what each peer self-identifies as in its + # bootstrap-secrets-derived /run/instance/info.json (mesh-conn then + # reads "-" as its self ID). 
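+  # With the default 3 coordinators + 3 workers this yields ids
+  # coordinator-0..coordinator-2 (ordinals 0-2) and
+  # worker-3..worker-5 (ordinals 3-5); e.g. worker-3's serf_lan
+  # port is 18000 + 3 = 18003.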
+ peers = concat( + [ + for i in range(var.coordinator_replicas) : { + id = "coordinator-${i}" + ordinal = i + role = "coordinator" + } + ], + [ + for i in range(var.worker_replicas) : { + # ID must match mesh-conn's self_id, which is `role-ordinal`, + # NOT slot. Workers occupy ordinals C..C+W-1. + id = "worker-${i + var.coordinator_replicas}" + ordinal = i + var.coordinator_replicas + role = "worker" + } + ], + ) + + peers_json = jsonencode([ + for p in local.peers : { + id = p.id + ports = [for proto, base in local.protocol_bases : base + p.ordinal] + } + ]) + + protocol_bases_json = jsonencode(local.protocol_bases) + + # Comma-separated lists of coordinator-ordinal-shifted ports. Workers + # use COORDINATOR_SERF_PORTS to retry-join EVERY coordinator, and + # COORDINATOR_HTTP_PORTS to pick ANY coordinator's HTTP API for + # KV-CAS bootstrapping. Coordinators use COORDINATOR_SERF_PORTS to + # gossip-join their server peers (consul -bootstrap-expect=N). + coordinator_serf_ports = join(",", [for i in range(var.coordinator_replicas) : tostring(local.protocol_bases.serf_lan + i)]) + coordinator_http_ports = join(",", [for i in range(var.coordinator_replicas) : tostring(local.protocol_bases.http_api + i)]) + + # First coordinator's HTTP port — used as a single endpoint for the + # consul-ui output and for legacy single-coord callers. + coordinator_http_port_first = local.protocol_bases.http_api + 0 +} + +# ---------- Coordinator ---------- + +resource "phala_app" "coordinator" { + # One phala_app per coordinator (with replicas:1) — same per-resource + # ordinal pattern as workers, same chicken-and-egg sidestep + # (bootstrap-secrets needs to know its own ordinal before Consul is + # reachable, since Consul is on the coordinators themselves). + for_each = { for i in range(var.coordinator_replicas) : tostring(i) => i } + + name = "${var.cluster_name}-coordinator-${each.key}" + size = "tdx.small" + region = "US-WEST-1" + disk_size = 20 + replicas = 1 + storage_fs = "zfs" # MUST pin (terraform-provider-phala#5) + docker_compose = file("${path.module}/../compose/coordinator.yaml") + + env = { + CLUSTER_NAME = var.cluster_name + PROTOCOL_BASES = local.protocol_bases_json + PEERS_JSON = local.peers_json + COORDINATOR_ORDINAL = tostring(each.value) + BOOTSTRAP_EXPECT = tostring(var.coordinator_replicas) + COORDINATOR_SERF_PORTS = local.coordinator_serf_ports + SIGNALING_URL = var.external_signaling_url + TURN_HOST = var.external_coordinator_host + TURN_SHARED_SECRET = var.external_turn_secret + MESH_SIDECAR_IMAGE = var.mesh_sidecar_image + } + + listed = false + public_logs = true + public_sysinfo = false + + wait_for_ready = true + wait_timeout_seconds = 600 +} + +# ---------- Workers ---------- + +resource "phala_app" "worker" { + # One phala_app per worker (with replicas:1) instead of a single + # app with replicas:N. Reason: each worker needs its OWN ordinal + # passed in via env so bootstrap-secrets can write the correct + # /run/instance/info.json without a Consul KV CAS round-trip. + # The CAS path has a chicken-and-egg: workers need Consul to + # claim an ordinal, but Consul (on the coordinator) is reached + # via mesh-conn, which depends on bootstrap-secrets having + # finished. Per-worker resources sidestep this entirely. + # + # Once phala-cloud#243 lands phala_app_instance + per-instance + # env, this reverts to one resource with replicas:N + per-instance + # env block. 
+ # Key is the worker's 1-based slot (used in the CVM name); value is + # the cluster-wide ordinal (= slot + coordinator_replicas, since + # coordinators occupy ordinals 0..C-1). + for_each = { for i in range(var.worker_replicas) : tostring(i + 1) => i + var.coordinator_replicas } + + name = "${var.cluster_name}-worker-${each.key}" + size = "tdx.small" + region = "US-WEST-1" + disk_size = 20 + replicas = 1 + storage_fs = "zfs" + docker_compose = file("${path.module}/../compose/worker.yaml") + + env = { + CLUSTER_NAME = var.cluster_name + PROTOCOL_BASES = local.protocol_bases_json + PEERS_JSON = local.peers_json + WORKER_ORDINAL = tostring(each.value) + EXPECTED_REPLICAS = var.worker_replicas + var.coordinator_replicas + COORDINATOR_SERF_PORTS = local.coordinator_serf_ports + COORDINATOR_HTTP_PORTS = local.coordinator_http_ports + SIGNALING_URL = var.external_signaling_url + TURN_HOST = var.external_coordinator_host + TURN_SHARED_SECRET = var.external_turn_secret + MESH_SIDECAR_IMAGE = var.mesh_sidecar_image + WEBDEMO_IMAGE = var.webdemo_image + PATRONI_IMAGE = var.patroni_image + } + + listed = false + public_logs = true + public_sysinfo = false + + wait_for_ready = true + wait_timeout_seconds = 600 + + depends_on = [phala_app.coordinator] +} + +output "coordinator_app_ids" { value = { for k, c in phala_app.coordinator : k => c.app_id } } +output "worker_app_ids" { value = { for k, w in phala_app.worker : k => w.app_id } } +output "consul_ui" { + # Any coordinator's HTTP port serves the UI. Pick coord-0 by convention. + value = "https://${phala_app.coordinator["0"].app_id}-${local.coordinator_http_port_first}s.${var.gateway_domain}/ui" +} diff --git a/consul-postgres-ha/cluster-example/rollout.sh b/consul-postgres-ha/cluster-example/rollout.sh new file mode 100755 index 0000000..ef9fa7a --- /dev/null +++ b/consul-postgres-ha/cluster-example/rollout.sh @@ -0,0 +1,166 @@ +#!/usr/bin/env bash +# Stage 4 — workload-aware rolling update driver. +# +# Until phala-cloud#243 lands `phala_app.update_policy`, the platform's +# in-place app update touches all replicas in unspecified order. That's +# fine for stateless workers but dangerous for the Consul quorum (and +# any other leader-bearing workload). This script drives the rollout +# from outside Terraform with workload-aware drains between replica +# updates. +# +# Usage: +# ./rollout.sh # full rolling update (apply per-app, gated) +# ./rollout.sh --app worker # roll only the worker app +# ./rollout.sh --plan # show what would happen, don't apply +# +# Requires: +# PHALA_CLOUD_API_KEY env (or terraform `phala` provider config) +# terraform CLI on PATH +# A working overlay (mesh-conn + Consul) so we can query cluster health. 
+ +set -euo pipefail + +# ---------- Config ---------- + +CLUSTER_NAME="${CLUSTER_NAME:-demo}" +GATEWAY_DOMAIN="${GATEWAY_DOMAIN:-dstack-pha-prod5.phala.network}" +COORDINATOR_HTTP_PORT="${COORDINATOR_HTTP_PORT:-18200}" +MIN_READY_SECONDS="${MIN_READY_SECONDS:-30}" +HEALTH_TIMEOUT_SECONDS="${HEALTH_TIMEOUT_SECONDS:-180}" + +PLAN_ONLY=false +APP_FILTER="" +while [[ $# -gt 0 ]]; do + case "$1" in + --plan) PLAN_ONLY=true; shift ;; + --app) APP_FILTER="$2"; shift 2 ;; + -h|--help) + sed -n '2,30p' "$0" + exit 0 ;; + *) echo "unknown arg: $1" >&2; exit 2 ;; + esac +done + +# ---------- Helpers ---------- + +CONSUL_BASE="" + +resolve_consul_base() { + local coord_id + coord_id=$(terraform output -raw coordinator_app_id 2>/dev/null || true) + if [[ -z "$coord_id" ]]; then + echo "ERROR: terraform output coordinator_app_id failed; run terraform apply at least once" >&2 + exit 1 + fi + CONSUL_BASE="https://${coord_id}-${COORDINATOR_HTTP_PORT}s.${GATEWAY_DOMAIN}" +} + +consul_members_alive() { + curl -sf "${CONSUL_BASE}/v1/agent/members" \ + | jq -r '[.[] | select(.Status==1)] | length' 2>/dev/null \ + || echo 0 +} + +consul_leader_present() { + local lead + lead=$(curl -sf "${CONSUL_BASE}/v1/status/leader" 2>/dev/null || echo '""') + [[ "$lead" != '""' && -n "$lead" ]] +} + +wait_for_quorum_healthy() { + local expected="$1" + local deadline=$(( $(date +%s) + HEALTH_TIMEOUT_SECONDS )) + while (( $(date +%s) < deadline )); do + local alive + alive=$(consul_members_alive) + if [[ "$alive" == "$expected" ]] && consul_leader_present; then + sleep "$MIN_READY_SECONDS" + # re-check after the cool-off + alive=$(consul_members_alive) + if [[ "$alive" == "$expected" ]] && consul_leader_present; then + return 0 + fi + fi + sleep 5 + done + return 1 +} + +snapshot_consul() { + local label="$1" + local snap_dir="snapshots" + mkdir -p "$snap_dir" + local f="$snap_dir/${label}-$(date +%Y%m%d-%H%M%S).snap" + if curl -sf -X PUT "${CONSUL_BASE}/v1/snapshot" -o "$f"; then + echo "snapshot saved: $f" + else + echo "WARN: snapshot save failed (continuing)" >&2 + fi +} + +# Transfer Raft leader off the named node if it's currently leader. +# No-op if some other node is leader. +maybe_transfer_leader() { + local current_node="$1" + local lead + lead=$(curl -sf "${CONSUL_BASE}/v1/status/leader" 2>/dev/null | jq -r .) + echo "current leader: $lead; this node: $current_node" + # Heuristic: if leader contains current_node's RPC port, transfer. + if [[ "$lead" == *":${current_node}"* ]]; then + echo "transferring leadership away from $current_node" + curl -sf -X POST "${CONSUL_BASE}/v1/operator/raft/transfer-leader" >/dev/null \ + || echo "WARN: leader transfer rejected (likely single-server cluster)" + sleep 5 + fi +} + +# ---------- Main ---------- + +resolve_consul_base +echo "Consul UI base: $CONSUL_BASE" + +EXPECTED=$(curl -sf "${CONSUL_BASE}/v1/agent/members" 2>/dev/null | jq -r 'length' || echo 0) +echo "current members alive: $(consul_members_alive) / $EXPECTED" + +if ! consul_leader_present; then + echo "ERROR: cluster has no leader; refusing to roll" >&2 + exit 1 +fi + +snapshot_consul "pre-rollout" + +if $PLAN_ONLY; then + terraform plan + exit 0 +fi + +# For now: a single `terraform apply` triggers Phala's in-place app +# update for every changed app. Until per-instance updates are +# available (phala-cloud#243), we can only gate at the app boundary. +# +# Apply order: workers first (stateless mostly; if one fails we still +# have the others), coordinator last (it's the Consul server, biggest +# blast radius). 
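+# Note: `-target=phala_app.worker` / `-target=phala_app.coordinator`
+# with no instance key addresses every for_each instance of that
+# resource, so one targeted apply still rolls all CVMs of that role.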
+ +APPS_TO_ROLL=() +if [[ -z "$APP_FILTER" || "$APP_FILTER" == "worker" ]]; then + APPS_TO_ROLL+=("phala_app.worker") +fi +if [[ -z "$APP_FILTER" || "$APP_FILTER" == "coordinator" ]]; then + APPS_TO_ROLL+=("phala_app.coordinator") +fi + +for app in "${APPS_TO_ROLL[@]}"; do + echo "=== applying ${app} ===" + terraform apply -auto-approve -target="${app}" + echo "=== waiting for cluster to settle ===" + if ! wait_for_quorum_healthy "$EXPECTED"; then + echo "ERROR: cluster did not return to all-alive within ${HEALTH_TIMEOUT_SECONDS}s after ${app}" >&2 + echo "snapshot saved at start of rollout; restore via consul snapshot restore" >&2 + exit 1 + fi + echo "=== ${app}: green ===" +done + +echo "rollout complete; final state:" +curl -sf "${CONSUL_BASE}/v1/agent/members" | jq -r '.[] | .Name + " " + (.Status|tostring)' diff --git a/consul-postgres-ha/cluster-example/terraform.tfvars.example b/consul-postgres-ha/cluster-example/terraform.tfvars.example new file mode 100644 index 0000000..f690c97 --- /dev/null +++ b/consul-postgres-ha/cluster-example/terraform.tfvars.example @@ -0,0 +1,27 @@ +# Copy to terraform.tfvars and fill in. +# +# Defaults below point at the GHCR-published, Sigstore-attested images +# produced by .github/workflows/consul-postgres-ha-publish.yml. Pin to a +# specific commit by replacing `:latest` with `:sha-` (40-char +# git sha) — preferred for prod since `:latest` floats. Verify a tag +# came from this repo with: +# +# gh attestation verify \ +# oci://ghcr.io/dstack-tee/dstack-examples/consul-postgres-ha-mesh-sidecar:latest \ +# --repo Dstack-TEE/dstack-examples +# +# For dev iteration, replace any single line with a `ttl.sh/...:24h` +# tag from `docker push ttl.sh/-$(date +%s):24h`. + +cluster_name = "demo" +coordinator_replicas = 3 +worker_replicas = 3 +gateway_domain = "dstack-pha-prod5.phala.network" + +mesh_sidecar_image = "ghcr.io/dstack-tee/dstack-examples/consul-postgres-ha-mesh-sidecar:latest" +webdemo_image = "ghcr.io/dstack-tee/dstack-examples/consul-postgres-ha-webdemo:latest" +patroni_image = "ghcr.io/dstack-tee/dstack-examples/consul-postgres-ha-patroni:latest" + +# external_coordinator_host = "" +# external_signaling_url = "http://:7000" +# external_turn_secret = "" diff --git a/consul-postgres-ha/compose/coordinator.yaml b/consul-postgres-ha/compose/coordinator.yaml new file mode 100644 index 0000000..b3fc3e0 --- /dev/null +++ b/consul-postgres-ha/compose/coordinator.yaml @@ -0,0 +1,74 @@ +# Coordinator compose template — Gap 2 single-sidecar layout. +# +# A coordinator CVM runs exactly one container: the consolidated +# `sidecar` image with ROLE=coordinator. Inside it: +# +# bootstrap-secrets one-shot init — derives per-CVM secrets from +# the dstack KMS, claims COORDINATOR_ORDINAL, +# writes /run/instance/info.json. +# mesh-conn QUIC-on-pion/ICE overlay (same as on workers). +# consul Server agent (`-server -bootstrap-expect=N -ui`), +# joins peer coordinators via mesh-conn-forwarded +# loopback ports. envoy is NOT started here +# (coordinators don't host a Connect-mTLS workload). +# +# coturn + signaling that earlier coordinator templates carried have +# been removed: the cluster uses an external (Vultr) coordinator box +# for both — see consul-postgres-ha/coordinator/docker-compose.yaml — +# configured into each peer's mesh-conn via SIGNALING_URL / TURN_HOST / +# TURN_SHARED_SECRET. The dstack-coordinator's local copies were never +# reachable from outside (Phala dstack apps don't have UDP ingress +# yet), so they were dead code burning CPU. 
When/if UDP ingress lands, +# re-adding them is one small PR. +# +# Per-CVM secrets policy: nothing on the persisted disk holds secret +# material. /run/secrets/* is tmpfs (gone on reboot, re-derived on +# next boot from getKey()). /consul/data IS persisted but only +# contains catalog, KV, and Raft state — no gossip key material. + +services: + sidecar: + image: ${MESH_SIDECAR_IMAGE} + network_mode: host + restart: on-failure + environment: + - ROLE=coordinator + - CLUSTER_NAME=${CLUSTER_NAME} + - PROTOCOL_BASES=${PROTOCOL_BASES} + - PEERS_JSON=${PEERS_JSON} + # COORDINATOR_ORDINAL is per-CVM (0..N-1); makes bootstrap-secrets + # write the right /run/instance/info.json without needing Consul + # KV (which itself runs on the coordinators — chicken-and-egg). + - COORDINATOR_ORDINAL=${COORDINATOR_ORDINAL} + - BOOTSTRAP_EXPECT=${BOOTSTRAP_EXPECT} + - COORDINATOR_SERF_PORTS=${COORDINATOR_SERF_PORTS} + # External coordinator path — coordinator's mesh-conn uses the + # same Vultr coturn + signaling that workers do, so peer-pair + # ICE rendezvous happens in a single shared place. + - SIGNALING_URL=${SIGNALING_URL} + - TURN_HOST=${TURN_HOST} + - TURN_SHARED_SECRET=${TURN_SHARED_SECRET} + # See worker.yaml for the rationale on MESH_CONN_RELAY_ONLY. + - MESH_CONN_RELAY_ONLY=${MESH_CONN_RELAY_ONLY:-} + volumes: + - /var/run/dstack.sock:/var/run/dstack.sock:ro + - /tmp/dstack-runtime/secrets:/run/secrets + - /tmp/dstack-runtime/instance:/run/instance + - consul-data:/consul/data + healthcheck: + test: ["CMD-SHELL", "test -s /run/instance/info.json"] + interval: 2s + timeout: 1s + retries: 60 + start_period: 5s + +volumes: + consul-data: + +# Shared state goes through HOST BIND MOUNTS, not named docker volumes +# — on the dstack platform we run on, named volumes don't share data +# across containers (the second container always sees an empty volume +# even after the first wrote to it; filed as a phala-cloud issue). +# Bind mounts to the CVM's /tmp work fine. /tmp is ephemeral inside +# the TEE; secrets are re-derived deterministically from getKey() on +# every boot, so the on-disk copy is effectively a per-boot cache. diff --git a/consul-postgres-ha/compose/worker.yaml b/consul-postgres-ha/compose/worker.yaml new file mode 100644 index 0000000..2a0e345 --- /dev/null +++ b/consul-postgres-ha/compose/worker.yaml @@ -0,0 +1,99 @@ +# Worker compose template — Gap 2 single-sidecar layout. +# +# A "worker" CVM ends up running exactly three containers: +# +# sidecar bundled platform plumbing — bootstrap-secrets, +# mesh-conn, consul (client), and envoy (Connect data +# plane). See consul-postgres-ha-sidecar's README. +# patroni the workload — Postgres + Patroni leader/replica. +# webdemo tiny example app sitting on the mesh; swap for your +# own service when adapting this template. +# +# Coordinator CVMs run their own compose (compose/coordinator.yaml) +# without patroni/webdemo, since they only host the Consul server +# quorum. 
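+#
+# When adapting this template, swap webdemo for your own service:
+# point WEBDEMO_IMAGE at your image and keep the read-only
+# /run/instance mount plus the `depends_on` service_healthy gate on
+# the sidecar so your container only starts once bootstrap-secrets
+# has written info.json.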
+ +services: + sidecar: + image: ${MESH_SIDECAR_IMAGE} + network_mode: host + restart: on-failure + environment: + - ROLE=worker + - CLUSTER_NAME=${CLUSTER_NAME} + - PROTOCOL_BASES=${PROTOCOL_BASES} + - WORKER_ORDINAL=${WORKER_ORDINAL} + - EXPECTED_REPLICAS=${EXPECTED_REPLICAS} + - PEERS_JSON=${PEERS_JSON} + - SIGNALING_URL=${SIGNALING_URL} + - TURN_HOST=${TURN_HOST} + - TURN_SHARED_SECRET=${TURN_SHARED_SECRET} + - COORDINATOR_SERF_PORTS=${COORDINATOR_SERF_PORTS} + # COORDINATOR_HTTP_PORTS is comma-separated; bootstrap-secrets + # picks the first reachable one to talk to Consul KV — only used + # if WORKER_ORDINAL is unset (legacy CAS-claim fallback). + - COORDINATOR_HTTP_PORTS=${COORDINATOR_HTTP_PORTS} + # MESH_CONN_RELAY_ONLY=1 forces ICE to gather only Relay + # candidates, routing all peer traffic through the coturn server. + # Default off because direct candidates work; flip on if a + # deployment hits worker↔worker direct-pair instability. + - MESH_CONN_RELAY_ONLY=${MESH_CONN_RELAY_ONLY:-} + volumes: + - /var/run/dstack.sock:/var/run/dstack.sock:ro + - /tmp/dstack-runtime/secrets:/run/secrets + - /tmp/dstack-runtime/instance:/run/instance + - consul-data:/consul/data + # Healthy = bootstrap-secrets has finished and info.json is in + # place. patroni/webdemo gate on this so they don't FATAL-restart + # in a loop while the sidecar is still booting. + healthcheck: + test: ["CMD-SHELL", "test -s /run/instance/info.json"] + interval: 2s + timeout: 1s + retries: 60 + start_period: 5s + + patroni: + image: ${PATRONI_IMAGE} + network_mode: host + restart: on-failure + # CLUSTER_NAME drives Patroni's `scope` — every peer's patroni + # must use the same value to land in the same cluster. The rest + # is read from /run/instance/info.json by entrypoint.sh. + environment: + - CLUSTER_NAME=${CLUSTER_NAME} + volumes: + - /tmp/dstack-runtime/instance:/run/instance:ro + - /tmp/dstack-runtime/secrets:/run/secrets:ro + - patroni-pgdata:/var/lib/patroni + depends_on: + sidecar: + condition: service_healthy + + webdemo: + image: ${WEBDEMO_IMAGE} + network_mode: host + restart: unless-stopped + entrypoint: ["/bin/sh", "-c"] + command: + - | + set -e + export PEER_ID=$$(jq -r '.role + "-" + (.ordinal|tostring)' /run/instance/info.json) + export WEBDEMO_PORT=$$(jq -r '.ports.webdemo' /run/instance/info.json) + export SIDECAR_PORT=$$(jq -r '.ports.sidecar_public' /run/instance/info.json) + export CONSUL_HTTP_ADDR=127.0.0.1:$$(jq -r '.ports.http_api' /run/instance/info.json) + exec webdemo + volumes: + - /tmp/dstack-runtime/instance:/run/instance:ro + depends_on: + sidecar: + condition: service_healthy + +volumes: + consul-data: + patroni-pgdata: + +# Shared state goes through HOST BIND MOUNTS, not named docker +# volumes — see coordinator.yaml for the full note. Short version: +# named docker volumes don't share data across containers on the +# dstack platform we run on. Bind mounts to /tmp work fine. diff --git a/consul-postgres-ha/coordinator/docker-compose.yaml b/consul-postgres-ha/coordinator/docker-compose.yaml new file mode 100644 index 0000000..555a968 --- /dev/null +++ b/consul-postgres-ha/coordinator/docker-compose.yaml @@ -0,0 +1,36 @@ +# Runs on the user-provided public-IP host (NOT a dstack CVM). +# Provides STUN/TURN for ICE traversal and (for Phase 0) a tiny signaling +# broker so the two test CVMs can swap ICE candidates. +# +# Image tags are deliberately unpinned during Phase 0 iteration; pin to +# sha256 digests before stage-1 deploy. 
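+#
+# coturn's --use-auth-secret mode expects REST-API-style ephemeral
+# credentials: username = "<unix-expiry>:<label>", password =
+# base64(HMAC-SHA1(secret, username)). mesh-conn derives these itself
+# (turnCreds in mesh-conn/main.go); a rough shell equivalent for
+# manually poking the server, assuming the same "meshconn" label:
+#
+#   exp=$(( $(date +%s) + 3600 ))
+#   user="${exp}:meshconn"
+#   pass=$(printf '%s' "$user" | \
+#     openssl dgst -sha1 -hmac "$TURN_SHARED_SECRET" -binary | base64)
+#   echo "turn user=$user pass=$pass"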
+ +services: + coturn: + image: coturn/coturn:4.6 + network_mode: host + restart: unless-stopped + command: + - -n + - --realm=dstack-mesh + - --listening-port=3478 + - --tls-listening-port=5349 + - --min-port=49152 + - --max-port=49999 + - --use-auth-secret + - --static-auth-secret=${TURN_SHARED_SECRET} + - --no-cli + - --no-multicast-peers + - --log-file=stdout + - --simple-log + - --fingerprint + + signaling: + image: golang:1.22-alpine + restart: unless-stopped + working_dir: /src + volumes: + - ../phase0/icetest:/src + command: ["sh", "-c", "cd /src && go mod download && go run . -mode=signaling -addr=:7000"] + ports: + - "7000:7000" diff --git a/consul-postgres-ha/design/README.md b/consul-postgres-ha/design/README.md new file mode 100644 index 0000000..f258560 --- /dev/null +++ b/consul-postgres-ha/design/README.md @@ -0,0 +1,28 @@ +# Design docs — open work, intentionally separate from the user-facing docs + +This directory holds design briefs for **planned but not-yet-implemented** +work on `consul-postgres-ha`. Each doc is structured so an agent (or a +person) can pick it up cold and start implementing. + +The user-facing docs (`README.md`, `ARCHITECTURE.md`, `FAILOVER.md`, +`PUBLISHING.md`, `ROBUSTNESS.md`) describe what's *shipping today*. +This directory describes what's *next*. They're intentionally +separated so a user landing on the example doesn't get a roadmap +in their face. + +| Doc | What | +|---|---| +| [`attestation-admission.md`](attestation-admission.md) | Use dstack TEE attestation as the mesh-conn admission credential, replacing/augmenting the shared TURN HMAC. Phased plan: per-app-id first, Consul-KV-rooted policy later. | + +Each doc includes: + +- The current state and why it falls short +- What "done" looks like +- A concrete approach with a code/structure sketch +- Risks + mitigations +- Open questions for the implementing agent +- Success criteria +- Hand-off instructions + +When a doc's work lands, delete the doc (the implementation + the +user-facing docs are the surviving artifacts). diff --git a/consul-postgres-ha/design/attestation-admission.md b/consul-postgres-ha/design/attestation-admission.md new file mode 100644 index 0000000..c4fded2 --- /dev/null +++ b/consul-postgres-ha/design/attestation-admission.md @@ -0,0 +1,253 @@ +# Design: TEE attestation as the mesh admission credential + +**Status**: not started. Largest of the three open architectural +gaps. Worth starting with a design discussion (verify dstack SDK API +shape, confirm policy choice) before writing code. Branch off +`dstack-consul-ha-db`, PR back into it. + +## Why + +The whole point of running on dstack is that each CVM can produce a +hardware-attested measurement of what's executing inside it. Right +now mesh-conn doesn't *use* that — peer admission is gated by: + +- Holding the TURN HMAC secret (same on every peer in the cluster, + derived from dstack KMS by `bootstrap-secrets`). +- Completing pion/ICE handshake. +- Completing QUIC TLS handshake (self-signed cert, no peer-cert + verification — `InsecureSkipVerify: true`). + +A peer that **exfiltrates the TURN HMAC** can rejoin the mesh from +anywhere. A peer running a **rolled-back or compromised image** can +rejoin too — nobody asks "what are you running?" before admitting +the connection. That's a meaningful gap for a TEE-rooted system. + +## Goal + +Each peer's mesh-conn admission decision is gated on a fresh dstack +attestation that: + +1. 
**Signs a binding** between the peer's identity (peer-id, ICE + credentials, QUIC cert public key, …) and the TEE measurement. +2. **Chains to dstack's KMS root**, so we can verify off-chain that + it really came from a dstack CVM. +3. **Matches a policy** the cluster has agreed on (more on policy + below). + +A peer that can't produce such an attestation is rejected at the +QUIC handshake or first-stream layer, with a clear error. + +## Non-goals (for the first pass) + +- Replacing Consul Connect's mTLS at Layer 3. Consul intentions + govern *service-to-service* auth and stay as-is. This work governs + *peer-to-peer mesh admission* — a layer below. The two are + orthogonal. +- Replacing the TURN HMAC for *coturn auth*. Coturn still wants its + shared-secret. We're adding a **second** check at mesh-conn admit + time, not replacing the first. +- Periodic re-attestation. Phase 1 is "fresh attestation at each new + link establishment". A peer that rotates mid-session is out of + scope until phase 2. + +## Where attestation flows in the protocol + +Two natural insertion points: + +| Where | Pros | Cons | +|---|---|---| +| **(a) During ICE auth exchange via signaling broker** | Earliest possible reject. No ICE NAT-mapping wasted on rejected peers. | Signaling broker is *public* — attestation is exposed to anyone polling the broker. May be acceptable if attestations don't reveal sensitive state. | +| **(b) After QUIC handshake, as a "hello" message before the first user stream** | Private (encrypted under QUIC's TLS). Cleaner separation: ICE/QUIC stays oblivious, attestation is an application-layer concern. | A rejected peer wastes one ICE handshake + QUIC handshake. Fine in practice. | + +**Recommendation: (b).** The privacy benefit outweighs the +extra-handshake cost. Concrete shape: + +1. Both sides establish QUIC (existing flow). +2. Each side immediately opens a dedicated stream tagged + `streamAttest = 0xAA` (next free tag after `streamUDP=0x55`, + `streamTCP=0x33`). +3. Each side writes its attestation (a length-prefixed blob) to that + stream and closes its write half. +4. Each side reads the peer's attestation, verifies it against the + policy, and either: + - On accept: starts the existing `runAcceptLoop` / + `OpenStreamSync` flow. + - On reject: closes the QUIC connection with a documented error + code, and `runPeerLink` retries after backoff (no different + from any other failed handshake). + +The 3-byte stream header + tagging convention extends naturally; +nothing else in the wire format changes. + +## Policy choice — three candidates + +### (1) Per-image-digest allowlist + +The cluster admits peers running images whose digests match an +allowlist hardcoded into `bootstrap-secrets` or pulled from Consul KV. + +Pro: tightest. A leaked TURN HMAC alone can't get you in. + +Con: rolling upgrades require careful sequencing. While CVM-A is on +digest `D1` and CVM-B is on digest `D2`, they need to admit each +other. Either the allowlist always carries N+M digests during the +upgrade window, or the upgrade procedure pauses traffic between +not-yet-upgraded peers — both annoying. + +### (2) Per-app-id signature + +The cluster admits any peer whose attestation binds to the **same +dstack app-id** as our own. Identity = app-id; image-digest is not +checked. + +Pro: rolling upgrades trivial — N+1 image is still under the same +app-id, so peers admit each other unchanged. Simple to implement +(app-id is already in `/run/instance/info.json`). 
+ +Con: a malicious image deployed under the same app-id (by whoever +controls the dstack-app deploy keys) can join. The TEE proves +"running in this app" but not "running this *binary*". + +### (3) Consul-KV-rooted policy + +The admission policy is a signed document stored in Consul KV under +e.g. `cluster//admission-policy`, signed by a key derived from +dstack KMS at cluster bootstrap. The document lists allowed +image-digests + a signature scheme for rotation. + +Pro: most expressive. Supports rolling upgrades (write a new policy +listing both digests, peers re-evaluate, after upgrade the old digest +is removed). Supports revocation (write a deny-list). + +Con: most complex. Bootstrapping the signing key safely is tricky +(if an attacker reaches Consul KV they can rewrite the policy). + +### Recommendation + +**Phase 1: per-app-id (option 2).** It's the smallest delta from +where we are, gives a meaningful security improvement (compromise of +TURN HMAC alone no longer admits arbitrary outsiders — they'd have +to also be inside *this* dstack-app), and doesn't fight rolling +upgrades. Document explicitly that this is "trust the deploy key, +not the image". + +**Phase 2: layer in image-digest verification with a policy doc in +Consul KV** (option 3) once we have someone driving the +deploy-time-signing story. + +Do **not** start with per-image-digest hardcoding (option 1) — the +upgrade pain bites immediately and there's no path forward. + +## Implementation phases + +### Phase 0 — plumbing (no enforcement) + +- Each peer fetches its attestation at startup via the dstack SDK. +- Add the attest-stream exchange (`streamAttest=0xAA` + length-prefix). +- Both sides log "got peer attestation, would accept" but admit + unconditionally. +- Adds an observability foothold without breaking anything. + +### Phase 1 — per-app-id enforcement + +- Each peer's attestation includes its app-id. +- Verify: signature chains to dstack KMS root, app-id matches our + own. +- Reject + log on mismatch. Add a regression test that constructs a + fake attestation with a wrong app-id and asserts rejection. + +### Phase 2 — Consul-KV admission policy + +- Coordinator-side: a small tool that signs a policy doc and writes + it to Consul KV. +- Peer-side: pull the policy doc on link admission, verify + signature, check peer's image-digest against the allowed list. +- Rolling-upgrade story: operator writes a new policy listing both + digests, applies cluster-wide image bump, then writes a policy + removing the old digest. + +### Phase 3 — re-attestation on link redial + +- The stream-of-attestation exchange runs every time `dialAndPump` + re-establishes a link, not just once at peer-id discovery. +- Already implicit in phase 1 (the exchange is per-handshake), but + worth listing because it means revocation propagates within + ~minute timescales, not "until this connection drops naturally". + +## Open questions for the design discussion + +1. **What's the actual dstack SDK API for fetching an attestation / + quote?** The user has worked with `dstack.NewDstackClient().Info()` + and `client.GetKey()` — assume there's an analogous + `client.GetQuote()` or `client.Attest()` but verify against the + SDK source. Determines the binding shape (what the attestation + commits to: nonce, peer-id, public key, …). + +2. **Attestation size + verification cost.** Dstack quotes are + typically a few KB and Verify is a few ms. If both are larger than + that, the attest exchange becomes a noticeable handshake-latency + tax. 
Worth measuring early. + +3. **What does the attestation actually bind to?** Possible + bindings: + - peer-id (our string, e.g. `worker-3`) — easy to spoof on its own + - QUIC cert public key — ties the attestation to *this* TLS + handshake. Best. + - Nonce from the peer — prevents replay across handshakes. Add to + the dialer's auth blob and have the dialee bind to it. + The right answer is probably "QUIC cert pubkey + a per-handshake + nonce", binding both the identity and the freshness. + +4. **Bootstrap chicken-and-egg.** First peer to come up has nobody + to attest to. How does the cluster bootstrap when *every* peer + needs every other peer's attestation? Two answers: + - Coordinators come up first; admit only if peer's + attestation is valid; coordinators admit each other via a + genesis attestation whose policy is "any peer in our app-id". + - Or: same code, no special-case — just per-app-id from the + start. + +5. **Failure observability.** What's the log shape when admission + fails? Operators need to see "rejected peer X because + app_id=Y didn't match expected=Z" — not just "link failed". + New error type + structured log line. + +6. **Interaction with the planned single-sidecar consolidation + (Gap 2).** Attestation lookup happens inside mesh-conn, so it + stays with mesh-conn whether mesh-conn is a separate container or + one process inside the consolidated sidecar. Gap 2 should land + first; Gap 3 is easier when the platform plumbing lives in one + place. + +## Success criteria + +- [ ] Each peer fetches a valid dstack attestation at startup. +- [ ] Peer-pair handshake includes attest-stream exchange. +- [ ] Verify signature chains to dstack KMS root. +- [ ] Reject peer with mismatched app-id; admit peer with matching + app-id. +- [ ] Log shape clearly distinguishes admission-reject from other + handshake errors. +- [ ] Failover demos (FAILOVER.md) still pass — RTO unchanged within + noise. +- [ ] A new doc, `consul-postgres-ha/ATTESTATION.md`, explains the + threat model + policy + how to inspect attestations on a + running cluster. + +## Risks + mitigations + +| Risk | Mitigation | +|---|---| +| Attestation API not available on the dstack SDK we're using | Verify in the design-discussion phase before writing code. If missing, the right path is "land Gap 2 first, then file an SDK feature request, then revisit". | +| Verification is slow enough to become a handshake bottleneck | Cache valid peer-attestations for the lifetime of the QUIC connection (don't re-verify on each stream). Measure once before deciding mitigation is needed. | +| Per-app-id is too loose for the user's threat model | Document the limitation in `ATTESTATION.md` and ship Phase 2 (Consul-KV policy) as the next iteration. Don't perfect-is-the-enemy-of-good Phase 1. | +| Bootstrap deadlock — every peer waits for every other | Per-app-id avoids this entirely (no shared trust root needed beyond dstack KMS, which every CVM has). Phase 2 needs explicit thought; not a Phase 1 concern. | + +## Hand-off + +Worth at least a design discussion before writing code (the user +flagged this as "a large topic, breakout session"). Specifically: +verify dstack SDK API, confirm per-app-id is the right Phase 1 +policy, decide on (a) ICE-auth vs (b) post-QUIC stream as the +exchange point. Then implementation is a focused ~300-LoC change in +mesh-conn plus a new doc. 
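+
+## Appendix: illustrative attest-stream sketch
+
+A minimal sketch of insertion point (b), the post-QUIC hello exchange,
+under the assumptions above. `exchangeAttestations` and
+`verifyPeerAttestation` are invented names: the verifier is a
+hypothetical stand-in for whatever the dstack SDK actually exposes
+(open question 1), and `mine` is the blob fetched at startup (Phase 0
+plumbing). Framing reuses mesh-conn's existing 3-byte stream header
+plus a length prefix; imports are the ones main.go already uses
+(context, encoding/binary, fmt, io, quic-go).
+
+```go
+const streamAttest byte = 0xAA // next free tag after streamUDP=0x55, streamTCP=0x33
+
+// exchangeAttestations runs right after the QUIC handshake, before any
+// user stream. On reject it returns an error; the caller closes the
+// connection and lets runPeerLink's existing backoff handle the retry.
+func exchangeAttestations(ctx context.Context, qconn *quic.Conn, mine []byte) error {
+	s, err := qconn.OpenStreamSync(ctx)
+	if err != nil {
+		return fmt.Errorf("open attest stream: %w", err)
+	}
+	// 3-byte header (tag + two unused port bytes), then a uint32 length
+	// prefix and the blob; close our write half so the peer sees EOF
+	// after the blob.
+	hdr := make([]byte, 3+4)
+	hdr[0] = streamAttest
+	binary.BigEndian.PutUint32(hdr[3:], uint32(len(mine)))
+	if _, err := s.Write(append(hdr, mine...)); err != nil {
+		return fmt.Errorf("send attestation: %w", err)
+	}
+	s.Close() // closes only the send side; we never read from this stream
+
+	// Read the peer's blob from the stream it opened toward us.
+	ps, err := qconn.AcceptStream(ctx)
+	if err != nil {
+		return fmt.Errorf("accept attest stream: %w", err)
+	}
+	peerHdr := make([]byte, 3+4)
+	if _, err := io.ReadFull(ps, peerHdr); err != nil {
+		return fmt.Errorf("read attest header: %w", err)
+	}
+	if peerHdr[0] != streamAttest {
+		return fmt.Errorf("expected attest stream, got tag 0x%x", peerHdr[0])
+	}
+	blob := make([]byte, binary.BigEndian.Uint32(peerHdr[3:]))
+	if _, err := io.ReadFull(ps, blob); err != nil {
+		return fmt.Errorf("read attestation: %w", err)
+	}
+	// Phase 1 policy: signature chains to the dstack KMS root and the
+	// peer's app-id equals our own (hypothetical verifier).
+	return verifyPeerAttestation(blob)
+}
+```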
diff --git a/consul-postgres-ha/mesh-conn/go.mod b/consul-postgres-ha/mesh-conn/go.mod new file mode 100644 index 0000000..b804372 --- /dev/null +++ b/consul-postgres-ha/mesh-conn/go.mod @@ -0,0 +1,26 @@ +module github.com/Dstack-TEE/dstack-examples/consul-postgres-ha/mesh-conn + +go 1.24 + +require ( + github.com/pion/ice/v2 v2.3.25 + github.com/pion/stun v0.6.1 + github.com/quic-go/quic-go v0.59.0 +) + +require ( + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/google/uuid v1.3.1 // indirect + github.com/pion/dtls/v2 v2.2.7 // indirect + github.com/pion/logging v0.2.2 // indirect + github.com/pion/mdns v0.0.12 // indirect + github.com/pion/randutil v0.1.0 // indirect + github.com/pion/transport/v2 v2.2.2 // indirect + github.com/pion/turn/v2 v2.1.3 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/stretchr/testify v1.11.1 // indirect + golang.org/x/crypto v0.41.0 // indirect + golang.org/x/net v0.43.0 // indirect + golang.org/x/sys v0.35.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect +) diff --git a/consul-postgres-ha/mesh-conn/go.sum b/consul-postgres-ha/mesh-conn/go.sum new file mode 100644 index 0000000..9f7898e --- /dev/null +++ b/consul-postgres-ha/mesh-conn/go.sum @@ -0,0 +1,114 @@ +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/google/uuid v1.3.1 h1:KjJaJ9iWZ3jOFZIf1Lqf4laDRCasjl0BCmnEGxkdLb4= +github.com/google/uuid v1.3.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/pion/dtls/v2 v2.2.7 h1:cSUBsETxepsCSFSxC3mc/aDo14qQLMSL+O6IjG28yV8= +github.com/pion/dtls/v2 v2.2.7/go.mod h1:8WiMkebSHFD0T+dIU+UeBaoV7kDhOW5oDCzZ7WZ/F9s= +github.com/pion/ice/v2 v2.3.25 h1:M5rJA07dqhi3nobJIg+uPtcVjFECTrhcR3n0ns8kDZs= +github.com/pion/ice/v2 v2.3.25/go.mod h1:KXJJcZK7E8WzrBEYnV4UtqEZsGeWfHxsNqhVcVvgjxw= +github.com/pion/logging v0.2.2 h1:M9+AIj/+pxNsDfAT64+MAVgJO0rsyLnoJKCqf//DoeY= +github.com/pion/logging v0.2.2/go.mod h1:k0/tDVsRCX2Mb2ZEmTqNa7CWsQPc+YYCB7Q+5pahoms= +github.com/pion/mdns v0.0.12 h1:CiMYlY+O0azojWDmxdNr7ADGrnZ+V6Ilfner+6mSVK8= +github.com/pion/mdns v0.0.12/go.mod h1:VExJjv8to/6Wqm1FXK+Ii/Z9tsVk/F5sD/N70cnYFbk= +github.com/pion/randutil v0.1.0 h1:CFG1UdESneORglEsnimhUjf33Rwjubwj6xfiOXBa3mA= +github.com/pion/randutil v0.1.0/go.mod h1:XcJrSMMbbMRhASFVOlj/5hQial/Y8oH/HVo7TBZq+j8= +github.com/pion/stun v0.6.1 h1:8lp6YejULeHBF8NmV8e2787BogQhduZugh5PdhDyyN4= +github.com/pion/stun v0.6.1/go.mod h1:/hO7APkX4hZKu/D0f2lHzNyvdkTGtIy3NDmLR7kSz/8= +github.com/pion/transport/v2 v2.2.1/go.mod h1:cXXWavvCnFF6McHTft3DWS9iic2Mftcz1Aq29pGcU5g= +github.com/pion/transport/v2 v2.2.2 h1:yv+EKSU2dpmInuCebQ1rsBFCYL7p+aV90xIlshSBO+A= +github.com/pion/transport/v2 v2.2.2/go.mod h1:OJg3ojoBJopjEeECq2yJdXH9YVrUJ1uQ++NjXLOUorc= +github.com/pion/transport/v3 v3.0.1 h1:gDTlPJwROfSfz6QfSi0ZmeCSkFcnWWiiR9ES0ouANiM= +github.com/pion/transport/v3 v3.0.1/go.mod 
h1:UY7kiITrlMv7/IKgd5eTUcaahZx5oUN3l9SzK5f5xE0= +github.com/pion/turn/v2 v2.1.3 h1:pYxTVWG2gpC97opdRc5IGsQ1lJ9O/IlNhkzj7MMrGAA= +github.com/pion/turn/v2 v2.1.3/go.mod h1:huEpByKKHix2/b9kmTAM3YoX6MKP+/D//0ClgUYR2fY= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/quic-go/quic-go v0.59.0 h1:OLJkp1Mlm/aS7dpKgTc6cnpynnD2Xg7C1pwL6vy/SAw= +github.com/quic-go/quic-go v0.59.0/go.mod h1:upnsH4Ju1YkqpLXC305eW3yDZ4NfnNbmQRCMWS58IKU= +github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= +github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.3/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +go.uber.org/mock v0.5.2 h1:LbtPTcP8A5k9WPXj54PPPbjcI4Y6lhyOZXn+VS7wNko= +go.uber.org/mock v0.5.2/go.mod h1:wLlUxC2vVTPTaE3UD51E0BGOAElKrILxhVSDYQLld5o= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.8.0/go.mod h1:mRqEX+O9/h5TFCrQhkgjo2yKi0yYA+9ecGkdQoHrywE= +golang.org/x/crypto v0.11.0/go.mod h1:xgJhtzW8F9jGdVFWZESrid1U1bjeNy4zgy5cRr/CIio= +golang.org/x/crypto v0.12.0/go.mod h1:NF0Gs7EO5K4qLn+Ylc+fih8BSTeIjAP05siRnAh98yw= +golang.org/x/crypto v0.18.0/go.mod h1:R0j02AL6hcrfOiy9T4ZYp/rcWeMxM3L6QYxlOuEG1mg= +golang.org/x/crypto v0.41.0 h1:WKYxWedPGCTVVl5+WHSSrOBT0O8lx32+zxmHxijgXp4= +golang.org/x/crypto v0.41.0/go.mod h1:pO5AFd7FA68rFak7rOAGVuygIISepHftHnr8dr6+sUc= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= +golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/net v0.13.0/go.mod h1:zEVYFnQC7m/vmpQFELhcD1EWkZlX69l4oqgmer6hfKA= +golang.org/x/net v0.14.0/go.mod h1:PpSgVXXLK0OxS0F31C1/tv6XNguvCrnXIDrFMspZIUI= +golang.org/x/net v0.20.0/go.mod h1:z8BVo6PvndSri0LbOE3hAn0apkU+1YvI6E70E9jsnvY= +golang.org/x/net v0.43.0 
h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= +golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.9.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= +golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= +golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= +golang.org/x/term v0.10.0/go.mod h1:lpqdcUyK/oCiQxvxVrppt5ggO2KCZ5QblwqPnfZ6d5o= +golang.org/x/term v0.11.0/go.mod h1:zC9APTIj3jG3FdV/Ons+XE1riIZXG4aZ4GTHiPZJPIU= +golang.org/x/term v0.16.0/go.mod h1:yn7UURbUtPyrVJPGPq404EukNFxcm/foM+bV/bfcDsY= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.11.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/text v0.12.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/consul-postgres-ha/mesh-conn/main.go b/consul-postgres-ha/mesh-conn/main.go new file mode 100644 index 0000000..2284408 --- /dev/null +++ b/consul-postgres-ha/mesh-conn/main.go @@ -0,0 +1,1032 @@ +// mesh-conn — userspace UDP port-forwarding agent over pion/ice. +// +// Replaces the earlier TUN-based version. The TUN approach worked but +// gave us a virtual L3 overlay we never really needed: our apps (Consul +// gossip, simple HTTP services) just want a stable peer address they can +// send UDP to. +// +// Naming convention used by the whole cluster: +// each peer declares a list of "identity ports" — one per protocol. +// For a Consul deployment that's typically four: +// index 0 = serf_lan (UDP+TCP), 1 = server-RPC (TCP), +// index 2 = HTTP API (TCP), 3 = gRPC/xDS (TCP) +// +// On every peer's host: +// - the local app binds 127.0.0.1: for protocol i +// - mesh-conn binds 127.0.0.1: for every OTHER peer +// and every protocol i +// - apps reach peer X on protocol i by sending UDP/TCP to +// 127.0.0.1: +// +// All N peer-pair connections multiplex over one pion/ice connection +// per pair, wrapped in QUIC. Each QUIC stream's first three bytes are +// (tag, port-as-uint16-big-endian) where port is the receiver's own +// identity port — the receiver looks it up in self.ports and dispatches +// to the matching local UDP socket / dials the matching local TCP +// service. +// +// Why QUIC and not yamux: yamux assumes a reliable byte-stream underlay, +// but pion/ice.Conn is UDP — and the UDP path between dstack worker +// CVMs is extremely lossy under sustained load (hairpinning the same +// public IP loses ~99% of bulk packets, coturn-relay loses ~78%). +// yamux's keepalive/recv-window invariants then trip and the user- +// visible error is "keepalive timeout" or "recv window exceeded", but +// the root cause is dropped packets. QUIC has built-in loss recovery, +// congestion control, and stream-multiplexing — it's exactly what a +// lossy UDP underlay needs. The previous yamux build died at 3KB-260KB +// depending on path; the QUIC build sustains 25-28 MB/s on the same +// hairpin. 
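+//
+// Illustrative PEERS_JSON for a two-peer cluster (values made up; the
+// real list is computed by cluster.tf):
+//
+//   [
+//     {"id": "coordinator-0", "ports": [18000, 18001, 18002, 18003]},
+//     {"id": "worker-0",      "ports": [18010, 18011, 18012, 18013]}
+//   ]
+//
+// With that config, a process on worker-0 reaches coordinator-0's
+// serf_lan (slot 0) at 127.0.0.1:18000 and its HTTP API (slot 2) at
+// 127.0.0.1:18002; coordinator-0 reaches worker-0's slots the same
+// way on 18010..18013.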
+ +package main + +import ( + "context" + "crypto/ecdsa" + "crypto/elliptic" + "crypto/hmac" + "crypto/rand" + "crypto/sha1" + "crypto/tls" + "crypto/x509" + "crypto/x509/pkix" + "encoding/base64" + "encoding/binary" + "encoding/json" + "flag" + "fmt" + "io" + "log" + "math/big" + "net" + "net/http" + "net/url" + "os" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/pion/ice/v2" + "github.com/pion/stun" + "github.com/quic-go/quic-go" +) + +// ============================================================================= +// config +// ============================================================================= + +type Peer struct { + ID string `json:"id"` + Ports []int `json:"ports"` +} + +// hasPort returns the index of port in p.Ports, or -1 if absent. +func (p *Peer) hasPort(port int) int { + for i, q := range p.Ports { + if q == port { + return i + } + } + return -1 +} + +type Config struct { + SelfID string + Peers []Peer + SignalingURL string + TurnHost string + TurnSecret string +} + +func loadConfig() *Config { + // Stage-4 sources of truth, with fallback to stage-1 envs so this + // binary is back-compatible with the older deploy shape: + // + // - SELF identity comes from /run/instance/info.json written by + // the bootstrap-secrets init container (which read it from the + // dstack SDK's Info() call). Falls back to PEER_ID env. + // - TURN_SHARED_SECRET comes from /run/secrets/turn (a hex blob + // written by bootstrap-secrets via getKey()). Falls back to + // the env value if the file isn't present. + // - PEERS_JSON still comes via env — cluster.tf computes it from + // the `replicas` count and re-applies on topology change, + // which propagates to every CVM via Phala's in-place compose + // update path (verified in disk-persistence shakedown). + + cfg := &Config{ + SelfID: readSelfID(), + SignalingURL: strings.TrimRight(mustEnv("SIGNALING_URL"), "/"), + TurnHost: os.Getenv("TURN_HOST"), + TurnSecret: readTurnSecret(), + } + if err := json.Unmarshal([]byte(mustEnv("PEERS_JSON")), &cfg.Peers); err != nil { + log.Fatalf("PEERS_JSON: %v", err) + } + if err := validatePeers(cfg); err != nil { + log.Fatalf("PEERS_JSON: %v", err) + } + return cfg +} + +// readSelfID prefers /run/instance/info.json (stage-4) over PEER_ID env +// (stage-1 compat). The JSON is written by bootstrap-secrets and gives +// us a per-CVM identifier rooted in the platform. +func readSelfID() string { + if b, err := os.ReadFile("/run/instance/info.json"); err == nil { + var info struct { + Role string `json:"role"` + Ordinal int `json:"ordinal"` + } + if jerr := json.Unmarshal(b, &info); jerr == nil && info.Role != "" { + id := fmt.Sprintf("%s-%d", info.Role, info.Ordinal) + log.Printf("self identity from /run/instance/info.json: %s", id) + return id + } + log.Printf("WARN /run/instance/info.json present but unparseable; falling back to PEER_ID env: %v", err) + } + return mustEnv("PEER_ID") +} + +// readTurnSecret resolves the TURN shared secret in priority order: +// +// 1. TURN_SHARED_SECRET env (set when using an external coturn whose +// static-auth-secret was configured out-of-band — e.g. the Vultr +// coordinator path). When this is present it MUST win, because +// the local TEE-derived value won't match what coturn is checking +// against. +// 2. /run/secrets/turn (stage-4 TEE-derived path; matches the +// embedded coordinator's coturn which reads the same file). +// +// Order matters: env beats file so that "use external coturn" can be +// configured purely at the cluster.tf layer. 
+func readTurnSecret() string { + if v := os.Getenv("TURN_SHARED_SECRET"); v != "" { + log.Printf("turn shared secret loaded from TURN_SHARED_SECRET env (%d bytes)", len(v)) + return v + } + if b, err := os.ReadFile("/run/secrets/turn"); err == nil { + s := strings.TrimSpace(string(b)) + if s != "" { + log.Printf("turn shared secret loaded from /run/secrets/turn (%d bytes)", len(s)) + return s + } + } + return "" +} + +// validatePeers fails fast on any silent mis-configuration that would +// otherwise manifest as confusing runtime failures: collided ports, +// missing self, mismatched port-list lengths, etc. Bound at startup +// because a peer's PEERS_JSON is shared with every other peer's +// configuration and must round-trip identically across the cluster. +func validatePeers(cfg *Config) error { + if len(cfg.Peers) < 2 { + return fmt.Errorf("need at least 2 peers in PEERS_JSON, got %d", len(cfg.Peers)) + } + + seenIDs := map[string]bool{} + allPorts := map[int]string{} // port -> peer.ID owning it (for collision detection) + expectedPortCount := -1 + selfFound := false + + for i, p := range cfg.Peers { + if p.ID == "" { + return fmt.Errorf("peer[%d] has empty id", i) + } + if seenIDs[p.ID] { + return fmt.Errorf("peer id %q appears twice in PEERS_JSON", p.ID) + } + seenIDs[p.ID] = true + if p.ID == cfg.SelfID { + selfFound = true + } + + if len(p.Ports) == 0 { + return fmt.Errorf("peer %q has empty Ports list", p.ID) + } + if expectedPortCount < 0 { + expectedPortCount = len(p.Ports) + } else if len(p.Ports) != expectedPortCount { + return fmt.Errorf("peer %q has %d ports, expected %d (every peer's port-list must have the same length — index i is the same protocol slot across peers)", + p.ID, len(p.Ports), expectedPortCount) + } + + // Each port must be unique cluster-wide: mesh-conn binds OTHER + // peers' ports on 127.0.0.1, so two peers can't share a port + // number or one would shadow the other. + seenSelf := map[int]bool{} + for j, port := range p.Ports { + if port <= 0 || port > 65535 { + return fmt.Errorf("peer %q ports[%d]=%d is out of range", p.ID, j, port) + } + if seenSelf[port] { + return fmt.Errorf("peer %q has duplicate port %d in its own Ports list", p.ID, port) + } + seenSelf[port] = true + if owner, ok := allPorts[port]; ok { + return fmt.Errorf("port %d is claimed by both peer %q and peer %q — every identity port must be globally unique", + port, owner, p.ID) + } + allPorts[port] = p.ID + } + } + + if !selfFound { + return fmt.Errorf("PEER_ID %q not in PEERS_JSON (peers: %v)", cfg.SelfID, knownIDs(cfg.Peers)) + } + + // Log a digest of the validated config so operators can check that + // every peer in the cluster sees the same PEERS_JSON. Differences + // across peers would indicate a deploy-script discrepancy. + digest := peersDigest(cfg.Peers) + log.Printf("PEERS_JSON validated: %d peers, %d ports each, digest=%s", + len(cfg.Peers), expectedPortCount, digest) + return nil +} + +func knownIDs(peers []Peer) []string { + ids := make([]string, 0, len(peers)) + for _, p := range peers { + ids = append(ids, p.ID) + } + return ids +} + +// peersDigest is a short stable hash of the canonical PEERS_JSON used +// only to make config-drift diagnosable across peers' logs. +func peersDigest(peers []Peer) string { + keys := make([]string, len(peers)) + for i, p := range peers { + keys[i] = p.ID + } + // Stable sort by ID so a re-ordered PEERS_JSON gives the same digest. + // Then encode as a deterministic string. 
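+	// e.g. peers ctrl{18000,18100} and w1{18001,18101} canonicalise to
+	// "ctrl:18000,18100,|w1:18001,18101,|" regardless of list order.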
+ for i := 1; i < len(keys); i++ { + for j := i; j > 0 && keys[j] < keys[j-1]; j-- { + keys[j], keys[j-1] = keys[j-1], keys[j] + } + } + var buf strings.Builder + for _, id := range keys { + buf.WriteString(id) + buf.WriteByte(':') + // find peer + for _, p := range peers { + if p.ID == id { + for _, port := range p.Ports { + fmt.Fprintf(&buf, "%d,", port) + } + break + } + } + buf.WriteByte('|') + } + h := sha1.Sum([]byte(buf.String())) + return base64.RawStdEncoding.EncodeToString(h[:])[:12] +} + +func (c *Config) peerByID(id string) *Peer { + for i := range c.Peers { + if c.Peers[i].ID == id { + return &c.Peers[i] + } + } + return nil +} + +func mustEnv(k string) string { + v := os.Getenv(k) + if v == "" { + log.Fatalf("missing env %s", k) + } + return v +} + +// ============================================================================= +// main +// ============================================================================= + +func main() { + flag.Parse() + cfg := loadConfig() + self := cfg.peerByID(cfg.SelfID) + + others := make([]Peer, 0, len(cfg.Peers)-1) + for _, p := range cfg.Peers { + if p.ID != cfg.SelfID { + others = append(others, p) + } + } + log.Printf("mesh-conn: self=%s ports=%v other=%d", cfg.SelfID, self.Ports, len(others)) + + go pollLoop(cfg) + + var wg sync.WaitGroup + for _, p := range others { + wg.Add(1) + go func(p Peer) { + defer wg.Done() + runPeerLink(cfg, *self, p) + }(p) + } + wg.Wait() + log.Printf("all peer links exited") +} + +// ============================================================================= +// per-peer link: ICE conn + bound UDP socket on peer's identity port +// ============================================================================= + +func runPeerLink(cfg *Config, self, peer Peer) { + for { + if err := dialAndPump(cfg, self, peer); err != nil { + log.Printf("[%s] link failed: %v — retrying in 5s", peer.ID, err) + time.Sleep(5 * time.Second) + continue + } + // dialAndPump returns nil only when the conn closed cleanly. + log.Printf("[%s] link closed — reconnecting", peer.ID) + } +} + +// Stream header layout: 3 bytes per stream open. +// byte 0 = tag (streamUDP or streamTCP) +// bytes 1-2 = receiver-side port (big-endian uint16) — the port number +// the receiver itself binds locally; receiver looks it up in its own +// Ports list to find the index/protocol slot +const ( + streamUDP byte = 0x55 // long-lived per-port UDP datagram pipe + streamTCP byte = 0x33 // per-conn TCP byte-stream forwarder +) + +// quicConfig is shared by client and server. We give QUIC large windows +// so a pg_basebackup stream (sustained 100s of MB) doesn't stall on +// flow-control updates: a single InitialConnectionReceiveWindow of 8 MiB +// lets the sender push a chunk that big before needing an ACK from us. +// MaxIdleTimeout is what we use to detect a dead link — if no packet +// arrives in this long, the conn errors out. +func quicConfig() *quic.Config { + return &quic.Config{ + KeepAlivePeriod: 10 * time.Second, + MaxIdleTimeout: 60 * time.Second, + InitialStreamReceiveWindow: 4 << 20, + MaxStreamReceiveWindow: 16 << 20, + InitialConnectionReceiveWindow: 8 << 20, + MaxConnectionReceiveWindow: 32 << 20, + } +} + +func dialAndPump(cfg *Config, self, peer Peer) error { + if len(self.Ports) != len(peer.Ports) { + return fmt.Errorf("port-count mismatch: self has %d ports, peer has %d", len(self.Ports), len(peer.Ports)) + } + + // 1. Establish ICE + wrap with a counting conn for byte-level telemetry. 
+ rawConn, err := dialICE(cfg, peer.ID) + if err != nil { + return fmt.Errorf("ice: %w", err) + } + defer rawConn.Close() + counted := newCountingConn(rawConn, peer.ID) + pkt := &iceConnPacketConn{conn: counted} + + // 2. Establish a QUIC connection on top of the ICE PacketConn. + // We replaced yamux here because pion/ice.Conn's UDP underlay drops + // packets under sustained load (NAT hairpinning loss between dstack + // workers is ~99%; even relay-via-coturn loses ~78%). yamux assumes + // a reliable byte-stream and dies as "keepalive timeout" or "recv + // window exceeded" — protocol violations triggered by lost packets. + // QUIC has built-in loss recovery + congestion control, so a lossy + // UDP underlay is exactly what it expects. Stream multiplex API is + // a near-drop-in for yamux: OpenStreamSync / AcceptStream. + isClient := cfg.SelfID < peer.ID + connCtx, connCancel := context.WithCancel(context.Background()) + defer connCancel() + dialCtx, dialCancel := context.WithTimeout(connCtx, 30*time.Second) + defer dialCancel() + + var qconn *quic.Conn + if isClient { + // remote net.Addr is ignored by our PacketConn shim (it only + // knows about the one ICE peer); we still pass something non-nil + // because quic.Dial uses it for SNI fallback / connection ID. + qconn, err = quic.Dial(dialCtx, pkt, counted.RemoteAddr(), clientTLS(), quicConfig()) + if err != nil { + return fmt.Errorf("quic dial: %w", err) + } + } else { + ln, lerr := quic.Listen(pkt, serverTLS(), quicConfig()) + if lerr != nil { + return fmt.Errorf("quic listen: %w", lerr) + } + // Close the listener once we have our one accepted conn — we + // only want a single QUIC connection per ICE pair. + acceptCtx, acceptCancel := context.WithTimeout(connCtx, 30*time.Second) + qconn, err = ln.Accept(acceptCtx) + acceptCancel() + ln.Close() + if err != nil { + return fmt.Errorf("quic accept: %w", err) + } + } + defer qconn.CloseWithError(0, "") + + // Periodic per-link telemetry. The counting conn tracks bytes through + // the underlying ice.Conn (i.e. wire bytes including QUIC overhead). + // QUIC's StreamCount isn't directly exposed, so we report just bytes. + stopStats := make(chan struct{}) + go reportLinkStats(peer.ID, counted, stopStats) + defer close(stopStats) + + // 3. Bind localhost UDP+TCP listeners for every one of peer's ports. + udpSocks := make([]*net.UDPConn, len(peer.Ports)) + tcpListeners := make([]*net.TCPListener, len(peer.Ports)) + for i, port := range peer.Ports { + udpSocks[i], err = net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv4(127, 0, 0, 1), Port: port}) + if err != nil { + return fmt.Errorf("udp listen 127.0.0.1:%d: %w", port, err) + } + defer udpSocks[i].Close() + tcpListeners[i], err = net.ListenTCP("tcp", &net.TCPAddr{IP: net.IPv4(127, 0, 0, 1), Port: port}) + if err != nil { + return fmt.Errorf("tcp listen 127.0.0.1:%d: %w", port, err) + } + defer tcpListeners[i].Close() + } + + // 4. Establish the per-port long-lived UDP streams. Client opens + // them eagerly, server's accept loop populates them as headers + // arrive. Both sides also run an accept loop to handle ad-hoc + // incoming TCP streams. 
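+	// Header example: the long-lived UDP pipe for peer port 18100 opens
+	// with bytes {0x55, 0x46, 0xB4} (streamUDP, then 18100 big-endian).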
+ udpStreams := make([]*quic.Stream, len(peer.Ports)) + allUDPReady := make(chan struct{}) + errCh := make(chan error, 4*len(peer.Ports)) + + go func() { + errCh <- runAcceptLoop(connCtx, qconn, &self, &peer, udpStreams, allUDPReady) + }() + + if isClient { + for i, peerPort := range peer.Ports { + s, err := qconn.OpenStreamSync(connCtx) + if err != nil { + return fmt.Errorf("quic OpenStreamSync: %w", err) + } + hdr := []byte{streamUDP, byte(peerPort >> 8), byte(peerPort & 0xff)} + if _, err := s.Write(hdr); err != nil { + return fmt.Errorf("quic write hdr: %w", err) + } + udpStreams[i] = s + } + close(allUDPReady) + } else { + // Server: wait for all UDP streams to register via accept loop. + select { + case <-allUDPReady: + case <-time.After(60 * time.Second): + return fmt.Errorf("timeout waiting for UDP streams") + } + } + + log.Printf("[%s] link up — %d ports forwarded (udp+tcp), peer reachable via ICE", + peer.ID, len(peer.Ports)) + + // 5. Start pumps for each port. + for i := range peer.Ports { + i := i + selfPort := self.Ports[i] + go func() { errCh <- pumpUDPSockToStream(udpSocks[i], udpStreams[i]) }() + go func() { + udpDst := &net.UDPAddr{IP: net.IPv4(127, 0, 0, 1), Port: selfPort} + errCh <- pumpUDPStreamToSock(udpStreams[i], udpSocks[i], udpDst) + }() + go func() { + peerPort := peer.Ports[i] + errCh <- acceptLocalTCP(connCtx, tcpListeners[i], qconn, peerPort) + }() + } + return <-errCh +} + +// runAcceptLoop handles every incoming QUIC stream from the peer. +// streamUDP headers are matched to the right slot in udpStreams (one per +// port, by index in self.Ports). streamTCP triggers a Dial to the +// corresponding local TCP service. +func runAcceptLoop(ctx context.Context, qconn *quic.Conn, self, peer *Peer, udpStreams []*quic.Stream, allUDPReady chan struct{}) error { + udpRegisteredCount := 0 + udpRegisteredOnce := make([]bool, len(self.Ports)) + for { + s, err := qconn.AcceptStream(ctx) + if err != nil { + return fmt.Errorf("quic accept: %w", err) + } + hdr := make([]byte, 3) + if _, err := io.ReadFull(s, hdr); err != nil { + s.CancelRead(0) + s.Close() + continue + } + tag := hdr[0] + port := int(hdr[1])<<8 | int(hdr[2]) + // "port" is the receiver-side port — we look it up in our own ports. 
+ idx := self.hasPort(port) + if idx < 0 { + log.Printf("[%s] stream for unknown self-port %d", peer.ID, port) + s.CancelRead(0) + s.Close() + continue + } + switch tag { + case streamUDP: + udpStreams[idx] = s + if !udpRegisteredOnce[idx] { + udpRegisteredOnce[idx] = true + udpRegisteredCount++ + if udpRegisteredCount == len(self.Ports) { + close(allUDPReady) + } + } + case streamTCP: + go handleIncomingTCP(s, &net.TCPAddr{IP: net.IPv4(127, 0, 0, 1), Port: port}) + default: + log.Printf("[%s] unknown stream tag 0x%x", peer.ID, tag) + s.CancelRead(0) + s.Close() + } + } +} + +func handleIncomingTCP(s *quic.Stream, dst *net.TCPAddr) { + defer s.Close() + c, err := net.DialTCP("tcp", nil, dst) + if err != nil { + log.Printf("dial local %s: %v", dst, err) + return + } + defer c.Close() + spliceBoth(s, c) +} + +func acceptLocalTCP(ctx context.Context, lis *net.TCPListener, qconn *quic.Conn, dstPeerPort int) error { + for { + c, err := lis.AcceptTCP() + if err != nil { + return fmt.Errorf("tcp accept: %w", err) + } + go func(c *net.TCPConn) { + defer c.Close() + s, err := qconn.OpenStreamSync(ctx) + if err != nil { + log.Printf("quic open: %v", err) + return + } + defer s.Close() + hdr := []byte{streamTCP, byte(dstPeerPort >> 8), byte(dstPeerPort & 0xff)} + if _, err := s.Write(hdr); err != nil { + return + } + spliceBoth(s, c) + }(c) + } +} + +func spliceBoth(a, b io.ReadWriteCloser) { + done := make(chan struct{}, 2) + go func() { io.Copy(a, b); done <- struct{}{} }() + go func() { io.Copy(b, a); done <- struct{}{} }() + <-done +} + +// ============================================================================= +// UDP-over-yamux: length-prefixed datagrams on the dedicated stream. +// ============================================================================= + +func pumpUDPSockToStream(sock *net.UDPConn, s *quic.Stream) error { + buf := make([]byte, 1500) + frame := make([]byte, 2+1500) + for { + n, _, err := sock.ReadFromUDP(buf) + if err != nil { + return fmt.Errorf("udp sock read: %w", err) + } + if n > 65535 { + continue + } + binary.BigEndian.PutUint16(frame[:2], uint16(n)) + copy(frame[2:], buf[:n]) + if _, err := s.Write(frame[:2+n]); err != nil { + return fmt.Errorf("udp stream write: %w", err) + } + } +} + +func pumpUDPStreamToSock(s *quic.Stream, sock *net.UDPConn, dst *net.UDPAddr) error { + hdr := make([]byte, 2) + buf := make([]byte, 65536) + for { + if _, err := io.ReadFull(s, hdr); err != nil { + return fmt.Errorf("udp stream read header: %w", err) + } + n := int(binary.BigEndian.Uint16(hdr)) + if _, err := io.ReadFull(s, buf[:n]); err != nil { + return fmt.Errorf("udp stream read body: %w", err) + } + if _, err := sock.WriteToUDP(buf[:n], dst); err != nil { + return fmt.Errorf("udp sock write: %w", err) + } + } +} + +// ============================================================================= +// Per-link instrumentation: count bytes through the ICE conn (i.e. the +// raw wire bytes including QUIC framing/encryption overhead) and log +// a periodic summary. Useful for diagnosing whether a link drop happens +// after 0 bytes, 1KB, or 100MB. 
+// ============================================================================= + +type countingConn struct { + net.Conn + peerID string + bytesIn atomic.Uint64 + bytesOut atomic.Uint64 + reads atomic.Uint64 + writes atomic.Uint64 +} + +func newCountingConn(c net.Conn, peerID string) *countingConn { + return &countingConn{Conn: c, peerID: peerID} +} + +func (c *countingConn) Read(p []byte) (int, error) { + n, err := c.Conn.Read(p) + c.bytesIn.Add(uint64(n)) + c.reads.Add(1) + if err != nil { + log.Printf("[%s] conn.Read err after %d bytes total / %d reads: %v", + c.peerID, c.bytesIn.Load(), c.reads.Load(), err) + } + return n, err +} + +func (c *countingConn) Write(p []byte) (int, error) { + n, err := c.Conn.Write(p) + c.bytesOut.Add(uint64(n)) + c.writes.Add(1) + if err != nil { + log.Printf("[%s] conn.Write err after %d bytes total / %d writes: %v", + c.peerID, c.bytesOut.Load(), c.writes.Load(), err) + } + return n, err +} + +// iceConnPacketConn adapts a pion/ice.Conn (packet-oriented net.Conn) to +// a net.PacketConn so quic-go can run on it. Every Read on ice.Conn +// returns one datagram; every Write sends one. The single-peer case +// means we can ignore the addr arg from quic and unconditionally route +// to the ICE peer that's already locked in. +type iceConnPacketConn struct { + conn *countingConn +} + +func (p *iceConnPacketConn) ReadFrom(b []byte) (int, net.Addr, error) { + n, err := p.conn.Read(b) + return n, p.conn.RemoteAddr(), err +} + +func (p *iceConnPacketConn) WriteTo(b []byte, _ net.Addr) (int, error) { + return p.conn.Write(b) +} + +func (p *iceConnPacketConn) Close() error { return p.conn.Close() } +func (p *iceConnPacketConn) LocalAddr() net.Addr { return p.conn.LocalAddr() } + +// Deadline methods delegate to ice.Conn (via the embedded net.Conn on +// countingConn) instead of being a no-op. quic-go relies on +// SetReadDeadline to interrupt a blocked ReadFrom when its context +// cancels — without this delegation, a quic.Dial whose context times +// out (e.g. because ICE went Failed mid-handshake) would hang forever +// in our shim, and the surrounding runPeerLink retry loop never gets +// to retry. Pion's ice.Conn implements the deadline methods, so this +// is the natural place to wire them through. +func (p *iceConnPacketConn) SetDeadline(t time.Time) error { return p.conn.SetDeadline(t) } +func (p *iceConnPacketConn) SetReadDeadline(t time.Time) error { return p.conn.SetReadDeadline(t) } +func (p *iceConnPacketConn) SetWriteDeadline(t time.Time) error { return p.conn.SetWriteDeadline(t) } + +// reportLinkStats logs a periodic summary per peer link. Once a minute, +// and only when bytes actually moved since the last tick, so an idle +// mesh stays quiet. Always logs the final summary on stop, regardless +// of activity, since that's what postmortems read. 
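+// A per-minute line looks like (numbers illustrative):
+//   [worker-1] link: in=9437184 (+524288 B/min) out=7340032 (+393216 B/min) reads=41210 writes=30815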
+func reportLinkStats(peerID string, conn *countingConn, stop <-chan struct{}) { + t := time.NewTicker(60 * time.Second) + defer t.Stop() + var lastIn, lastOut uint64 + for { + select { + case <-stop: + log.Printf("[%s] final stats: in=%d out=%d reads=%d writes=%d", + peerID, conn.bytesIn.Load(), conn.bytesOut.Load(), + conn.reads.Load(), conn.writes.Load()) + return + case <-t.C: + in, out := conn.bytesIn.Load(), conn.bytesOut.Load() + if in == lastIn && out == lastOut { + continue + } + log.Printf("[%s] link: in=%d (+%d B/min) out=%d (+%d B/min) reads=%d writes=%d", + peerID, in, in-lastIn, out, out-lastOut, + conn.reads.Load(), conn.writes.Load()) + lastIn, lastOut = in, out + } + } +} + +// ============================================================================= +// TLS — QUIC requires a TLS handshake. We don't rely on its identity +// guarantees (mesh peers are already authenticated by the dstack TEE +// layer + the TURN HMAC secret); a self-signed cert with no verification +// is fine here. We accept any peer cert because trust is established +// out-of-band before ICE even starts. +// ============================================================================= + +const quicALPN = "dstack-mesh-conn" + +func clientTLS() *tls.Config { + return &tls.Config{ + InsecureSkipVerify: true, + NextProtos: []string{quicALPN}, + } +} + +func serverTLS() *tls.Config { + priv, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) + if err != nil { + log.Fatalf("ecdsa keygen: %v", err) + } + tmpl := &x509.Certificate{ + SerialNumber: big.NewInt(1), + Subject: pkix.Name{CommonName: "mesh-conn"}, + NotBefore: time.Now().Add(-time.Hour), + NotAfter: time.Now().Add(365 * 24 * time.Hour), + KeyUsage: x509.KeyUsageDigitalSignature, + ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, + } + der, err := x509.CreateCertificate(rand.Reader, tmpl, tmpl, &priv.PublicKey, priv) + if err != nil { + log.Fatalf("self-signed cert: %v", err) + } + return &tls.Config{ + Certificates: []tls.Certificate{{ + Certificate: [][]byte{der}, + PrivateKey: priv, + }}, + NextProtos: []string{quicALPN}, + } +} + +// ============================================================================= +// ICE — one agent per peer pair +// ============================================================================= + +// peerSession is the shared state between dialICE (the current attempt +// to handshake) and pollLoop (delivering signalling messages). It is +// replaced wholesale on every reconnect so stale state from a previous +// failed attempt can't poison the next one. +type peerSession struct { + agent *ice.Agent + authCh chan [2]string +} + +var ( + sessionsMu sync.Mutex + sessions = map[string]*peerSession{} // key = remote peer id +) + +// currentSession returns the active session for remoteID, or nil if +// none exists yet. Used by pollLoop to find the right authCh / +// agent for incoming messages. +func currentSession(remoteID string) *peerSession { + sessionsMu.Lock() + defer sessionsMu.Unlock() + return sessions[remoteID] +} + +// installSession atomically replaces any previous session for +// remoteID. Called from dialICE on each new attempt, so any stale +// auth/candidate that pollLoop wrote to the *old* channel is left +// behind unreferenced and the new attempt starts from clean state. 
+func installSession(remoteID string, agent *ice.Agent) *peerSession { + sessionsMu.Lock() + defer sessionsMu.Unlock() + s := &peerSession{agent: agent, authCh: make(chan [2]string, 1)} + sessions[remoteID] = s + return s +} + +func dialICE(cfg *Config, remoteID string) (*ice.Conn, error) { + var urls []*stun.URI + if cfg.TurnHost != "" { + user, pass := turnCreds(cfg.TurnSecret, time.Hour) + urls = []*stun.URI{ + {Scheme: stun.SchemeTypeSTUN, Host: cfg.TurnHost, Port: 3478, Proto: stun.ProtoTypeUDP}, + {Scheme: stun.SchemeTypeTURN, Host: cfg.TurnHost, Port: 3478, Proto: stun.ProtoTypeUDP, Username: user, Password: pass}, + {Scheme: stun.SchemeTypeTURN, Host: cfg.TurnHost, Port: 3478, Proto: stun.ProtoTypeTCP, Username: user, Password: pass}, + } + } + + // MESH_CONN_RELAY_ONLY=1 restricts candidate gathering to Relay only. + // Use when direct (host/srflx/prflx) connectivity is unreliable — e.g. + // dstack worker-to-worker pairs where pion's connectivity check fails + // for every direct pair and the agent never gets to relay before + // timing out. Trades latency for guaranteed reachability via coturn. + candidateTypes := []ice.CandidateType{ + ice.CandidateTypeHost, + ice.CandidateTypeServerReflexive, + ice.CandidateTypePeerReflexive, + ice.CandidateTypeRelay, + } + if os.Getenv("MESH_CONN_RELAY_ONLY") == "1" { + candidateTypes = []ice.CandidateType{ice.CandidateTypeRelay} + } + agent, err := ice.NewAgent(&ice.AgentConfig{ + Urls: urls, + NetworkTypes: []ice.NetworkType{ice.NetworkTypeUDP4, ice.NetworkTypeTCP4}, + CandidateTypes: candidateTypes, + }) + if err != nil { + return nil, fmt.Errorf("NewAgent: %w", err) + } + // Install fresh session BEFORE doing any signalling so any partner + // auth/candidate we publish only ever resolves against this attempt. + // pollLoop will deliver messages here from now on. + sess := installSession(remoteID, agent) + + // dialCtx is cancelled either by ICE state Failed/Closed (terminal + // pion/ice states; agent.Dial/Accept won't recover from them on its + // own and would otherwise block forever) or by the 60s deadline below. + // runPeerLink retries the whole dialAndPump after we return — without + // the cancel, a single ICE failure wedges this peer slot indefinitely. + dialCtx, cancelDial := context.WithCancel(context.Background()) + defer cancelDial() + + closeAgent := func() { + // pion's Close is idempotent; safe in defers and callbacks both. + _ = agent.Close() + } + + if err := agent.OnCandidate(func(c ice.Candidate) { + if c == nil { + return + } + publish(cfg, remoteID, "candidate", c.Marshal()) + }); err != nil { + closeAgent() + return nil, err + } + if err := agent.OnConnectionStateChange(func(s ice.ConnectionState) { + log.Printf("[%s] ice state: %s", remoteID, s) + if s == ice.ConnectionStateFailed || s == ice.ConnectionStateClosed { + cancelDial() + } + }); err != nil { + closeAgent() + return nil, err + } + + localUfrag, localPwd, err := agent.GetLocalUserCredentials() + if err != nil { + closeAgent() + return nil, err + } + publish(cfg, remoteID, "auth", localUfrag+":"+localPwd) + + if err := agent.GatherCandidates(); err != nil { + closeAgent() + return nil, err + } + + var remote [2]string + select { + case remote = <-sess.authCh: + case <-time.After(60 * time.Second): + closeAgent() + return nil, fmt.Errorf("timeout waiting for remote auth from %s", remoteID) + } + + // 60s is comfortably longer than pion's default 30s connectivity-check + // window. 
If Dial/Accept hasn't succeeded by then, ICE has already + // transitioned to Failed and the state callback above cancelled the ctx. + dialTimer := time.AfterFunc(60*time.Second, cancelDial) + defer dialTimer.Stop() + + var conn *ice.Conn + if cfg.SelfID < remoteID { + conn, err = agent.Dial(dialCtx, remote[0], remote[1]) + } else { + conn, err = agent.Accept(dialCtx, remote[0], remote[1]) + } + if err != nil { + closeAgent() + return nil, err + } + + if pair, perr := agent.GetSelectedCandidatePair(); perr == nil && pair != nil { + // Log full addresses + types so we can correlate stuck links against + // specific NAT mappings / TURN allocations on coturn. + log.Printf("[%s] selected pair: %s %s:%d <-> %s %s:%d (proto=%s)", + remoteID, + pair.Local.Type(), pair.Local.Address(), pair.Local.Port(), + pair.Remote.Type(), pair.Remote.Address(), pair.Remote.Port(), + pair.Local.NetworkType().NetworkShort()) + } + return conn, nil +} + +func turnCreds(secret string, ttl time.Duration) (string, string) { + exp := time.Now().Add(ttl).Unix() + user := fmt.Sprintf("%d:meshconn", exp) + h := hmac.New(sha1.New, []byte(secret)) + h.Write([]byte(user)) + return user, base64.StdEncoding.EncodeToString(h.Sum(nil)) +} + +// ============================================================================= +// signaling — same wire format as phase-0/icetest +// ============================================================================= + +type Message struct { + From string `json:"from"` + Type string `json:"type"` + Data string `json:"data"` +} + +func publish(cfg *Config, to, typ, data string) { + body, _ := json.Marshal(Message{From: cfg.SelfID, Type: typ, Data: data}) + resp, err := http.Post(cfg.SignalingURL+"/publish?to="+url.QueryEscape(to), + "application/json", strings.NewReader(string(body))) + if err != nil { + log.Printf("publish err: %v", err) + return + } + io.Copy(io.Discard, resp.Body) + resp.Body.Close() +} + +func pollLoop(cfg *Config) { + for { + resp, err := http.Get(cfg.SignalingURL + "/poll?peer=" + url.QueryEscape(cfg.SelfID)) + if err != nil { + log.Printf("poll err: %v", err) + time.Sleep(time.Second) + continue + } + var msgs []Message + if err := json.NewDecoder(resp.Body).Decode(&msgs); err != nil { + log.Printf("poll decode: %v", err) + resp.Body.Close() + time.Sleep(time.Second) + continue + } + resp.Body.Close() + for _, m := range msgs { + sess := currentSession(m.From) + if sess == nil { + // No active dialICE attempt for this remote yet; drop. + // On reconnect both sides re-enter dialICE and publish + // fresh auth/candidates, so dropping stale messages from + // before our local attempt is what we want. + continue + } + switch m.Type { + case "auth": + parts := strings.SplitN(m.Data, ":", 2) + if len(parts) != 2 { + log.Printf("[%s] bad auth %q", m.From, m.Data) + continue + } + // Always keep the LATEST auth. select-default would drop + // the new one — and if the buffered one was stale (from + // before the peer's last bounce), dialICE would consume + // that stale auth, Dial against the wrong ufrag, ICE + // would Fail, and we'd repeat forever. Drain-then-push + // ensures the channel always holds the most-recent auth. 
+ select { + case <-sess.authCh: + default: + } + sess.authCh <- [2]string{parts[0], parts[1]} + case "candidate": + if sess.agent == nil { + continue + } + cand, err := ice.UnmarshalCandidate(m.Data) + if err != nil { + log.Printf("[%s] bad candidate: %v", m.From, err) + continue + } + if err := sess.agent.AddRemoteCandidate(cand); err != nil { + log.Printf("[%s] AddRemoteCandidate: %v", m.From, err) + } + } + } + } +} diff --git a/consul-postgres-ha/mesh-conn/validate_test.go b/consul-postgres-ha/mesh-conn/validate_test.go new file mode 100644 index 0000000..07e0a54 --- /dev/null +++ b/consul-postgres-ha/mesh-conn/validate_test.go @@ -0,0 +1,171 @@ +package main + +import ( + "net" + "strings" + "testing" + "time" +) + +func TestValidatePeers_OK(t *testing.T) { + cfg := &Config{ + SelfID: "ctrl", + Peers: []Peer{ + {ID: "ctrl", Ports: []int{18000, 18100}}, + {ID: "w1", Ports: []int{18001, 18101}}, + }, + } + if err := validatePeers(cfg); err != nil { + t.Fatalf("unexpected: %v", err) + } +} + +func TestValidatePeers_PortCollision(t *testing.T) { + cfg := &Config{ + SelfID: "ctrl", + Peers: []Peer{ + {ID: "ctrl", Ports: []int{18000, 18100}}, + {ID: "w1", Ports: []int{18000, 18101}}, // 18000 collides with ctrl + }, + } + err := validatePeers(cfg) + if err == nil || !strings.Contains(err.Error(), "claimed by both") { + t.Fatalf("want collision error, got %v", err) + } +} + +func TestValidatePeers_MismatchedPortCount(t *testing.T) { + cfg := &Config{ + SelfID: "ctrl", + Peers: []Peer{ + {ID: "ctrl", Ports: []int{18000, 18100, 18200}}, + {ID: "w1", Ports: []int{18001, 18101}}, // missing one + }, + } + err := validatePeers(cfg) + if err == nil || !strings.Contains(err.Error(), "expected 3") { + t.Fatalf("want port-count mismatch, got %v", err) + } +} + +func TestValidatePeers_SelfNotInPeers(t *testing.T) { + cfg := &Config{ + SelfID: "missing", + Peers: []Peer{ + {ID: "ctrl", Ports: []int{18000}}, + {ID: "w1", Ports: []int{18001}}, + }, + } + err := validatePeers(cfg) + if err == nil || !strings.Contains(err.Error(), "not in PEERS_JSON") { + t.Fatalf("want self-missing error, got %v", err) + } +} + +func TestValidatePeers_DuplicateID(t *testing.T) { + cfg := &Config{ + SelfID: "ctrl", + Peers: []Peer{ + {ID: "ctrl", Ports: []int{18000}}, + {ID: "ctrl", Ports: []int{18001}}, + }, + } + err := validatePeers(cfg) + if err == nil || !strings.Contains(err.Error(), "twice") { + t.Fatalf("want duplicate-id error, got %v", err) + } +} + +func TestValidatePeers_EmptyPorts(t *testing.T) { + cfg := &Config{ + SelfID: "ctrl", + Peers: []Peer{ + {ID: "ctrl", Ports: []int{18000}}, + {ID: "w1", Ports: []int{}}, + }, + } + err := validatePeers(cfg) + if err == nil || !strings.Contains(err.Error(), "empty Ports") { + t.Fatalf("want empty-ports error, got %v", err) + } +} + +func TestValidatePeers_PortOutOfRange(t *testing.T) { + cfg := &Config{ + SelfID: "ctrl", + Peers: []Peer{ + {ID: "ctrl", Ports: []int{18000}}, + {ID: "w1", Ports: []int{0}}, + }, + } + err := validatePeers(cfg) + if err == nil || !strings.Contains(err.Error(), "out of range") { + t.Fatalf("want out-of-range error, got %v", err) + } +} + +func TestValidatePeers_DigestStableUnderReorder(t *testing.T) { + a := []Peer{ + {ID: "ctrl", Ports: []int{18000, 18100}}, + {ID: "w1", Ports: []int{18001, 18101}}, + } + b := []Peer{ + {ID: "w1", Ports: []int{18001, 18101}}, + {ID: "ctrl", Ports: []int{18000, 18100}}, + } + if peersDigest(a) != peersDigest(b) { + t.Fatalf("digest changes with peer order: %s vs %s", peersDigest(a), peersDigest(b)) + } +} 
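
The `validatePeers` and `peersDigest` implementations these tests exercise are not part of this diff. Purely as a reading aid, here is a minimal sketch consistent with the assertions above — the error wording and the order-independent digest come straight from the tests; the type shapes and everything else are assumptions:

```go
package main

import (
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"sort"
)

// Shapes as the tests use them; the real definitions live elsewhere in mesh-conn.
type Peer struct {
	ID    string
	Ports []int
}

type Config struct {
	SelfID string
	Peers  []Peer
}

// validatePeers enforces the invariants the tests assert: unique IDs, equal
// port counts, ports in range, no port claimed twice, and self present.
func validatePeers(cfg *Config) error {
	if len(cfg.Peers) == 0 {
		return fmt.Errorf("PEERS_JSON has no peers")
	}
	want := len(cfg.Peers[0].Ports)
	seen := map[string]bool{}
	owner := map[int]string{}
	selfFound := false
	for _, p := range cfg.Peers {
		if seen[p.ID] {
			return fmt.Errorf("peer id %q appears twice", p.ID)
		}
		seen[p.ID] = true
		if p.ID == cfg.SelfID {
			selfFound = true
		}
		if len(p.Ports) == 0 {
			return fmt.Errorf("peer %q has empty Ports", p.ID)
		}
		if len(p.Ports) != want {
			return fmt.Errorf("peer %q has %d ports, expected %d", p.ID, len(p.Ports), want)
		}
		for _, port := range p.Ports {
			if port < 1 || port > 65535 {
				return fmt.Errorf("peer %q port %d out of range", p.ID, port)
			}
			if other, dup := owner[port]; dup {
				return fmt.Errorf("port %d claimed by both %q and %q", port, other, p.ID)
			}
			owner[port] = p.ID
		}
	}
	if !selfFound {
		return fmt.Errorf("self id %q not in PEERS_JSON", cfg.SelfID)
	}
	return nil
}

// peersDigest hashes the peer set sorted by ID, so reordering PEERS_JSON
// keeps the digest stable while any port change alters it.
func peersDigest(peers []Peer) string {
	sorted := append([]Peer(nil), peers...)
	sort.Slice(sorted, func(i, j int) bool { return sorted[i].ID < sorted[j].ID })
	h := sha256.New()
	for _, p := range sorted {
		fmt.Fprintf(h, "%s:%v;", p.ID, p.Ports)
	}
	return hex.EncodeToString(h.Sum(nil))
}

func main() {
	cfg := &Config{SelfID: "ctrl", Peers: []Peer{
		{ID: "ctrl", Ports: []int{18000, 18100}},
		{ID: "w1", Ports: []int{18001, 18101}},
	}}
	if err := validatePeers(cfg); err != nil {
		panic(err)
	}
	fmt.Println("digest:", peersDigest(cfg.Peers))
}
```

Any sketch that produces those error substrings and an ID-sorted digest would satisfy the tests equally well; the point is the invariants, not this particular code.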
+ +func TestValidatePeers_DigestDiffersWithDifferentPorts(t *testing.T) { + a := []Peer{ + {ID: "ctrl", Ports: []int{18000}}, + {ID: "w1", Ports: []int{18001}}, + } + b := []Peer{ + {ID: "ctrl", Ports: []int{18000}}, + {ID: "w1", Ports: []int{18002}}, // different + } + if peersDigest(a) == peersDigest(b) { + t.Fatalf("digest collides for different ports") + } +} + +// iceConnPacketConn must delegate deadline methods to the underlying +// conn so quic-go can interrupt blocked reads on context cancel. +// Returning nil from these methods (the previous behavior) leaves +// quic.Dial hung when ICE goes Failed mid-handshake — the surrounding +// runPeerLink retry loop then never gets to retry. Verified once at +// 2026-05-04 against the live cluster; this test pins the behavior so +// a future refactor doesn't regress. +func TestIceConnPacketConn_DeadlinesPropagate(t *testing.T) { + a, b := net.Pipe() + defer a.Close() + defer b.Close() + + pkt := &iceConnPacketConn{conn: newCountingConn(a, "test")} + + deadline := time.Now().Add(50 * time.Millisecond) + if err := pkt.SetReadDeadline(deadline); err != nil { + t.Fatalf("SetReadDeadline: %v", err) + } + + buf := make([]byte, 100) + start := time.Now() + _, _, err := pkt.ReadFrom(buf) + elapsed := time.Since(start) + + if err == nil { + t.Fatal("ReadFrom returned nil error past the deadline") + } + netErr, ok := err.(net.Error) + if !ok || !netErr.Timeout() { + t.Fatalf("expected timeout net.Error, got %v (%T)", err, err) + } + // Generous bounds: net.Pipe's deadline implementation is precise + // enough that 40-300ms covers test-VM jitter without flakes. + if elapsed < 40*time.Millisecond || elapsed > 300*time.Millisecond { + t.Errorf("ReadFrom returned in %v, expected ~50ms", elapsed) + } +} diff --git a/consul-postgres-ha/mesh-sidecar/Dockerfile b/consul-postgres-ha/mesh-sidecar/Dockerfile new file mode 100644 index 0000000..92d6cb0 --- /dev/null +++ b/consul-postgres-ha/mesh-sidecar/Dockerfile @@ -0,0 +1,53 @@ +# Single image containing every platform-plumbing process a +# consul-postgres-ha CVM runs: +# +# bootstrap-secrets one-shot init, derives per-CVM secrets from the +# dstack KMS and writes /run/instance/info.json +# mesh-conn QUIC-on-pion/ICE userspace overlay +# consul agent (server on coordinator CVMs, client on +# worker CVMs) +# envoy Connect mTLS data plane (workers only) +# +# Build context is the parent `consul-postgres-ha/` directory so this +# Dockerfile can COPY both Go sources straight in. CI configures that +# context via .github/workflows/consul-postgres-ha-publish.yml. +# +# The final stage inherits envoyproxy/envoy because envoy is the +# largest binary and the only one not statically linked — its base +# already carries the right glibc + ca-certs. The other binaries are +# CGO_ENABLED=0 Go builds that work on any base. + +FROM golang:1.24-alpine AS bootstrap-build +WORKDIR /src +COPY bootstrap-secrets/go.mod bootstrap-secrets/go.sum ./ +RUN go mod download +COPY bootstrap-secrets/*.go ./ +RUN CGO_ENABLED=0 go build -o /out/bootstrap-secrets . + +FROM golang:1.24-alpine AS mesh-build +WORKDIR /src +COPY mesh-conn/go.mod mesh-conn/go.sum ./ +RUN go mod download +COPY mesh-conn/*.go ./ +RUN CGO_ENABLED=0 go build -o /out/mesh-conn . + +FROM hashicorp/consul:1.19 AS consul-bin + +FROM envoyproxy/envoy:contrib-v1.30-latest + +# tini = correct PID 1 reaping + signal forwarding; +# jq, curl = used by the entrypoint and convenient for `docker exec`-debug. 
+RUN apt-get update && apt-get install -y --no-install-recommends \ + ca-certificates curl jq tini && \ + rm -rf /var/lib/apt/lists/* + +COPY --from=bootstrap-build /out/bootstrap-secrets /usr/local/bin/bootstrap-secrets +COPY --from=mesh-build /out/mesh-conn /usr/local/bin/mesh-conn +COPY --from=consul-bin /bin/consul /usr/local/bin/consul +COPY mesh-sidecar/entrypoint.sh /usr/local/bin/entrypoint.sh +RUN chmod +x /usr/local/bin/entrypoint.sh + +# Persistent consul state — workers' KV cache and coordinators' Raft log. +VOLUME ["/consul/data"] + +ENTRYPOINT ["/usr/bin/tini", "--", "/usr/local/bin/entrypoint.sh"] diff --git a/consul-postgres-ha/mesh-sidecar/README.md b/consul-postgres-ha/mesh-sidecar/README.md new file mode 100644 index 0000000..3cb723e --- /dev/null +++ b/consul-postgres-ha/mesh-sidecar/README.md @@ -0,0 +1,67 @@ +# consul-postgres-ha-mesh-sidecar + +The single image that holds every platform-plumbing process a worker or +coordinator CVM runs: + +| Process | Role | +|--------------------|----------------------------------------------------------| +| `bootstrap-secrets` | One-shot init: derives per-CVM secrets from the dstack TEE KMS, claims an ordinal, writes `/run/instance/info.json`. | +| `mesh-conn` | QUIC-on-pion/ICE overlay: forwards Consul gossip + RPC + HTTP ports between peer CVMs over a NAT'd L3 path. | +| `consul` | Server on coordinator CVMs (`-server -bootstrap-expect=N -ui`), client on worker CVMs. Joins via mesh-conn-forwarded loopback ports. | +| `envoy` | Connect-mTLS data plane on workers. Bootstrapped from the local consul agent's xDS once it's reachable. Coordinators don't run it. | + +Replaces what used to be four separate compose services +(`bootstrap-secrets`, `mesh-conn`, `consul`, and the old envoy-only +`sidecar`) plus the legacy `keepalive` placeholder. + +The compose-service name stays `sidecar` (so the per-CVM container +name is `dstack-sidecar-1` regardless of which image it points at); +the *image* is `consul-postgres-ha-mesh-sidecar`. The "mesh-" prefix +is meant to make it obvious that this is the bundle of mesh +plumbing — bootstrap-secrets + mesh-conn + consul + envoy — and not +just an Envoy sidecar. + +## Lifecycle + +`tini → entrypoint.sh` is PID 1. The script: + +1. Runs `bootstrap-secrets` to completion (it's a one-shot — exit 0 + means `/run/instance/info.json` and `/run/secrets/*` are in place). +2. Starts `mesh-conn` in the background. +3. Starts `consul agent` in the background, with `-server` + + `-bootstrap-expect=N` if `ROLE=coordinator`. +4. (Workers only) Polls `consul connect envoy -bootstrap` until the + local consul agent answers, then exec's envoy. +5. `wait -n`s on all background processes — if any one exits, the + container exits with that code, and compose's + `restart: on-failure` brings it back. + +This is "shell init", not s6-overlay. If we hit real-world flap-storms +where one inner process dying often takes the whole container down, the +upgrade path is per-process supervision via s6 — but for phase 1 it +doesn't pay its complexity. + +## Debugging + +```bash +# Log stream for the whole sidecar — every line is prefixed with the +# inner process name ([bootstrap-secrets] / [mesh-conn] / [consul] / +# [envoy] / [init]). +docker logs dstack-sidecar-1 + +# Inspect what's running inside. +docker exec dstack-sidecar-1 ps -ef + +# Talk to the local consul agent (handy for cluster status / KV). 
+docker exec dstack-sidecar-1 sh -c 'consul members -http-addr=127.0.0.1:$(jq -r .ports.http_api /run/instance/info.json)' + +# Curl the local Patroni REST API or webdemo from inside the sidecar. +docker exec dstack-sidecar-1 sh -c 'curl -s http://127.0.0.1:$(jq -r .ports.patroni_rest /run/instance/info.json)/cluster | jq' +``` + +## Build context + +CI builds this image with `consul-postgres-ha/` as the docker context +(not `consul-postgres-ha/mesh-sidecar/`) so the Dockerfile can `COPY +bootstrap-secrets/` and `COPY mesh-conn/` from sibling directories. +See `.github/workflows/consul-postgres-ha-publish.yml`. diff --git a/consul-postgres-ha/mesh-sidecar/entrypoint.sh b/consul-postgres-ha/mesh-sidecar/entrypoint.sh new file mode 100644 index 0000000..a6eb701 --- /dev/null +++ b/consul-postgres-ha/mesh-sidecar/entrypoint.sh @@ -0,0 +1,150 @@ +#!/bin/bash +# PID 1 inside the consolidated dstack-mesh sidecar container. Runs the +# four platform-plumbing processes that used to be four separate compose +# services (bootstrap-secrets, mesh-conn, consul, envoy) inside one +# container. tini wraps this script so signal-forwarding + PID 1 reaping +# behave like other tools expect. +# +# Order is fixed by real dependencies: +# 1. bootstrap-secrets runs to completion — writes /run/secrets/* and +# /run/instance/info.json that everything else reads. +# 2. mesh-conn starts and forwards the Consul gossip + RPC ports to +# peer CVMs over QUIC-on-ICE. +# 3. consul agent starts (server on coordinators, client on workers) +# and joins the cluster via mesh-conn's local-loopback forwards. +# 4. envoy bootstraps from the local consul agent and starts the +# sidecar data plane. Workers only — coordinators don't host +# a Connect-mTLS workload. +# +# Phase-1 supervision policy: any one inner process dying takes the +# whole container down. Compose `restart: on-failure` brings it back +# in ~5s, well inside Patroni's 30s lock TTL — same effective recovery +# behavior as the old four-container layout, where any one of those +# containers crashing also resulted in a single ~5s restart. +# +# Per-process logs are prefixed with `[]` so `docker logs +# dstack-sidecar-1` stays readable. Stderr is merged into stdout so a +# single `docker logs` stream sees everything. + +set -euo pipefail +exec 2>&1 + +prefix() { sed -u "s/^/[$1] /" || cat; } +log() { echo "[init] $*"; } + +ROLE="${ROLE:?ROLE must be set (coordinator|worker)}" +log "starting consolidated sidecar, role=$ROLE" + +# ---- 1. bootstrap-secrets (one-shot, must complete) ---- +log "running bootstrap-secrets" +/usr/local/bin/bootstrap-secrets 2>&1 | prefix bootstrap-secrets +INFO=/run/instance/info.json +[ -s "$INFO" ] || { log "bootstrap-secrets did not write $INFO"; exit 1; } + +# Identity/ports computed by bootstrap-secrets — read once, reuse. +PEER_ID=$(jq -r '.role + "-" + (.ordinal|tostring)' "$INFO") +ORDINAL=$(jq -r '.ordinal' "$INFO") +SERF=$(jq -r '.ports.serf_lan' "$INFO") +RPC=$(jq -r '.ports.server_rpc' "$INFO") +HTTP_PORT=$(jq -r '.ports.http_api' "$INFO") +GRPC_PORT=$(jq -r '.ports.grpc' "$INFO") +log "identity: peer=$PEER_ID ordinal=$ORDINAL serf=$SERF http=$HTTP_PORT" + +# ---- 2. mesh-conn (background, long-running) ---- +log "starting mesh-conn" +/usr/local/bin/mesh-conn 2>&1 | prefix mesh-conn & +MESH=$! + +# ---- 3. consul agent (background, long-running) ---- +# Build -retry-join args from COORDINATOR_SERF_PORTS (comma-separated). 
+# Workers retry-join every coordinator port (mesh-conn forwards each one +# to its actual coordinator via loopback). Coordinators retry-join every +# coordinator port EXCEPT their own — that's how the server quorum +# gossips itself together. +RETRYJOIN=() +for p in $(echo "${COORDINATOR_SERF_PORTS}" | tr ',' ' '); do + if [ "$ROLE" = "coordinator" ] && [ "$p" = "$SERF" ]; then + continue + fi + RETRYJOIN+=("-retry-join=127.0.0.1:$p") +done + +CONSUL_ARGS=( + agent + -node="$PEER_ID" + -datacenter="${CLUSTER_NAME}" + -bind=127.0.0.1 -advertise=127.0.0.1 -client=127.0.0.1 + -serf-lan-port="$SERF" + -server-port="$RPC" + -http-port="$HTTP_PORT" + -grpc-port="$GRPC_PORT" + -dns-port=-1 + "${RETRYJOIN[@]}" + -data-dir=/consul/data + -hcl='connect { enabled = true }' + -log-level=INFO +) +if [ "$ROLE" = "coordinator" ]; then + CONSUL_ARGS=( + "${CONSUL_ARGS[@]}" + -server + -bootstrap-expect="${BOOTSTRAP_EXPECT}" + -ui + ) +fi + +log "starting consul agent" +/usr/local/bin/consul "${CONSUL_ARGS[@]}" 2>&1 | prefix consul & +CONSUL=$! + +# ---- 4. envoy sidecar (workers only) ---- +ENVOY= +if [ "$ROLE" = "worker" ]; then + ADMIN_PORT=$((19100 + ORDINAL)) + log "starting envoy bootstrap loop (admin=$ADMIN_PORT)" + ( + # Wait for the local consul agent to accept connections, then + # generate the Envoy bootstrap config and exec envoy. The wait + # loop is identical in spirit to the old sidecar/ entrypoint; + # it tolerates the consul process taking a few seconds to listen. + until consul connect envoy \ + -sidecar-for="webdemo-${PEER_ID}" \ + -admin-bind="127.0.0.1:${ADMIN_PORT}" \ + -bootstrap \ + -http-addr="127.0.0.1:${HTTP_PORT}" \ + -grpc-addr="127.0.0.1:${GRPC_PORT}" \ + > /tmp/envoy-bootstrap.json 2>/dev/null; do + echo "waiting for sidecar registration..." + sleep 3 + done + exec envoy -c /tmp/envoy-bootstrap.json -l info + ) 2>&1 | prefix envoy & + ENVOY=$! +fi + +CHILDREN=("$MESH" "$CONSUL") +[ -n "$ENVOY" ] && CHILDREN+=("$ENVOY") + +# Forward SIGTERM/SIGINT to all background pipelines. Each inner +# process is the head of a `cmd | prefix` pipeline; killing the +# pipeline group is enough — sed exits when the upstream closes. +shutdown() { + log "received signal, terminating children" + for c in "${CHILDREN[@]}"; do + kill -TERM "$c" 2>/dev/null || true + done +} +trap shutdown TERM INT + +# Block until ANY child exits; then reap the rest and let compose's +# `restart: on-failure` handle re-bringup. The `|| EXIT=$?` form keeps +# `set -e` from aborting the script when wait sees a non-zero rc — we +# want to fall through and clean up siblings before exiting. +EXIT=0 +wait -n "${CHILDREN[@]}" || EXIT=$? +log "child exited (code=$EXIT) — tearing down sidecar" +for c in "${CHILDREN[@]}"; do + kill -TERM "$c" 2>/dev/null || true +done +wait || true +exit "$EXIT" diff --git a/consul-postgres-ha/patroni/Dockerfile b/consul-postgres-ha/patroni/Dockerfile new file mode 100644 index 0000000..044fb6e --- /dev/null +++ b/consul-postgres-ha/patroni/Dockerfile @@ -0,0 +1,21 @@ +# Patroni + PostgreSQL 16 on Alpine. Small enough (~250MB) to pull +# fast on CVM boot; the official Spilo images are ~1GB. +# +# The entrypoint reads /run/instance/info.json (written by +# bootstrap-secrets) for ordinal + per-protocol ports, then renders +# /etc/patroni.yml and execs `patroni`. +FROM postgres:16-alpine + +RUN apk add --no-cache python3 py3-pip py3-psycopg2 py3-yaml jq tini su-exec + +# python-consul + python-etcd are optional; we only ship the consul DCS. 
+RUN pip install --no-cache-dir --break-system-packages \ + "patroni[consul]==4.0.4" + +# Patroni needs a writeable home for sockets and pid files. +RUN mkdir -p /var/lib/patroni && chown postgres:postgres /var/lib/patroni + +COPY entrypoint.sh /usr/local/bin/entrypoint.sh +RUN chmod +x /usr/local/bin/entrypoint.sh + +ENTRYPOINT ["/sbin/tini", "--", "/usr/local/bin/entrypoint.sh"] diff --git a/consul-postgres-ha/patroni/entrypoint.sh b/consul-postgres-ha/patroni/entrypoint.sh new file mode 100644 index 0000000..ce7a30e --- /dev/null +++ b/consul-postgres-ha/patroni/entrypoint.sh @@ -0,0 +1,112 @@ +#!/bin/sh +# Render patroni.yml from /run/instance/info.json + env, then exec patroni. +# +# All Patroni instances in the cluster: +# - share the same `scope` (CLUSTER_NAME) — that's what makes them a +# single Patroni cluster +# - have a unique `name` (the peer ID, e.g. worker-1) +# - register their postgres + REST addresses against 127.0.0.1 — the +# mesh-conn UDP forwarder maps each peer's per-ordinal ports to the +# real peer's listener, so 127.0.0.1: from any +# peer reaches that peer's postgres. +# +# Replication user/password is derived deterministically from the +# cluster-wide `replication` secret written by bootstrap-secrets. +# Same trick for the superuser. + +set -e + +INFO=/run/instance/info.json +SECRETS=/run/secrets + +if [ ! -f "$INFO" ]; then + echo "FATAL: $INFO not present — bootstrap-secrets did not run" >&2 + exit 1 +fi + +ROLE=$(jq -r '.role' "$INFO") +ORD=$(jq -r '.ordinal' "$INFO") +PEER_ID="${ROLE}-${ORD}" +PG_PORT=$(jq -r '.ports.postgres' "$INFO") +REST_PORT=$(jq -r '.ports.patroni_rest' "$INFO") +CONSUL_PORT=$(jq -r '.ports.http_api' "$INFO") +CLUSTER="${CLUSTER_NAME:?CLUSTER_NAME required}" + +# Read or default the credentials. bootstrap-secrets writes +# /run/secrets/{patroni-superuser,patroni-replication} as raw 32-byte +# hex strings (deterministic per-cluster via getKey()). +SUPERUSER_PW=$(cat "${SECRETS}/patroni-superuser" 2>/dev/null || echo dev-pg-pass) +REPL_PW=$(cat "${SECRETS}/patroni-replication" 2>/dev/null || echo dev-repl-pass) + +DATA_DIR=/var/lib/patroni/pgdata +mkdir -p "$DATA_DIR" +chown -R postgres:postgres "$DATA_DIR" /var/lib/patroni +chmod 700 "$DATA_DIR" + +cat > /etc/patroni.yml < len(buf) { + n = len(buf) + } + // Mutate buf slightly each iteration so the hash is meaningful. + buf[0]++ + if _, err := stream.Write(buf[:n]); err != nil { + log.Fatalf("stream write at %d: %v", written, err) + } + h.Write(buf[:n]) + written += n + } + if err := stream.Close(); err != nil { + log.Fatalf("stream close: %v", err) + } + dur := time.Since(start) + log.Printf("client done: wrote %d B in %s (%.2f MB/s) sha256=%s", + written, dur, float64(written)/dur.Seconds()/(1<<20), + hex.EncodeToString(h.Sum(nil))) + + // Wait for peer to ack via close-with-error. 
+ <-conn.Context().Done() + log.Printf("conn ctx done: %v", context.Cause(conn.Context())) +} + +func runQUICServer(pkt net.PacketConn, peerID string, expectedMB int) { + tlsConf := selfSignedTLS() + tlsConf.NextProtos = []string{"mesh-conn-smoke"} + ln, err := quic.Listen(pkt, tlsConf, &quic.Config{ + KeepAlivePeriod: 5 * time.Second, + MaxIdleTimeout: 60 * time.Second, + InitialStreamReceiveWindow: 4 << 20, + MaxStreamReceiveWindow: 16 << 20, + InitialConnectionReceiveWindow: 8 << 20, + MaxConnectionReceiveWindow: 32 << 20, + }) + if err != nil { + log.Fatalf("quic listen: %v", err) + } + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel() + conn, err := ln.Accept(ctx) + if err != nil { + log.Fatalf("accept: %v", err) + } + log.Printf("quic accepted peer %s", peerID) + + stream, err := conn.AcceptStream(ctx) + if err != nil { + log.Fatalf("accept stream: %v", err) + } + log.Printf("stream %d accepted", stream.StreamID()) + + h := sha256.New() + start := time.Now() + n, err := io.Copy(h, stream) + dur := time.Since(start) + if err != nil { + log.Fatalf("read: copied %d before err: %v", n, err) + } + log.Printf("server done: read %d B in %s (%.2f MB/s) sha256=%s", + n, dur, float64(n)/dur.Seconds()/(1<<20), + hex.EncodeToString(h.Sum(nil))) + + expected := int64(expectedMB) * 1024 * 1024 + if n != expected { + log.Fatalf("byte count mismatch: got %d want %d", n, expected) + } + conn.CloseWithError(0, "ok") +} + +func selfSignedTLS() *tls.Config { + priv, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) + if err != nil { + log.Fatalf("genkey: %v", err) + } + tmpl := &x509.Certificate{ + SerialNumber: big.NewInt(1), + Subject: pkix.Name{CommonName: "mesh-conn-smoke"}, + NotBefore: time.Now().Add(-time.Hour), + NotAfter: time.Now().Add(24 * time.Hour), + KeyUsage: x509.KeyUsageDigitalSignature, + ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, + } + der, err := x509.CreateCertificate(rand.Reader, tmpl, tmpl, &priv.PublicKey, priv) + if err != nil { + log.Fatalf("createcert: %v", err) + } + return &tls.Config{ + Certificates: []tls.Certificate{{ + Certificate: [][]byte{der}, + PrivateKey: priv, + }}, + } +} + +// ============================================================================= +// minimal ICE dial — copy of mesh-conn's signalling shape +// ============================================================================= + +type peerSession struct { + agent *ice.Agent + authCh chan [2]string +} + +var ( + sessionsMu sync.Mutex + sessions = map[string]*peerSession{} +) + +func dialICE(self, remote, signalURL, turnHost, turnSecret string, relayOnly bool) (*ice.Conn, error) { + var urls []*stun.URI + if turnHost != "" { + user, pass := turnCreds(turnSecret, time.Hour) + urls = []*stun.URI{ + {Scheme: stun.SchemeTypeSTUN, Host: turnHost, Port: 3478, Proto: stun.ProtoTypeUDP}, + {Scheme: stun.SchemeTypeTURN, Host: turnHost, Port: 3478, Proto: stun.ProtoTypeUDP, Username: user, Password: pass}, + {Scheme: stun.SchemeTypeTURN, Host: turnHost, Port: 3478, Proto: stun.ProtoTypeTCP, Username: user, Password: pass}, + } + } + candTypes := []ice.CandidateType{ + ice.CandidateTypeHost, + ice.CandidateTypeServerReflexive, + ice.CandidateTypePeerReflexive, + ice.CandidateTypeRelay, + } + if relayOnly { + candTypes = []ice.CandidateType{ice.CandidateTypeRelay} + } + agent, err := ice.NewAgent(&ice.AgentConfig{ + Urls: urls, + NetworkTypes: []ice.NetworkType{ice.NetworkTypeUDP4, ice.NetworkTypeTCP4}, + CandidateTypes: candTypes, + }) + if err != 
nil { + return nil, err + } + sess := &peerSession{agent: agent, authCh: make(chan [2]string, 1)} + sessionsMu.Lock() + sessions[remote] = sess + sessionsMu.Unlock() + + go pollLoop(self, signalURL) + + if err := agent.OnCandidate(func(c ice.Candidate) { + if c == nil { + return + } + publish(signalURL, self, remote, "candidate", c.Marshal()) + }); err != nil { + return nil, err + } + if err := agent.OnConnectionStateChange(func(s ice.ConnectionState) { + log.Printf("ice state: %s", s) + }); err != nil { + return nil, err + } + uf, pwd, err := agent.GetLocalUserCredentials() + if err != nil { + return nil, err + } + publish(signalURL, self, remote, "auth", uf+":"+pwd) + if err := agent.GatherCandidates(); err != nil { + return nil, err + } + + var rauth [2]string + select { + case rauth = <-sess.authCh: + case <-time.After(60 * time.Second): + return nil, fmt.Errorf("timeout waiting for remote auth") + } + + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel() + + if self < remote { + return agent.Dial(ctx, rauth[0], rauth[1]) + } + return agent.Accept(ctx, rauth[0], rauth[1]) +} + +type Message struct { + From string `json:"from"` + Type string `json:"type"` + Data string `json:"data"` +} + +func publish(signalURL, from, to, typ, data string) { + body, _ := json.Marshal(Message{From: from, Type: typ, Data: data}) + resp, err := http.Post(signalURL+"/publish?to="+url.QueryEscape(to), + "application/json", strings.NewReader(string(body))) + if err != nil { + log.Printf("publish: %v", err) + return + } + io.Copy(io.Discard, resp.Body) + resp.Body.Close() +} + +func pollLoop(self, signalURL string) { + for { + resp, err := http.Get(signalURL + "/poll?peer=" + url.QueryEscape(self)) + if err != nil { + time.Sleep(time.Second) + continue + } + var msgs []Message + json.NewDecoder(resp.Body).Decode(&msgs) + resp.Body.Close() + for _, m := range msgs { + sessionsMu.Lock() + sess := sessions[m.From] + sessionsMu.Unlock() + if sess == nil { + continue + } + switch m.Type { + case "auth": + p := strings.SplitN(m.Data, ":", 2) + if len(p) != 2 { + continue + } + select { + case <-sess.authCh: + default: + } + sess.authCh <- [2]string{p[0], p[1]} + case "candidate": + cand, err := ice.UnmarshalCandidate(m.Data) + if err != nil { + continue + } + sess.agent.AddRemoteCandidate(cand) + } + } + } +} + +func turnCreds(secret string, ttl time.Duration) (string, string) { + exp := time.Now().Add(ttl).Unix() + user := fmt.Sprintf("%d:smoke", exp) + h := hmac.New(sha1.New, []byte(secret)) + h.Write([]byte(user)) + return user, base64.StdEncoding.EncodeToString(h.Sum(nil)) +} diff --git a/consul-postgres-ha/signaling/Dockerfile b/consul-postgres-ha/signaling/Dockerfile new file mode 100644 index 0000000..d8e2b55 --- /dev/null +++ b/consul-postgres-ha/signaling/Dockerfile @@ -0,0 +1,11 @@ +FROM golang:1.22-alpine AS build +WORKDIR /src +COPY go.mod go.sum ./ +RUN go mod download +COPY *.go ./ +RUN CGO_ENABLED=0 go build -o /out/icetest . 
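
Aside on the credential scheme that appears three times in this diff — `turnCreds` in mesh-conn, the smoke-test copy above, and signaling's `makeTurnCreds` below: all derive coturn-style time-limited TURN credentials, with username `<unix-expiry>:<label>` and password = base64(HMAC-SHA1(secret, username)). A hedged sketch of the check the relay is assumed to perform (coturn's `use-auth-secret` mode), round-tripped against the same derivation:

```go
package main

import (
	"crypto/hmac"
	"crypto/sha1"
	"encoding/base64"
	"fmt"
	"strconv"
	"strings"
	"time"
)

// checkTurnCreds mirrors what the relay side is expected to do: parse the
// expiry out of the username, reject if past, recompute the HMAC and compare.
func checkTurnCreds(secret, username, password string) error {
	expStr, _, ok := strings.Cut(username, ":")
	if !ok {
		return fmt.Errorf("username %q is not <expiry>:<label>", username)
	}
	exp, err := strconv.ParseInt(expStr, 10, 64)
	if err != nil {
		return fmt.Errorf("bad expiry in %q: %w", username, err)
	}
	if time.Now().Unix() > exp {
		return fmt.Errorf("credential expired at %d", exp)
	}
	h := hmac.New(sha1.New, []byte(secret))
	h.Write([]byte(username))
	want := base64.StdEncoding.EncodeToString(h.Sum(nil))
	if !hmac.Equal([]byte(want), []byte(password)) {
		return fmt.Errorf("HMAC mismatch")
	}
	return nil
}

func main() {
	// Same derivation as turnCreds, with an illustrative secret.
	secret := "example-shared-secret"
	exp := time.Now().Add(time.Hour).Unix()
	user := fmt.Sprintf("%d:meshconn", exp)
	h := hmac.New(sha1.New, []byte(secret))
	h.Write([]byte(user))
	pass := base64.StdEncoding.EncodeToString(h.Sum(nil))

	if err := checkTurnCreds(secret, user, pass); err != nil {
		panic(err)
	}
	fmt.Println("credential accepted until", time.Unix(exp, 0).UTC())
}
```

The scheme keeps the shared secret off the wire and makes every credential self-expiring, which is why none of the peers ever need a provisioned TURN account.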
+ +FROM alpine:3.19 +RUN apk add --no-cache ca-certificates +COPY --from=build /out/icetest /usr/local/bin/icetest +ENTRYPOINT ["/usr/local/bin/icetest"] diff --git a/consul-postgres-ha/signaling/go.mod b/consul-postgres-ha/signaling/go.mod new file mode 100644 index 0000000..64c9fc8 --- /dev/null +++ b/consul-postgres-ha/signaling/go.mod @@ -0,0 +1,25 @@ +module github.com/Dstack-TEE/dstack-examples/consul-postgres-ha/signaling + +go 1.22 + +require ( + github.com/pion/ice/v2 v2.3.25 + github.com/pion/stun v0.6.1 +) + +require ( + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/google/uuid v1.3.1 // indirect + github.com/pion/dtls/v2 v2.2.7 // indirect + github.com/pion/logging v0.2.2 // indirect + github.com/pion/mdns v0.0.12 // indirect + github.com/pion/randutil v0.1.0 // indirect + github.com/pion/transport/v2 v2.2.2 // indirect + github.com/pion/turn/v2 v2.1.3 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/stretchr/testify v1.8.4 // indirect + golang.org/x/crypto v0.18.0 // indirect + golang.org/x/net v0.20.0 // indirect + golang.org/x/sys v0.16.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect +) diff --git a/consul-postgres-ha/signaling/go.sum b/consul-postgres-ha/signaling/go.sum new file mode 100644 index 0000000..04ed3d5 --- /dev/null +++ b/consul-postgres-ha/signaling/go.sum @@ -0,0 +1,102 @@ +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/google/uuid v1.3.1 h1:KjJaJ9iWZ3jOFZIf1Lqf4laDRCasjl0BCmnEGxkdLb4= +github.com/google/uuid v1.3.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= +github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/pion/dtls/v2 v2.2.7 h1:cSUBsETxepsCSFSxC3mc/aDo14qQLMSL+O6IjG28yV8= +github.com/pion/dtls/v2 v2.2.7/go.mod h1:8WiMkebSHFD0T+dIU+UeBaoV7kDhOW5oDCzZ7WZ/F9s= +github.com/pion/ice/v2 v2.3.25 h1:M5rJA07dqhi3nobJIg+uPtcVjFECTrhcR3n0ns8kDZs= +github.com/pion/ice/v2 v2.3.25/go.mod h1:KXJJcZK7E8WzrBEYnV4UtqEZsGeWfHxsNqhVcVvgjxw= +github.com/pion/logging v0.2.2 h1:M9+AIj/+pxNsDfAT64+MAVgJO0rsyLnoJKCqf//DoeY= +github.com/pion/logging v0.2.2/go.mod h1:k0/tDVsRCX2Mb2ZEmTqNa7CWsQPc+YYCB7Q+5pahoms= +github.com/pion/mdns v0.0.12 h1:CiMYlY+O0azojWDmxdNr7ADGrnZ+V6Ilfner+6mSVK8= +github.com/pion/mdns v0.0.12/go.mod h1:VExJjv8to/6Wqm1FXK+Ii/Z9tsVk/F5sD/N70cnYFbk= +github.com/pion/randutil v0.1.0 h1:CFG1UdESneORglEsnimhUjf33Rwjubwj6xfiOXBa3mA= +github.com/pion/randutil v0.1.0/go.mod h1:XcJrSMMbbMRhASFVOlj/5hQial/Y8oH/HVo7TBZq+j8= +github.com/pion/stun v0.6.1 h1:8lp6YejULeHBF8NmV8e2787BogQhduZugh5PdhDyyN4= +github.com/pion/stun v0.6.1/go.mod h1:/hO7APkX4hZKu/D0f2lHzNyvdkTGtIy3NDmLR7kSz/8= +github.com/pion/transport/v2 v2.2.1/go.mod h1:cXXWavvCnFF6McHTft3DWS9iic2Mftcz1Aq29pGcU5g= +github.com/pion/transport/v2 v2.2.2 h1:yv+EKSU2dpmInuCebQ1rsBFCYL7p+aV90xIlshSBO+A= +github.com/pion/transport/v2 v2.2.2/go.mod h1:OJg3ojoBJopjEeECq2yJdXH9YVrUJ1uQ++NjXLOUorc= +github.com/pion/transport/v3 v3.0.1 h1:gDTlPJwROfSfz6QfSi0ZmeCSkFcnWWiiR9ES0ouANiM= 
+github.com/pion/transport/v3 v3.0.1/go.mod h1:UY7kiITrlMv7/IKgd5eTUcaahZx5oUN3l9SzK5f5xE0= +github.com/pion/turn/v2 v2.1.3 h1:pYxTVWG2gpC97opdRc5IGsQ1lJ9O/IlNhkzj7MMrGAA= +github.com/pion/turn/v2 v2.1.3/go.mod h1:huEpByKKHix2/b9kmTAM3YoX6MKP+/D//0ClgUYR2fY= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.3/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.8.0/go.mod h1:mRqEX+O9/h5TFCrQhkgjo2yKi0yYA+9ecGkdQoHrywE= +golang.org/x/crypto v0.11.0/go.mod h1:xgJhtzW8F9jGdVFWZESrid1U1bjeNy4zgy5cRr/CIio= +golang.org/x/crypto v0.12.0/go.mod h1:NF0Gs7EO5K4qLn+Ylc+fih8BSTeIjAP05siRnAh98yw= +golang.org/x/crypto v0.18.0 h1:PGVlW0xEltQnzFZ55hkuX5+KLyrMYhHld1YHO4AKcdc= +golang.org/x/crypto v0.18.0/go.mod h1:R0j02AL6hcrfOiy9T4ZYp/rcWeMxM3L6QYxlOuEG1mg= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= +golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/net v0.13.0/go.mod h1:zEVYFnQC7m/vmpQFELhcD1EWkZlX69l4oqgmer6hfKA= +golang.org/x/net v0.14.0/go.mod h1:PpSgVXXLK0OxS0F31C1/tv6XNguvCrnXIDrFMspZIUI= +golang.org/x/net v0.20.0 h1:aCL9BSgETF1k+blQaYUBx9hJ9LOGP3gAVemcZlf1Kpo= +golang.org/x/net v0.20.0/go.mod h1:z8BVo6PvndSri0LbOE3hAn0apkU+1YvI6E70E9jsnvY= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod 
h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.9.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.16.0 h1:xWw16ngr6ZMtmxDyKyIgsE93KNKz5HKmMa3b8ALHidU= +golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= +golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= +golang.org/x/term v0.10.0/go.mod h1:lpqdcUyK/oCiQxvxVrppt5ggO2KCZ5QblwqPnfZ6d5o= +golang.org/x/term v0.11.0/go.mod h1:zC9APTIj3jG3FdV/Ons+XE1riIZXG4aZ4GTHiPZJPIU= +golang.org/x/term v0.16.0/go.mod h1:yn7UURbUtPyrVJPGPq404EukNFxcm/foM+bV/bfcDsY= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.11.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/text v0.12.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/consul-postgres-ha/signaling/main.go b/consul-postgres-ha/signaling/main.go new file mode 100644 index 0000000..9575e68 --- /dev/null +++ b/consul-postgres-ha/signaling/main.go @@ -0,0 +1,409 @@ +// 
Phase-0 ICE feasibility test. +// +// Single binary with two modes: +// - signaling: tiny HTTP broker that ferries ICE candidates + ufrag/pwd +// between two peers. Runs on the public coturn host. +// - peer: runs a pion/ice agent against coturn (STUN+TURN), exchanges +// candidates via signaling, establishes connectivity, sends echo +// packets, and prints which candidate pair won + RTT samples. +// +// The point: confirm whether a dstack CVM can hole-punch UDP to another +// dstack CVM (best case: srflx<->srflx), or whether ICE is forced onto +// the relay path (TURN) by dstack's network model. + +package main + +import ( + "context" + "crypto/hmac" + "crypto/sha1" + "encoding/base64" + "encoding/json" + "flag" + "fmt" + "io" + "log" + "net/http" + "net/url" + "os" + "strings" + "sync" + "time" + + "github.com/pion/ice/v2" + "github.com/pion/stun" +) + +func main() { + mode := flag.String("mode", "", "signaling | peer") + addr := flag.String("addr", ":7000", "signaling listen address") + flag.Parse() + + switch *mode { + case "signaling": + runSignaling(*addr) + case "peer": + runPeer() + default: + log.Fatalf("usage: %s -mode=signaling|peer", os.Args[0]) + } +} + +// ============================================================================= +// signaling: HTTP broker +// +// POST /publish?to= body=Message -> queue for recipient +// GET /poll?peer= -> long-poll, returns up to N +// messages, drains the queue +// ============================================================================= + +type Message struct { + From string `json:"from"` + Type string `json:"type"` // "auth" | "candidate" | "done" + Data string `json:"data"` +} + +type mailbox struct { + mu sync.Mutex + queues map[string][]Message + waiters map[string]chan struct{} +} + +func newMailbox() *mailbox { + return &mailbox{ + queues: make(map[string][]Message), + waiters: make(map[string]chan struct{}), + } +} + +func (m *mailbox) push(to string, msg Message) { + m.mu.Lock() + defer m.mu.Unlock() + // A new auth from a sender invalidates everything that sender + // previously published in this recipient's queue. Without this + // drop, the recipient would consume stale candidates from the + // sender's prior ICE attempt — pion's connectivity check would + // then dial against addresses whose UDP sockets are gone, ICE + // would Fail, both sides would retry, and the retry would + // consume another stale message from the same backlog. + // + // Because mesh-conn always publishes auth BEFORE its candidates + // (auth comes from agent.GetLocalUserCredentials, candidates + // from agent.OnCandidate after GatherCandidates), an auth + // arriving here marks the start of a fresh epoch from that + // sender, and any stale messages in queue can be safely dropped. 
+ if msg.Type == "auth" { + filtered := m.queues[to][:0] + for _, prev := range m.queues[to] { + if prev.From != msg.From { + filtered = append(filtered, prev) + } + } + m.queues[to] = filtered + } + m.queues[to] = append(m.queues[to], msg) + if w, ok := m.waiters[to]; ok { + close(w) + delete(m.waiters, to) + } +} + +func (m *mailbox) drain(peer string) []Message { + m.mu.Lock() + defer m.mu.Unlock() + q := m.queues[peer] + delete(m.queues, peer) + return q +} + +func (m *mailbox) wait(peer string) <-chan struct{} { + m.mu.Lock() + defer m.mu.Unlock() + if len(m.queues[peer]) > 0 { + c := make(chan struct{}) + close(c) + return c + } + if w, ok := m.waiters[peer]; ok { + return w + } + w := make(chan struct{}) + m.waiters[peer] = w + return w +} + +func runSignaling(addr string) { + mb := newMailbox() + + http.HandleFunc("/publish", func(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + http.Error(w, "POST only", http.StatusMethodNotAllowed) + return + } + to := r.URL.Query().Get("to") + if to == "" { + http.Error(w, "missing ?to=", http.StatusBadRequest) + return + } + var msg Message + if err := json.NewDecoder(r.Body).Decode(&msg); err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + mb.push(to, msg) + log.Printf("signaling: %s -> %s : %s", msg.From, to, msg.Type) + w.WriteHeader(http.StatusNoContent) + }) + + http.HandleFunc("/poll", func(w http.ResponseWriter, r *http.Request) { + peer := r.URL.Query().Get("peer") + if peer == "" { + http.Error(w, "missing ?peer=", http.StatusBadRequest) + return + } + select { + case <-mb.wait(peer): + case <-time.After(25 * time.Second): + case <-r.Context().Done(): + return + } + msgs := mb.drain(peer) + _ = json.NewEncoder(w).Encode(msgs) + }) + + log.Printf("signaling listening on %s", addr) + if err := http.ListenAndServe(addr, nil); err != nil { + log.Fatal(err) + } +} + +// ============================================================================= +// peer: pion/ice agent +// ============================================================================= + +func runPeer() { + peerID := mustEnv("PEER_ID") + partnerID := mustEnv("PARTNER_ID") + signalingURL := strings.TrimRight(mustEnv("SIGNALING_URL"), "/") + turnHost := os.Getenv("TURN_HOST") + turnSecret := os.Getenv("TURN_SHARED_SECRET") + + var urls []*stun.URI + if turnHost != "" { + if turnSecret == "" { + log.Fatalf("TURN_HOST set but TURN_SHARED_SECRET missing") + } + turnUser, turnPass := makeTurnCreds(turnSecret, 1*time.Hour) + urls = []*stun.URI{ + {Scheme: stun.SchemeTypeSTUN, Host: turnHost, Port: 3478, Proto: stun.ProtoTypeUDP}, + {Scheme: stun.SchemeTypeTURN, Host: turnHost, Port: 3478, Proto: stun.ProtoTypeUDP, + Username: turnUser, Password: turnPass}, + {Scheme: stun.SchemeTypeTURN, Host: turnHost, Port: 3478, Proto: stun.ProtoTypeTCP, + Username: turnUser, Password: turnPass}, + } + log.Printf("ICE: using STUN+TURN at %s", turnHost) + } else { + log.Printf("ICE: host-candidates only (no TURN_HOST set)") + } + + agent, err := ice.NewAgent(&ice.AgentConfig{ + Urls: urls, + NetworkTypes: []ice.NetworkType{ice.NetworkTypeUDP4, ice.NetworkTypeTCP4}, + CandidateTypes: []ice.CandidateType{ + ice.CandidateTypeHost, + ice.CandidateTypeServerReflexive, + ice.CandidateTypePeerReflexive, + ice.CandidateTypeRelay, + }, + }) + if err != nil { + log.Fatalf("ice.NewAgent: %v", err) + } + + // Send each locally-gathered candidate to the partner. 
+ if err := agent.OnCandidate(func(c ice.Candidate) { + if c == nil { + log.Printf("local: gathering complete") + return + } + log.Printf("local candidate: %s (%s)", c.String(), c.Type()) + publish(signalingURL, peerID, partnerID, "candidate", c.Marshal()) + }); err != nil { + log.Fatalf("OnCandidate: %v", err) + } + + if err := agent.OnConnectionStateChange(func(s ice.ConnectionState) { + log.Printf("ice state: %s", s) + }); err != nil { + log.Fatalf("OnConnectionStateChange: %v", err) + } + + localUfrag, localPwd, err := agent.GetLocalUserCredentials() + if err != nil { + log.Fatalf("GetLocalUserCredentials: %v", err) + } + + publish(signalingURL, peerID, partnerID, "auth", localUfrag+":"+localPwd) + + if err := agent.GatherCandidates(); err != nil { + log.Fatalf("GatherCandidates: %v", err) + } + + authCh := make(chan [2]string, 1) + go pollLoop(signalingURL, peerID, agent, authCh) + + var remote [2]string + select { + case remote = <-authCh: + case <-time.After(60 * time.Second): + log.Fatalf("timed out waiting for partner auth") + } + log.Printf("got remote auth from %s", partnerID) + + // No timeout — wait indefinitely for the partner. Each CVM may boot far + // out of sync with the other (image pull, KMS init, etc.). Container + // restart policy handles process-level failure. + ctx := context.Background() + + var conn *ice.Conn + // Lexicographically smaller peer-id is the controlling side (Dial). + if peerID < partnerID { + log.Printf("role: controlling (Dial)") + conn, err = agent.Dial(ctx, remote[0], remote[1]) + } else { + log.Printf("role: controlled (Accept)") + conn, err = agent.Accept(ctx, remote[0], remote[1]) + } + if err != nil { + log.Fatalf("ice connect: %v", err) + } + + pair, err := agent.GetSelectedCandidatePair() + if err != nil { + log.Fatalf("GetSelectedCandidatePair: %v", err) + } + log.Printf("==========================================================") + log.Printf("CONNECTED via %s <-> %s", pair.Local.Type(), pair.Remote.Type()) + log.Printf(" local : %s", pair.Local.String()) + log.Printf(" remote: %s", pair.Remote.String()) + log.Printf("==========================================================") + + if peerID < partnerID { + runEchoSender(conn) + } else { + runEchoResponder(conn) + } +} + +func runEchoSender(conn *ice.Conn) { + buf := make([]byte, 1500) + for i := 0; i < 20; i++ { + t := time.Now() + payload := fmt.Sprintf("ping-%d", i) + if _, err := conn.Write([]byte(payload)); err != nil { + log.Fatalf("write: %v", err) + } + conn.SetReadDeadline(time.Now().Add(5 * time.Second)) + n, err := conn.Read(buf) + if err != nil { + log.Printf("read err: %v", err) + break + } + log.Printf("rtt=%v reply=%q", time.Since(t), string(buf[:n])) + time.Sleep(200 * time.Millisecond) + } + log.Printf("done") +} + +func runEchoResponder(conn *ice.Conn) { + buf := make([]byte, 1500) + for { + n, err := conn.Read(buf) + if err != nil { + log.Printf("read err: %v", err) + return + } + if _, err := conn.Write(buf[:n]); err != nil { + log.Printf("write err: %v", err) + return + } + } +} + +// ============================================================================= +// helpers +// ============================================================================= + +func pollLoop(signalingURL, peerID string, agent *ice.Agent, authCh chan<- [2]string) { + authSent := false + for { + resp, err := http.Get(signalingURL + "/poll?peer=" + url.QueryEscape(peerID)) + if err != nil { + log.Printf("poll err: %v", err) + time.Sleep(time.Second) + continue + } + var msgs []Message + if 
err := json.NewDecoder(resp.Body).Decode(&msgs); err != nil { + log.Printf("poll decode: %v", err) + resp.Body.Close() + time.Sleep(time.Second) + continue + } + resp.Body.Close() + for _, m := range msgs { + switch m.Type { + case "auth": + if authSent { + continue + } + parts := strings.SplitN(m.Data, ":", 2) + if len(parts) != 2 { + log.Printf("bad auth: %q", m.Data) + continue + } + authCh <- [2]string{parts[0], parts[1]} + authSent = true + case "candidate": + cand, err := ice.UnmarshalCandidate(m.Data) + if err != nil { + log.Printf("bad candidate %q: %v", m.Data, err) + continue + } + log.Printf("remote candidate: %s (%s)", cand.String(), cand.Type()) + if err := agent.AddRemoteCandidate(cand); err != nil { + log.Printf("AddRemoteCandidate: %v", err) + } + } + } + } +} + +func publish(signalingURL, from, to, typ, data string) { + body, _ := json.Marshal(Message{From: from, Type: typ, Data: data}) + resp, err := http.Post(signalingURL+"/publish?to="+url.QueryEscape(to), + "application/json", strings.NewReader(string(body))) + if err != nil { + log.Printf("publish err: %v", err) + return + } + io.Copy(io.Discard, resp.Body) + resp.Body.Close() +} + +func makeTurnCreds(secret string, ttl time.Duration) (string, string) { + exp := time.Now().Add(ttl).Unix() + user := fmt.Sprintf("%d:phase0", exp) + h := hmac.New(sha1.New, []byte(secret)) + h.Write([]byte(user)) + pass := base64.StdEncoding.EncodeToString(h.Sum(nil)) + return user, pass +} + +func mustEnv(k string) string { + v := os.Getenv(k) + if v == "" { + log.Fatalf("missing env %s", k) + } + return v +} diff --git a/consul-postgres-ha/webdemo/Dockerfile b/consul-postgres-ha/webdemo/Dockerfile new file mode 100644 index 0000000..9ab9fc9 --- /dev/null +++ b/consul-postgres-ha/webdemo/Dockerfile @@ -0,0 +1,10 @@ +FROM golang:1.22-alpine AS build +WORKDIR /src +COPY go.mod ./ +COPY *.go ./ +RUN CGO_ENABLED=0 go build -o /out/webdemo . + +FROM alpine:3.19 +RUN apk add --no-cache ca-certificates jq +COPY --from=build /out/webdemo /usr/local/bin/webdemo +ENTRYPOINT ["/usr/local/bin/webdemo"] diff --git a/consul-postgres-ha/webdemo/go.mod b/consul-postgres-ha/webdemo/go.mod new file mode 100644 index 0000000..c86a2af --- /dev/null +++ b/consul-postgres-ha/webdemo/go.mod @@ -0,0 +1,3 @@ +module github.com/Dstack-TEE/dstack-examples/consul-postgres-ha/webdemo + +go 1.22 diff --git a/consul-postgres-ha/webdemo/main.go b/consul-postgres-ha/webdemo/main.go new file mode 100644 index 0000000..7d6a187 --- /dev/null +++ b/consul-postgres-ha/webdemo/main.go @@ -0,0 +1,165 @@ +// webdemo (stage 3b) — same as stage3a, but registered with a Connect +// sidecar so cross-peer traffic flows through Envoy + mTLS. +// +// Differences from stage 3a: +// - service registration body includes a `Connect.SidecarService` +// stanza that Consul uses to spin up a sidecar definition. The +// sidecar's public listener binds the per-peer "sidecar port" +// (env SIDECAR_PORT), and the sidecar exposes one upstream +// ("webdemo") on local port 19000. +// - /all calls the upstream port directly (127.0.0.1:19000/hello). +// Each request goes app -> local sidecar -> mTLS over the overlay +// -> remote sidecar -> remote webdemo. +// - to fan out across all peers we hit /all multiple times so +// Envoy's load-balancer rotates through the instances. 
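
One way to confirm that the registration described above actually landed is to ask the local agent for Connect-capable `webdemo` endpoints. A sketch only — the `/v1/health/connect/<service>` endpoint and the bare `host:port` form of `CONSUL_HTTP_ADDR` match how this demo talks to Consul elsewhere, and the decoded fields are a small subset of the full response:

```go
package main

import (
	"encoding/json"
	"fmt"
	"log"
	"net/http"
	"os"
)

// Subset of Consul's /v1/health/connect response that matters here.
type connectEntry struct {
	Service struct {
		ID      string
		Service string
		Port    int
	}
}

func main() {
	addr := os.Getenv("CONSUL_HTTP_ADDR") // host:port, same format webdemo receives
	resp, err := http.Get("http://" + addr + "/v1/health/connect/webdemo?passing=true")
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	var entries []connectEntry
	if err := json.NewDecoder(resp.Body).Decode(&entries); err != nil {
		log.Fatal(err)
	}
	for _, e := range entries {
		// Each entry is a sidecar proxy fronting one webdemo instance.
		fmt.Printf("%s (%s) port %d\n", e.Service.ID, e.Service.Service, e.Service.Port)
	}
}
```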
+package main + +import ( + "encoding/json" + "fmt" + "io" + "log" + "net/http" + "os" + "strings" + "sync" + "time" +) + +func main() { + name := mustEnv("PEER_ID") + port := mustEnv("WEBDEMO_PORT") + consulAddr := mustEnv("CONSUL_HTTP_ADDR") + sidecarPort := mustEnv("SIDECAR_PORT") + upstreamPort := envOr("UPSTREAM_PORT", "19000") + fanoutN := envOr("FANOUT_N", "8") + + go registerForever(consulAddr, name, port, sidecarPort) + + http.HandleFunc("/hello", func(w http.ResponseWriter, r *http.Request) { + fmt.Fprintf(w, "hello from %s\n", name) + }) + http.HandleFunc("/all", func(w http.ResponseWriter, r *http.Request) { + // Hit the local sidecar's upstream a few times; Envoy rotates + // across instances, so with enough samples we should reach all + // of them at least once. + var n int + fmt.Sscanf(fanoutN, "%d", &n) + results := fanOut(upstreamPort, n) + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(map[string]any{ + "from": name, + "samples": n, + "results": results, + }) + }) + + addr := "127.0.0.1:" + port + log.Printf("webdemo: peer=%s listening on %s, consul=%s, sidecar=%s, upstream=%s", + name, addr, consulAddr, sidecarPort, upstreamPort) + log.Fatal(http.ListenAndServe(addr, nil)) +} + +// ============================================================================= +// Connect-aware registration +// ============================================================================= + +func registerForever(consulAddr, name, port, sidecarPort string) { + body := fmt.Sprintf(`{ + "Name": "webdemo", + "ID": "webdemo-%s", + "Address": "127.0.0.1", + "Port": %s, + "Tags": ["peer=%s"], + "Check": { + "HTTP": "http://127.0.0.1:%s/hello", + "Interval": "10s", + "Timeout": "2s", + "DeregisterCriticalServiceAfter": "1m" + }, + "Connect": { + "SidecarService": { + "Port": %s, + "Proxy": { + "LocalServiceAddress": "127.0.0.1", + "LocalServicePort": %s, + "Upstreams": [ + { + "DestinationName": "webdemo", + "LocalBindAddress": "127.0.0.1", + "LocalBindPort": 19000 + } + ] + } + } + } + }`, name, port, name, port, sidecarPort, port) + + for { + req, _ := http.NewRequest("PUT", + "http://"+consulAddr+"/v1/agent/service/register", + strings.NewReader(body)) + req.Header.Set("Content-Type", "application/json") + resp, err := http.DefaultClient.Do(req) + if err == nil && resp.StatusCode < 300 { + resp.Body.Close() + log.Printf("registered with consul (peer=%s, port=%s, sidecarPort=%s)", + name, port, sidecarPort) + return + } + if resp != nil { + b, _ := io.ReadAll(resp.Body) + resp.Body.Close() + log.Printf("register failed (status=%d): %s", resp.StatusCode, b) + } else { + log.Printf("register err: %v", err) + } + time.Sleep(2 * time.Second) + } +} + +// ============================================================================= +// Fan-out via local sidecar upstream port +// ============================================================================= + +func fanOut(upstreamPort string, n int) map[string]int { + results := make(map[string]int) + var mu sync.Mutex + var wg sync.WaitGroup + for i := 0; i < n; i++ { + wg.Add(1) + go func() { + defer wg.Done() + client := &http.Client{Timeout: 3 * time.Second} + resp, err := client.Get("http://127.0.0.1:" + upstreamPort + "/hello") + body := "" + if err != nil { + body = "error: " + err.Error() + } else { + b, _ := io.ReadAll(resp.Body) + resp.Body.Close() + body = strings.TrimSpace(string(b)) + } + mu.Lock() + results[body]++ + mu.Unlock() + }() + } + wg.Wait() + return results +} + +func mustEnv(k string) string { + v 
:= os.Getenv(k) + if v == "" { + log.Fatalf("missing env %s", k) + } + return v +} + +func envOr(k, def string) string { + if v := os.Getenv(k); v != "" { + return v + } + return def +}
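
To close the loop on the stage-3b demo, a small driver one could run next to a webdemo instance: it calls `/all` and summarizes the fan-out results. The JSON field names match what the handler above writes; the `127.0.0.1:$WEBDEMO_PORT` address mirrors webdemo's own listen address and is otherwise an assumption:

```go
package main

import (
	"encoding/json"
	"fmt"
	"log"
	"net/http"
	"os"
)

func main() {
	port := os.Getenv("WEBDEMO_PORT") // same env var webdemo itself reads
	resp, err := http.Get("http://127.0.0.1:" + port + "/all")
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	var out struct {
		From    string         `json:"from"`
		Samples int            `json:"samples"`
		Results map[string]int `json:"results"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		log.Fatal(err)
	}

	fmt.Printf("%s sent %d samples and saw %d distinct replies:\n",
		out.From, out.Samples, len(out.Results))
	for reply, count := range out.Results {
		fmt.Printf("  %q x%d\n", reply, count)
	}
}
```

If every distinct `"hello from <peer>"` line shows up over a few runs, Envoy's load balancing is rotating across all registered instances and the Connect path is working end to end.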