diff --git a/.github/workflows/consul-postgres-ha-publish.yml b/.github/workflows/consul-postgres-ha-publish.yml index b633cef..0740512 100644 --- a/.github/workflows/consul-postgres-ha-publish.yml +++ b/.github/workflows/consul-postgres-ha-publish.yml @@ -1,42 +1,49 @@ name: Publish consul-postgres-ha images -# Builds and publishes the six container images the consul-postgres-ha -# example needs (mesh-conn, bootstrap-secrets, signaling, webdemo, -# sidecar, patroni). On push to main, images are -# tagged with the commit SHA *and* `latest`, pushed to GHCR, and -# attested with Sigstore-backed GitHub Build Provenance so consumers -# can verify "this image came from this commit of this repo" without -# us managing any keys. PRs build to verify but do not push or attest. +# Builds and publishes the four container images the consul-postgres-ha +# example needs (mesh-sidecar, patroni, webdemo, signaling). On push +# to main, images are tagged with the commit SHA *and* `latest`, +# pushed to GHCR, and attested with Sigstore-backed GitHub Build +# Provenance so consumers can verify "this image came from this +# commit of this repo" without us managing any keys. PRs build to +# verify but do not push or attest. # -# Why six images on one workflow: the example needs all of them in -# lockstep — bumping mesh-conn alone but leaving the rest stale leads -# to mixed-version clusters that are hard to reason about. One workflow -# means one set of tags moves together. +# Why one workflow for all four: the example needs them in lockstep — +# bumping one but leaving the rest stale leads to mixed-version +# clusters that are hard to reason about. One workflow means one set +# of tags moves together. +# +# `mesh-sidecar` is the consolidated platform-plumbing image (formerly +# four images: bootstrap-secrets, mesh-conn, the legacy keepalive, and +# the old envoy-only sidecar). 
Its build context is the parent +# consul-postgres-ha/ directory so its Dockerfile can pull the Go +# sources from sibling subdirs. The other three images build from +# their own subdirs. # # Verifying a published image (consumer side): # # gh attestation verify \ -# oci://ghcr.io/dstack-tee/dstack-examples/consul-postgres-ha-mesh-conn:latest \ +# oci://ghcr.io/dstack-tee/dstack-examples/consul-postgres-ha-mesh-sidecar:latest \ # --repo Dstack-TEE/dstack-examples on: push: branches: [main] paths: - - 'consul-postgres-ha/mesh-conn/**' - 'consul-postgres-ha/bootstrap-secrets/**' + - 'consul-postgres-ha/mesh-conn/**' + - 'consul-postgres-ha/mesh-sidecar/**' - 'consul-postgres-ha/patroni/**' - 'consul-postgres-ha/webdemo/**' - - 'consul-postgres-ha/sidecar/**' - 'consul-postgres-ha/signaling/**' - '.github/workflows/consul-postgres-ha-publish.yml' pull_request: paths: - - 'consul-postgres-ha/mesh-conn/**' - 'consul-postgres-ha/bootstrap-secrets/**' + - 'consul-postgres-ha/mesh-conn/**' + - 'consul-postgres-ha/mesh-sidecar/**' - 'consul-postgres-ha/patroni/**' - 'consul-postgres-ha/webdemo/**' - - 'consul-postgres-ha/sidecar/**' - 'consul-postgres-ha/signaling/**' - '.github/workflows/consul-postgres-ha-publish.yml' workflow_dispatch: @@ -59,18 +66,18 @@ jobs: fail-fast: false matrix: include: - - name: mesh-conn - context: consul-postgres-ha/mesh-conn - - name: bootstrap-secrets - context: consul-postgres-ha/bootstrap-secrets + # `mesh-sidecar` builds with the parent dir as context so + # its Dockerfile can pull bootstrap-secrets/ and mesh-conn/ + # Go sources from siblings. 
+ - name: mesh-sidecar + context: consul-postgres-ha + dockerfile: consul-postgres-ha/mesh-sidecar/Dockerfile - name: patroni context: consul-postgres-ha/patroni - - name: signaling - context: consul-postgres-ha/signaling - name: webdemo context: consul-postgres-ha/webdemo - - name: sidecar - context: consul-postgres-ha/sidecar + - name: signaling + context: consul-postgres-ha/signaling steps: - uses: actions/checkout@v4 @@ -90,7 +97,7 @@ jobs: id: meta uses: docker/metadata-action@v5 with: - # Image namespace lives one level under the repo so all six + # Image namespace lives one level under the repo so all four # images sit side-by-side: ghcr.io///consul-postgres-ha- images: ${{ env.REGISTRY }}/${{ github.repository }}/consul-postgres-ha-${{ matrix.name }} tags: | @@ -103,6 +110,10 @@ jobs: uses: docker/build-push-action@v6 with: context: ${{ matrix.context }} + # Most images use the default Dockerfile in the context. + # `mesh-sidecar` overrides this to point at + # mesh-sidecar/Dockerfile while keeping the parent context. + file: ${{ matrix.dockerfile || format('{0}/Dockerfile', matrix.context) }} platforms: linux/amd64 push: ${{ github.event_name != 'pull_request' }} tags: ${{ steps.meta.outputs.tags }} diff --git a/consul-postgres-ha/FAILOVER.md b/consul-postgres-ha/FAILOVER.md index 2de252f..73c15d4 100644 --- a/consul-postgres-ha/FAILOVER.md +++ b/consul-postgres-ha/FAILOVER.md @@ -31,7 +31,7 @@ PW=$(ssh ... root@${W1}-22.${GW} "cat /tmp/dstack-runtime/secrets/patroni-superu ```bash ssh ... root@${W1}-22.${GW} \ - "docker exec dstack-tester-1 sh -c 'curl -s http://127.0.0.1:18803/cluster' | jq" + "docker exec dstack-sidecar-1 sh -c 'curl -s http://127.0.0.1:18803/cluster' | jq" ssh ... root@${W1}-22.${GW} "PGPASSWORD='$PW' docker exec -e PGPASSWORD dstack-patroni-1 \ psql -h 127.0.0.1 -p 18703 -U postgres -d postgres \ @@ -86,19 +86,23 @@ consistent recovery state reached at 0/... started streaming WAL from primary at 0/... 
on timeline 16 ``` -## Measured timeline (run from 2026-05-03) +## Measured timeline (run from 2026-05-04, single-sidecar layout) ``` -T_kill 05:02:28.028 docker stop dstack-patroni-1 on worker-3 -T_new_leader 05:02:49.994 worker-4 promoted (timeline 15 → 16) +22s -T_first_write 05:02:52.313 INSERT succeeds on worker-4 +24s ← RTO -T_restart_W3 05:03:39.704 docker start dstack-patroni-1 -T_W3_rejoined 05:04:10.377 worker-3 streaming, lag=0 +31s +T_kill 17:31:26 docker stop dstack-patroni-1 on worker-5 (leader) +T_new_leader 17:31:57 worker-4 promoted (timeline 2 → 3) +31s +T_first_write 17:31:59 INSERT succeeds on worker-4 +33s ← RTO ``` -**RTO (Recovery Time Objective): ~24 seconds.** That's the wall time +**RTO (Recovery Time Objective): ~33 seconds.** That's the wall time from leader process death to first successful write on the new leader, -sitting comfortably inside the default Patroni `ttl=30`. +sitting at the edge of the default Patroni `ttl=30`. The 2026-05-03 +multi-container baseline was 24s on a different cluster — the +single-sidecar layout is within typical run-to-run variance for the +`ttl=30 + promote-overhead` window. Cheap rejoin was confirmed in a +prior round of this same run: a previously-killed leader (worker-3) +came back as a streaming replica on the new timeline with lag=0 +within ~60s of `docker start dstack-patroni-1`. ## Tunables for the RTO/availability tradeoff @@ -124,8 +128,9 @@ the leader at once: ssh ... root@${LEADER}-22.${GW} "docker stop -t 0 \$(docker ps -q)" ``` -This kills patroni, postgres, mesh-conn, consul, sidecar, webdemo, and -the keepalive — everything that produces signal for the rest of the +This kills patroni, postgres, webdemo, and the consolidated sidecar +(which itself runs bootstrap-secrets, mesh-conn, consul, and envoy +inside it) — everything that produces signal for the rest of the cluster. Bring the host back via: ```bash @@ -135,23 +140,29 @@ ssh ... 
root@${LEADER}-22.${GW} \ ``` `docker compose up -d` respects the dependency order -(bootstrap-secrets → mesh-conn → consul → patroni). +(sidecar's `service_healthy` gate fires once bootstrap-secrets has +written `/run/instance/info.json`, then patroni and webdemo start). -### Measured timeline (run from 2026-05-03) +### Measured timeline (run from 2026-05-04, single-sidecar layout) ``` -T_kill 07:26:42 docker stop -t 0 ALL 7 containers on worker-4 -T_new_leader 07:27:13 worker-3 promoted (timeline 16 → 17) +31s -T_first_write 07:27:15 INSERT succeeds on worker-3 +33s ← RTO -T_restart_W4 07:27:46 docker compose up -d on worker-4 -T_W4_rejoined 07:28:34 worker-4 streaming, lag=0 +48s after restart +T_kill 17:33:29 docker stop -t 0 ALL containers on worker-4 (leader) +T_new_leader 17:34:00 worker-3 promoted (timeline 3 → 4) +31s +T_first_write 17:34:02 INSERT succeeds on worker-3 +33s ← RTO +T_restart_W4 17:34:02 docker compose up -d on worker-4 ``` -**Hard-kill RTO ≈ 33 seconds**, ~9 seconds longer than the soft-kill -above. That extra cost is Consul gossip-failure detection: with -soft-kill only the Patroni leader-key TTL expires, while with hard-kill -the entire Consul agent is gone, so the surviving peers see *both* -signals. +**Hard-kill RTO ≈ 33 seconds**, identical to both the soft-kill above +and the 2026-05-03 multi-container baseline. Consul gossip-failure +detection (which sees worker-4's whole agent disappear, not just the +Patroni lock) lines up with the Patroni leader-key TTL on this run, +so neither signal extends the RTO. + +The post-restart rejoin path on dstack-worker pairs is occasionally +flaky (the documented `MESH_CONN_RELAY_ONLY=1` escape hatch in +`compose/worker.yaml` is exactly this case — flip it on if your +deployment hits a wedged ICE re-handshake). The mesh-conn binary +behavior is unchanged by the single-sidecar consolidation. 
### Things confirmed by the hard-kill that the soft-kill didn't exercise @@ -184,17 +195,16 @@ rm -rf /var/lib/docker/volumes/dstack_patroni-pgdata/_data/* docker start dstack-patroni-1 ``` -### Measured timeline (run from 2026-05-03) +### Measured timeline (run from 2026-05-04, single-sidecar layout) ``` -T_wipe 21:13:41 docker stop + rm -rf pgdata on worker-5 -T_restart 21:13:42 docker start -T_basebackup 21:13:47 "trying to bootstrap from leader 'worker-4'" -T_complete 21:13:54 "replica has been created using basebackup" +7s -T_streaming 21:13:58 service registered, streaming WAL +16s total +T_wipe 17:34:21 docker stop + rm -rf pgdata on worker-5 +T_restart 17:34:25 docker start +T_complete 17:34:43 "replica has been created using basebackup" +18s +T_streaming 17:35:43 streaming WAL on timeline 4, lag=0 +82s total ``` -5.2 MB pgdata transferred in ~7 seconds end-to-end. Note the dataset +A few-MB pgdata transferred in ~18 seconds end-to-end. The dataset is small enough that handshake/startup overhead dominates — for a realistic throughput number, see the soft-kill section's pg_basebackup trace at ~25 MB/s sustained on the QUIC path. diff --git a/consul-postgres-ha/PUBLISHING.md b/consul-postgres-ha/PUBLISHING.md index ff5f0a0..02dc96d 100644 --- a/consul-postgres-ha/PUBLISHING.md +++ b/consul-postgres-ha/PUBLISHING.md @@ -1,10 +1,15 @@ # Stage 4 — image publishing & verification -The stage-4 example needs six container images deployed in lockstep: -`mesh-conn`, `bootstrap-secrets`, `signaling`, `webdemo`, `sidecar`, -`patroni`. CI publishes them to GHCR with Sigstore-backed GitHub Build -Provenance; consumers pin by tag (or, better, by digest) and verify -provenance with `gh attestation verify`. +The stage-4 example needs four container images deployed in lockstep: +`mesh-sidecar`, `patroni`, `webdemo`, `signaling`. 
CI publishes them to +GHCR with Sigstore-backed GitHub Build Provenance; consumers pin by +tag (or, better, by digest) and verify provenance with +`gh attestation verify`. + +`mesh-sidecar` is the consolidated platform-plumbing image — a single +container that runs bootstrap-secrets, mesh-conn, consul, and (on +workers) envoy. It's the heaviest by a wide margin because it +inherits from envoyproxy/envoy and bundles three more binaries on top. This doc covers the three paths you'll actually use: @@ -15,10 +20,14 @@ This doc covers the three paths you'll actually use: ## 1. CI publish — the steady-state `.github/workflows/consul-postgres-ha-publish.yml` runs on push to `main` -when any of the six image build contexts (or the workflow itself) +when any of the four image build contexts (or the workflow itself) change, and on PRs touching the same paths. Each run: -- Builds all six images via a matrix job. +- Builds all four images via a matrix job. The `mesh-sidecar` build + uses `consul-postgres-ha/` as its docker context (instead of + `consul-postgres-ha/mesh-sidecar/`) so its Dockerfile can pull + `bootstrap-secrets/` and `mesh-conn/` Go sources from sibling + directories. - On `main`, pushes to `ghcr.io/dstack-tee/dstack-examples/consul-postgres-ha-` with two tags: the long-form commit SHA (`sha-<40-hex>`) and `latest`. - Generates a GitHub Build Provenance attestation per image via `actions/attest-build-provenance@v2`. The attestation is signed by @@ -34,12 +43,12 @@ change, and on PRs touching the same paths. 
Each run: ```bash # By tag (lower assurance — `latest` floats): gh attestation verify \ - oci://ghcr.io/dstack-tee/dstack-examples/consul-postgres-ha-mesh-conn:latest \ + oci://ghcr.io/dstack-tee/dstack-examples/consul-postgres-ha-mesh-sidecar:latest \ --repo Dstack-TEE/dstack-examples # By digest (preferred — pinned, won't drift): gh attestation verify \ - oci://ghcr.io/dstack-tee/dstack-examples/consul-postgres-ha-mesh-conn@sha256: \ + oci://ghcr.io/dstack-tee/dstack-examples/consul-postgres-ha-mesh-sidecar@sha256: \ --repo Dstack-TEE/dstack-examples ``` @@ -54,20 +63,23 @@ of `latest` doesn't silently swap your cluster's bits. ## 2. Manual one-off publish — dev iteration -When iterating fast on `mesh-conn` (or any other component) you don't -want to round-trip through CI for every byte. Two equivalent shortcuts: +When iterating fast on the mesh-sidecar (or any other component) you +don't want to round-trip through CI for every byte. Two equivalent +shortcuts. Note that `mesh-sidecar` builds from the +`consul-postgres-ha/` parent dir (it pulls Go sources from sibling +subdirs); the rest build from their own subdir. ### a) `ttl.sh` (24h-disposable, no auth) ```bash TS=$(date +%s) -TAG=ttl.sh/dstack-mesh-conn-${TS}:24h -docker build -t $TAG consul-postgres-ha/mesh-conn +TAG=ttl.sh/dstack-mesh-sidecar-${TS}:24h +docker build -t $TAG -f consul-postgres-ha/mesh-sidecar/Dockerfile consul-postgres-ha docker push $TAG ``` Then point the running cluster at it via `terraform.tfvars`'s -`mesh_conn_image = ...` (and `terraform apply`), or hot-patch the +`mesh_sidecar_image = ...` (and `terraform apply`), or hot-patch the running CVM (see §3). `ttl.sh` images expire 24h after push. 
### b) Personal GHCR namespace (persistent, requires PAT) @@ -76,8 +88,8 @@ If you want a longer-lived dev image without going through main: ```bash echo "$GITHUB_TOKEN" | docker login ghcr.io -u --password-stdin -TAG=ghcr.io//consul-postgres-ha-mesh-conn:dev-$(date +%s) -docker build -t $TAG consul-postgres-ha/mesh-conn +TAG=ghcr.io//consul-postgres-ha-mesh-sidecar:dev-$(date +%s) +docker build -t $TAG -f consul-postgres-ha/mesh-sidecar/Dockerfile consul-postgres-ha docker push $TAG ``` @@ -99,9 +111,9 @@ Phala-Network/terraform-provider-phala#8). ```bash GW=dstack-pha-prod5.phala.network APP_ID= -NEW=ttl.sh/dstack-mesh-conn-:24h +NEW=ttl.sh/dstack-mesh-sidecar-:24h OLD=$(ssh ... root@${APP_ID}-22.${GW} \ - "docker inspect dstack-mesh-conn-1 --format '{{.Config.Image}}'") + "docker inspect dstack-sidecar-1 --format '{{.Config.Image}}'") ssh ... root@${APP_ID}-22.${GW} " docker pull $NEW @@ -109,7 +121,7 @@ ssh ... root@${APP_ID}-22.${GW} " cd /tapp && docker compose \ --env-file /dstack/.host-shared/.decrypted-env \ -p dstack -f /tapp/docker-compose.yaml \ - up -d --force-recreate mesh-conn + up -d --force-recreate sidecar " ``` diff --git a/consul-postgres-ha/README.md b/consul-postgres-ha/README.md index 1c5e1fe..1a480cc 100644 --- a/consul-postgres-ha/README.md +++ b/consul-postgres-ha/README.md @@ -36,9 +36,10 @@ Prerequisites: - A Phala Cloud account with API credentials at `~/.phala-cloud/credentials.json`. - A Linux box with a public IP for the external coordinator (coturn + signaling). -- The six container images either already published to GHCR (via the - CI workflow on this repo's main branch) or pushed by you to a - registry of your choice. See [`PUBLISHING.md`](PUBLISHING.md). +- The four container images (`mesh-sidecar`, `patroni`, `webdemo`, + `signaling`) either already published to GHCR (via the CI workflow + on this repo's main branch) or pushed by you to a registry of your + choice. See [`PUBLISHING.md`](PUBLISHING.md). 
```bash cd consul-postgres-ha/cluster-example @@ -72,11 +73,11 @@ consul-postgres-ha/ ├── compose/ coordinator.yaml + worker.yaml templates ├── coordinator/ docker-compose for the external coordinator (coturn + signaling) │ -├── mesh-conn/ QUIC-over-pion/ICE overlay (~600 LoC Go) -├── bootstrap-secrets/ init container — TEE-derives per-CVM secrets +├── mesh-sidecar/ consolidated platform sidecar image (bootstrap-secrets + mesh-conn + consul + envoy) +├── bootstrap-secrets/ Go source — TEE-derives per-CVM secrets (built into sidecar) +├── mesh-conn/ Go source — QUIC-over-pion/ICE overlay (built into sidecar) ├── patroni/ Patroni + Postgres image ├── webdemo/ example workload sitting on the mesh -├── sidecar/ Envoy bootstrapper for Consul Connect mTLS ├── signaling/ HTTP /publish + /poll broker for ICE auth/candidate exchange └── quic-on-ice/ standalone smoke test for the QUIC-over-ICE transport ``` @@ -113,8 +114,6 @@ and the Terraform structure as-is. in parallel hits [`phala-cloud#247`](https://github.com/Phala-Network/phala-cloud/issues/247) — use `-parallelism=1` for now (~5 min × N to bring-up). -* Six container images per CVM is more platform plumbing than ideal. - A consolidation pass to a single sidecar container is planned. * The mesh-conn admission story is **shared-secret based today** (TURN HMAC), not attestation-based. Adding TEE attestation as the admission credential is the next architectural step. diff --git a/consul-postgres-ha/bootstrap-secrets/Dockerfile b/consul-postgres-ha/bootstrap-secrets/Dockerfile deleted file mode 100644 index 935ffbd..0000000 --- a/consul-postgres-ha/bootstrap-secrets/Dockerfile +++ /dev/null @@ -1,11 +0,0 @@ -FROM golang:1.25-alpine AS build -WORKDIR /src -COPY go.mod go.sum ./ -RUN go mod download -COPY *.go ./ -RUN CGO_ENABLED=0 go build -o /out/bootstrap-secrets . 
- -FROM alpine:3.19 -RUN apk add --no-cache ca-certificates -COPY --from=build /out/bootstrap-secrets /usr/local/bin/bootstrap-secrets -ENTRYPOINT ["/usr/local/bin/bootstrap-secrets"] diff --git a/consul-postgres-ha/cluster-example/cluster.tf b/consul-postgres-ha/cluster-example/cluster.tf index e9a7688..70d80cd 100644 --- a/consul-postgres-ha/cluster-example/cluster.tf +++ b/consul-postgres-ha/cluster-example/cluster.tf @@ -51,18 +51,24 @@ variable "gateway_domain" { description = "Phala dstack gateway domain (e.g. dstack-pha-prod5.phala.network)" } -variable "bootstrap_secrets_image" { type = string } -variable "mesh_conn_image" { type = string } -variable "signaling_image" { type = string } -variable "webdemo_image" { type = string } -variable "sidecar_image" { type = string } -variable "patroni_image" { type = string } - -# External coordinator (Vultr coturn + signaling box) used until -# Phala admin enables UDP ingress on dstack apps. coordinator's own -# coturn + signaling services in compose still run but are unused. +# Image references. Gap 2 collapsed bootstrap-secrets, mesh-conn, the +# legacy keepalive placeholder, and the old envoy-only sidecar into +# one `mesh_sidecar_image` (consul-postgres-ha-mesh-sidecar) — workers +# and coordinators both reference it and the entrypoint dispatches on +# ROLE. The `signaling` image is still published by CI (used by the +# external Vultr coordinator), but no dstack CVM in this cluster +# references it, so it isn't a Terraform input here. +variable "mesh_sidecar_image" { type = string } +variable "webdemo_image" { type = string } +variable "patroni_image" { type = string } + +# External coordinator (Vultr coturn + signaling box). Used until +# Phala admin enables UDP ingress on dstack apps; once that lands we +# can host coturn + signaling inside the dstack mesh and drop these +# external_* vars. The dstack-coordinator compose no longer carries +# unused local copies of those services. 
variable "external_coordinator_host" { type = string } -variable "external_signaling_url" { type = string } +variable "external_signaling_url" { type = string } variable "external_turn_secret" { type = string sensitive = true @@ -147,22 +153,20 @@ resource "phala_app" "coordinator" { region = "US-WEST-1" disk_size = 20 replicas = 1 - storage_fs = "zfs" # MUST pin (terraform-provider-phala#5) + storage_fs = "zfs" # MUST pin (terraform-provider-phala#5) docker_compose = file("${path.module}/../compose/coordinator.yaml") env = { - CLUSTER_NAME = var.cluster_name - PROTOCOL_BASES = local.protocol_bases_json - PEERS_JSON = local.peers_json - COORDINATOR_ORDINAL = tostring(each.value) - BOOTSTRAP_EXPECT = tostring(var.coordinator_replicas) - COORDINATOR_SERF_PORTS = local.coordinator_serf_ports - SIGNALING_URL = var.external_signaling_url - TURN_HOST = var.external_coordinator_host - TURN_SHARED_SECRET = var.external_turn_secret - BOOTSTRAP_SECRETS_IMAGE = var.bootstrap_secrets_image - MESH_CONN_IMAGE = var.mesh_conn_image - SIGNALING_IMAGE = var.signaling_image + CLUSTER_NAME = var.cluster_name + PROTOCOL_BASES = local.protocol_bases_json + PEERS_JSON = local.peers_json + COORDINATOR_ORDINAL = tostring(each.value) + BOOTSTRAP_EXPECT = tostring(var.coordinator_replicas) + COORDINATOR_SERF_PORTS = local.coordinator_serf_ports + SIGNALING_URL = var.external_signaling_url + TURN_HOST = var.external_coordinator_host + TURN_SHARED_SECRET = var.external_turn_secret + MESH_SIDECAR_IMAGE = var.mesh_sidecar_image } listed = false @@ -202,21 +206,19 @@ resource "phala_app" "worker" { docker_compose = file("${path.module}/../compose/worker.yaml") env = { - CLUSTER_NAME = var.cluster_name - PROTOCOL_BASES = local.protocol_bases_json - PEERS_JSON = local.peers_json - WORKER_ORDINAL = tostring(each.value) - EXPECTED_REPLICAS = var.worker_replicas + var.coordinator_replicas - COORDINATOR_SERF_PORTS = local.coordinator_serf_ports - COORDINATOR_HTTP_PORTS = local.coordinator_http_ports - 
SIGNALING_URL = var.external_signaling_url - TURN_HOST = var.external_coordinator_host - TURN_SHARED_SECRET = var.external_turn_secret - BOOTSTRAP_SECRETS_IMAGE = var.bootstrap_secrets_image - MESH_CONN_IMAGE = var.mesh_conn_image - WEBDEMO_IMAGE = var.webdemo_image - SIDECAR_IMAGE = var.sidecar_image - PATRONI_IMAGE = var.patroni_image + CLUSTER_NAME = var.cluster_name + PROTOCOL_BASES = local.protocol_bases_json + PEERS_JSON = local.peers_json + WORKER_ORDINAL = tostring(each.value) + EXPECTED_REPLICAS = var.worker_replicas + var.coordinator_replicas + COORDINATOR_SERF_PORTS = local.coordinator_serf_ports + COORDINATOR_HTTP_PORTS = local.coordinator_http_ports + SIGNALING_URL = var.external_signaling_url + TURN_HOST = var.external_coordinator_host + TURN_SHARED_SECRET = var.external_turn_secret + MESH_SIDECAR_IMAGE = var.mesh_sidecar_image + WEBDEMO_IMAGE = var.webdemo_image + PATRONI_IMAGE = var.patroni_image } listed = false @@ -230,7 +232,7 @@ resource "phala_app" "worker" { } output "coordinator_app_ids" { value = { for k, c in phala_app.coordinator : k => c.app_id } } -output "worker_app_ids" { value = { for k, w in phala_app.worker : k => w.app_id } } +output "worker_app_ids" { value = { for k, w in phala_app.worker : k => w.app_id } } output "consul_ui" { # Any coordinator's HTTP port serves the UI. Pick coord-0 by convention. 
value = "https://${phala_app.coordinator["0"].app_id}-${local.coordinator_http_port_first}s.${var.gateway_domain}/ui" diff --git a/consul-postgres-ha/cluster-example/terraform.tfvars.example b/consul-postgres-ha/cluster-example/terraform.tfvars.example index 916a2f8..f690c97 100644 --- a/consul-postgres-ha/cluster-example/terraform.tfvars.example +++ b/consul-postgres-ha/cluster-example/terraform.tfvars.example @@ -7,7 +7,7 @@ # came from this repo with: # # gh attestation verify \ -# oci://ghcr.io/dstack-tee/dstack-examples/consul-postgres-ha-mesh-conn:latest \ +# oci://ghcr.io/dstack-tee/dstack-examples/consul-postgres-ha-mesh-sidecar:latest \ # --repo Dstack-TEE/dstack-examples # # For dev iteration, replace any single line with a `ttl.sh/...:24h` @@ -18,12 +18,9 @@ coordinator_replicas = 3 worker_replicas = 3 gateway_domain = "dstack-pha-prod5.phala.network" -bootstrap_secrets_image = "ghcr.io/dstack-tee/dstack-examples/consul-postgres-ha-bootstrap-secrets:latest" -mesh_conn_image = "ghcr.io/dstack-tee/dstack-examples/consul-postgres-ha-mesh-conn:latest" -signaling_image = "ghcr.io/dstack-tee/dstack-examples/consul-postgres-ha-signaling:latest" -webdemo_image = "ghcr.io/dstack-tee/dstack-examples/consul-postgres-ha-webdemo:latest" -sidecar_image = "ghcr.io/dstack-tee/dstack-examples/consul-postgres-ha-sidecar:latest" -patroni_image = "ghcr.io/dstack-tee/dstack-examples/consul-postgres-ha-patroni:latest" +mesh_sidecar_image = "ghcr.io/dstack-tee/dstack-examples/consul-postgres-ha-mesh-sidecar:latest" +webdemo_image = "ghcr.io/dstack-tee/dstack-examples/consul-postgres-ha-webdemo:latest" +patroni_image = "ghcr.io/dstack-tee/dstack-examples/consul-postgres-ha-patroni:latest" # external_coordinator_host = "" # external_signaling_url = "http://:7000" diff --git a/consul-postgres-ha/compose/coordinator.yaml b/consul-postgres-ha/compose/coordinator.yaml index 33bcad8..b3fc3e0 100644 --- a/consul-postgres-ha/compose/coordinator.yaml +++ 
b/consul-postgres-ha/compose/coordinator.yaml @@ -1,46 +1,47 @@ -# Stage 4 — coordinator compose template. +# Coordinator compose template — Gap 2 single-sidecar layout. # -# A "coordinator" CVM bundles: -# - bootstrap-secrets (init container, runs once, populates tmpfs) -# - mesh-conn (UDP+TCP port-forwarder over ICE+yamux) -# - consul agent (server, -bootstrap-expect=1 initially) -# - signaling (HTTP broker for ICE candidate exchange) -# - coturn (STUN+TURN, requires Phala UDP ingress) +# A coordinator CVM runs exactly one container: the consolidated +# `sidecar` image with ROLE=coordinator. Inside it: # -# Identity flow: bootstrap-secrets reads dstack SDK Info(), -# derives cluster-wide secrets via getKey(), writes to /run/secrets -# tmpfs. Sibling services depend on it via service_completed_successfully. +# bootstrap-secrets one-shot init — derives per-CVM secrets from +# the dstack KMS, claims COORDINATOR_ORDINAL, +# writes /run/instance/info.json. +# mesh-conn QUIC-on-pion/ICE overlay (same as on workers). +# consul Server agent (`-server -bootstrap-expect=N -ui`), +# joins peer coordinators via mesh-conn-forwarded +# loopback ports. envoy is NOT started here +# (coordinators don't host a Connect-mTLS workload). +# +# coturn + signaling that earlier coordinator templates carried have +# been removed: the cluster uses an external (Vultr) coordinator box +# for both — see consul-postgres-ha/coordinator/docker-compose.yaml — +# configured into each peer's mesh-conn via SIGNALING_URL / TURN_HOST / +# TURN_SHARED_SECRET. The dstack-coordinator's local copies were never +# reachable from outside (Phala dstack apps don't have UDP ingress +# yet), so they were dead code burning CPU. When/if UDP ingress lands, +# re-adding them is one small PR. # # Per-CVM secrets policy: nothing on the persisted disk holds secret # material. /run/secrets/* is tmpfs (gone on reboot, re-derived on -# next boot). 
/consul/data IS persisted but only contains catalog, -# KV, and Raft state (not the gossip key). +# next boot from getKey()). /consul/data IS persisted but only +# contains catalog, KV, and Raft state — no gossip key material. services: - bootstrap-secrets: - image: ${BOOTSTRAP_SECRETS_IMAGE} + sidecar: + image: ${MESH_SIDECAR_IMAGE} network_mode: host - restart: "no" + restart: on-failure environment: - - CLUSTER_NAME=${CLUSTER_NAME} - ROLE=coordinator + - CLUSTER_NAME=${CLUSTER_NAME} - PROTOCOL_BASES=${PROTOCOL_BASES} + - PEERS_JSON=${PEERS_JSON} # COORDINATOR_ORDINAL is per-CVM (0..N-1); makes bootstrap-secrets # write the right /run/instance/info.json without needing Consul - # KV (which is itself on the coordinators — chicken-and-egg). + # KV (which itself runs on the coordinators — chicken-and-egg). - COORDINATOR_ORDINAL=${COORDINATOR_ORDINAL} - volumes: - - /var/run/dstack.sock:/var/run/dstack.sock:ro - # Bind mounts (NOT named volumes) — see compose footer note - - /tmp/dstack-runtime/secrets:/run/secrets - - /tmp/dstack-runtime/instance:/run/instance - - mesh-conn: - image: ${MESH_CONN_IMAGE} - network_mode: host - restart: on-failure - environment: - - PEERS_JSON=${PEERS_JSON} + - BOOTSTRAP_EXPECT=${BOOTSTRAP_EXPECT} + - COORDINATOR_SERF_PORTS=${COORDINATOR_SERF_PORTS} # External coordinator path — coordinator's mesh-conn uses the # same Vultr coturn + signaling that workers do, so peer-pair # ICE rendezvous happens in a single shared place. @@ -50,104 +51,24 @@ services: # See worker.yaml for the rationale on MESH_CONN_RELAY_ONLY. 
- MESH_CONN_RELAY_ONLY=${MESH_CONN_RELAY_ONLY:-} volumes: - - /tmp/dstack-runtime/secrets:/run/secrets:ro - - /tmp/dstack-runtime/instance:/run/instance:ro - depends_on: - bootstrap-secrets: - condition: service_completed_successfully - - signaling: - image: ${SIGNALING_IMAGE} - network_mode: host - restart: unless-stopped - command: ["-mode=signaling", "-addr=:7000"] - depends_on: - bootstrap-secrets: - condition: service_completed_successfully - - coturn: - image: coturn/coturn:4.6 - network_mode: host - restart: unless-stopped - entrypoint: ["/bin/sh", "-c"] - command: - - | - set -e - TURN_SECRET=$$(cat /run/secrets/turn) - exec turnserver -n \ - --realm=${CLUSTER_NAME} \ - --listening-port=3478 \ - --tls-listening-port=5349 \ - --min-port=49152 --max-port=49999 \ - --use-auth-secret \ - --static-auth-secret=$$TURN_SECRET \ - --no-cli --no-multicast-peers \ - --log-file=stdout --simple-log --fingerprint - volumes: - - /tmp/dstack-runtime/secrets:/run/secrets:ro - depends_on: - bootstrap-secrets: - condition: service_completed_successfully - - consul: - image: hashicorp/consul:1.19 - network_mode: host - restart: unless-stopped - entrypoint: ["/bin/sh", "-c"] - command: - - | - set -e - # Ports come from /run/instance/info.json (computed by - # bootstrap-secrets from PROTOCOL_BASES + ordinal). - SERF=$$(jq -r '.ports.serf_lan' /run/instance/info.json) - RPC=$$(jq -r '.ports.server_rpc' /run/instance/info.json) - HTTP=$$(jq -r '.ports.http_api' /run/instance/info.json) - GRPC=$$(jq -r '.ports.grpc' /run/instance/info.json) - # Build -retry-join args for every OTHER coordinator (skip self). - # COORDINATOR_SERF_PORTS is comma-separated, e.g. "18000,18001,18002". 
- RETRYJOIN="" - for p in $$(echo "${COORDINATOR_SERF_PORTS}" | tr ',' ' '); do - if [ "$$p" != "$$SERF" ]; then - RETRYJOIN="$$RETRYJOIN -retry-join=127.0.0.1:$$p" - fi - done - exec consul agent \ - -server -bootstrap-expect=${BOOTSTRAP_EXPECT} -ui \ - -node=$$(jq -r '.role + "-" + (.ordinal|tostring)' /run/instance/info.json) \ - -datacenter=${CLUSTER_NAME} \ - -bind=127.0.0.1 -advertise=127.0.0.1 -client=127.0.0.1 \ - -serf-lan-port=$$SERF \ - -server-port=$$RPC \ - -http-port=$$HTTP \ - -grpc-port=$$GRPC \ - -dns-port=-1 \ - $$RETRYJOIN \ - -data-dir=/consul/data \ - -hcl='connect { enabled = true }' \ - -log-level=INFO - volumes: + - /var/run/dstack.sock:/var/run/dstack.sock:ro + - /tmp/dstack-runtime/secrets:/run/secrets + - /tmp/dstack-runtime/instance:/run/instance - consul-data:/consul/data - - /tmp/dstack-runtime/secrets:/run/secrets:ro - - /tmp/dstack-runtime/instance:/run/instance:ro - depends_on: - bootstrap-secrets: - condition: service_completed_successfully - - tester: - image: nicolaka/netshoot:latest - network_mode: host - command: ["sleep", "infinity"] - depends_on: [consul, mesh-conn] + healthcheck: + test: ["CMD-SHELL", "test -s /run/instance/info.json"] + interval: 2s + timeout: 1s + retries: 60 + start_period: 5s volumes: consul-data: -# NOTE: shared state goes through HOST BIND MOUNTS, not named docker -# volumes. On the dstack platform we run on, named volumes don't share -# data across containers — the second container always sees an empty -# volume even when the first wrote to it (filed as a phala-cloud -# issue). Bind mounts to the CVM's /tmp work fine. -# -# /tmp is ephemeral inside the TEE; secrets are re-derived -# deterministically from getKey() on every boot, so the on-disk copy -# is effectively a per-boot cache. 
+# Shared state goes through HOST BIND MOUNTS, not named docker volumes +# — on the dstack platform we run on, named volumes don't share data +# across containers (the second container always sees an empty volume +# even after the first wrote to it; filed as a phala-cloud issue). +# Bind mounts to the CVM's /tmp work fine. /tmp is ephemeral inside +# the TEE; secrets are re-derived deterministically from getKey() on +# every boot, so the on-disk copy is effectively a per-boot cache. diff --git a/consul-postgres-ha/compose/worker-debug.yaml b/consul-postgres-ha/compose/worker-debug.yaml deleted file mode 100644 index 9a49020..0000000 --- a/consul-postgres-ha/compose/worker-debug.yaml +++ /dev/null @@ -1,30 +0,0 @@ -# Debug-only worker compose. Strips everything except bootstrap-secrets -# + a sleeper. If this won't boot, the platform layer (dstack-sock, -# image pull, env passing) is at fault. If it boots, layer back the -# real services one at a time. - -services: - bootstrap-secrets: - image: ${BOOTSTRAP_SECRETS_IMAGE} - network_mode: host - restart: "no" - environment: - - CLUSTER_NAME=${CLUSTER_NAME} - - ROLE=worker - - PROTOCOL_BASES=${PROTOCOL_BASES} - - WORKER_ORDINAL=${WORKER_ORDINAL} - - EXPECTED_REPLICAS=${EXPECTED_REPLICAS} - - CONSUL_HTTP_ADDR=127.0.0.1:${COORDINATOR_HTTP_PORT} - volumes: - - /var/run/dstack.sock:/var/run/dstack.sock:ro - - /tmp/dstack-runtime/secrets:/run/secrets - - /tmp/dstack-runtime/instance:/run/instance - - sleeper: - image: alpine:3.19 - network_mode: host - restart: unless-stopped - command: ["sh", "-c", "echo SLEEPING; sleep infinity"] - depends_on: - bootstrap-secrets: - condition: service_completed_successfully diff --git a/consul-postgres-ha/compose/worker.yaml b/consul-postgres-ha/compose/worker.yaml index f0915c7..2a0e345 100644 --- a/consul-postgres-ha/compose/worker.yaml +++ b/consul-postgres-ha/compose/worker.yaml @@ -1,109 +1,74 @@ -# Stage 4 — worker compose template. 
+# Worker compose template — Gap 2 single-sidecar layout. # -# A "worker" CVM bundles: -# - bootstrap-secrets (init container) -# - mesh-conn (UDP+TCP port-forwarder) -# - consul agent (client, joins via mesh-conn → coordinator) -# - webdemo (the actual workload — example service) -# - sidecar (Envoy, for Consul Connect mTLS) -# - tester (netshoot) +# A "worker" CVM ends up running exactly three containers: +# +# sidecar bundled platform plumbing — bootstrap-secrets, +# mesh-conn, consul (client), and envoy (Connect data +# plane). See consul-postgres-ha-sidecar's README. +# patroni the workload — Postgres + Patroni leader/replica. +# webdemo tiny example app sitting on the mesh; swap for your +# own service when adapting this template. +# +# Coordinator CVMs run their own compose (compose/coordinator.yaml) +# without patroni/webdemo, since they only host the Consul server +# quorum. services: - # Keepalive holds the CVM up regardless of bootstrap-secrets / consul / - # mesh-conn / sidecar success. Without it, dstack tears the CVM down - # the moment any service in the dependency tree fails — and we lose - # the ability to SSH in and diagnose. Removable once the stack is - # known-good. 
- keepalive: - image: alpine:3.19 - network_mode: host - restart: unless-stopped - command: ["sh", "-c", "echo KEEPALIVE; sleep infinity"] - - bootstrap-secrets: - image: ${BOOTSTRAP_SECRETS_IMAGE} + sidecar: + image: ${MESH_SIDECAR_IMAGE} network_mode: host - restart: "no" + restart: on-failure environment: - - CLUSTER_NAME=${CLUSTER_NAME} - ROLE=worker + - CLUSTER_NAME=${CLUSTER_NAME} - PROTOCOL_BASES=${PROTOCOL_BASES} - WORKER_ORDINAL=${WORKER_ORDINAL} - EXPECTED_REPLICAS=${EXPECTED_REPLICAS} + - PEERS_JSON=${PEERS_JSON} + - SIGNALING_URL=${SIGNALING_URL} + - TURN_HOST=${TURN_HOST} + - TURN_SHARED_SECRET=${TURN_SHARED_SECRET} + - COORDINATOR_SERF_PORTS=${COORDINATOR_SERF_PORTS} # COORDINATOR_HTTP_PORTS is comma-separated; bootstrap-secrets - # picks the first reachable one to talk to Consul KV (only used - # if WORKER_ORDINAL is unset, which it isn't with the per-worker - # phala_app pattern — kept for the legacy CAS-claim fallback). + # picks the first reachable one to talk to Consul KV — only used + # if WORKER_ORDINAL is unset (legacy CAS-claim fallback). - COORDINATOR_HTTP_PORTS=${COORDINATOR_HTTP_PORTS} + # MESH_CONN_RELAY_ONLY=1 forces ICE to gather only Relay + # candidates, routing all peer traffic through the coturn server. + # Default off because direct candidates work; flip on if a + # deployment hits worker↔worker direct-pair instability. + - MESH_CONN_RELAY_ONLY=${MESH_CONN_RELAY_ONLY:-} volumes: - /var/run/dstack.sock:/var/run/dstack.sock:ro - /tmp/dstack-runtime/secrets:/run/secrets - /tmp/dstack-runtime/instance:/run/instance + - consul-data:/consul/data + # Healthy = bootstrap-secrets has finished and info.json is in + # place. patroni/webdemo gate on this so they don't FATAL-restart + # in a loop while the sidecar is still booting. 
+ healthcheck: + test: ["CMD-SHELL", "test -s /run/instance/info.json"] + interval: 2s + timeout: 1s + retries: 60 + start_period: 5s - mesh-conn: - image: ${MESH_CONN_IMAGE} + patroni: + image: ${PATRONI_IMAGE} network_mode: host restart: on-failure + # CLUSTER_NAME drives Patroni's `scope` — every peer's patroni + # must use the same value to land in the same cluster. The rest + # is read from /run/instance/info.json by entrypoint.sh. environment: - - PEERS_JSON=${PEERS_JSON} - - SIGNALING_URL=${SIGNALING_URL} - - TURN_HOST=${TURN_HOST} - - TURN_SHARED_SECRET=${TURN_SHARED_SECRET} - # MESH_CONN_RELAY_ONLY=1 forces ICE to gather only Relay candidates, - # routing all peer traffic through the coturn server. Default off - # because coordinator-to-worker direct candidates do work; flip on - # if you hit a deployment where worker-to-worker direct pairs fail - # (host/srflx/prflx never establish, only relay does), trading some - # latency through coturn for guaranteed reachability. - - MESH_CONN_RELAY_ONLY=${MESH_CONN_RELAY_ONLY:-} + - CLUSTER_NAME=${CLUSTER_NAME} volumes: - - /tmp/dstack-runtime/secrets:/run/secrets:ro - /tmp/dstack-runtime/instance:/run/instance:ro - depends_on: - bootstrap-secrets: - condition: service_completed_successfully - - consul: - image: hashicorp/consul:1.19 - network_mode: host - restart: unless-stopped - entrypoint: ["/bin/sh", "-c"] - command: - - | - set -e - SERF=$$(jq -r '.ports.serf_lan' /run/instance/info.json) - RPC=$$(jq -r '.ports.server_rpc' /run/instance/info.json) - HTTP=$$(jq -r '.ports.http_api' /run/instance/info.json) - GRPC=$$(jq -r '.ports.grpc' /run/instance/info.json) - # Workers retry-join EVERY coordinator's serf port (mesh-conn - # forwards each one to its actual coordinator). Consul's serf - # gossip then connects via whichever join succeeds first. 
- RETRYJOIN="" - for p in $$(echo "${COORDINATOR_SERF_PORTS}" | tr ',' ' '); do - RETRYJOIN="$$RETRYJOIN -retry-join=127.0.0.1:$$p" - done - exec consul agent \ - -node=$$(jq -r '.role + "-" + (.ordinal|tostring)' /run/instance/info.json) \ - -datacenter=${CLUSTER_NAME} \ - -bind=127.0.0.1 -advertise=127.0.0.1 -client=127.0.0.1 \ - -serf-lan-port=$$SERF \ - -server-port=$$RPC \ - -http-port=$$HTTP \ - -grpc-port=$$GRPC \ - -dns-port=-1 \ - $$RETRYJOIN \ - -data-dir=/consul/data \ - -hcl='connect { enabled = true }' \ - -log-level=INFO - volumes: - - consul-data:/consul/data - /tmp/dstack-runtime/secrets:/run/secrets:ro - - /tmp/dstack-runtime/instance:/run/instance:ro + - patroni-pgdata:/var/lib/patroni depends_on: - bootstrap-secrets: - condition: service_completed_successfully - mesh-conn: - condition: service_started + sidecar: + condition: service_healthy webdemo: image: ${WEBDEMO_IMAGE} @@ -120,64 +85,15 @@ services: exec webdemo volumes: - /tmp/dstack-runtime/instance:/run/instance:ro - depends_on: [consul] - - sidecar: - image: ${SIDECAR_IMAGE} - network_mode: host - restart: on-failure - command: - - | - set -e - PEER_ID=$$(jq -r '.role + "-" + (.ordinal|tostring)' /run/instance/info.json) - HTTP_PORT=$$(jq -r '.ports.http_api' /run/instance/info.json) - GRPC_PORT=$$(jq -r '.ports.grpc' /run/instance/info.json) - ORDINAL=$$(jq -r '.ordinal' /run/instance/info.json) - ADMIN_PORT=$$((19100 + ORDINAL)) - until consul connect envoy \ - -sidecar-for=webdemo-$$PEER_ID \ - -admin-bind=127.0.0.1:$$ADMIN_PORT \ - -bootstrap \ - -http-addr=127.0.0.1:$$HTTP_PORT \ - -grpc-addr=127.0.0.1:$$GRPC_PORT \ - > /tmp/envoy-bootstrap.json 2>/dev/null; do - echo "waiting for sidecar registration..."; sleep 3 - done - exec envoy -c /tmp/envoy-bootstrap.json -l info - volumes: - - /tmp/dstack-runtime/instance:/run/instance:ro - depends_on: [webdemo] - - patroni: - image: ${PATRONI_IMAGE} - network_mode: host - restart: on-failure - # CLUSTER_NAME drives Patroni's `scope` — every 
peer's patroni - # must use the same value to land in the same cluster. The rest - # is read from /run/instance/info.json by entrypoint.sh. - environment: - - CLUSTER_NAME=${CLUSTER_NAME} - volumes: - - /tmp/dstack-runtime/instance:/run/instance:ro - - /tmp/dstack-runtime/secrets:/run/secrets:ro - - patroni-pgdata:/var/lib/patroni depends_on: - consul: - condition: service_started - bootstrap-secrets: - condition: service_completed_successfully - - tester: - image: nicolaka/netshoot:latest - network_mode: host - command: ["sleep", "infinity"] - depends_on: [mesh-conn, consul, webdemo, patroni] + sidecar: + condition: service_healthy volumes: consul-data: patroni-pgdata: # Shared state goes through HOST BIND MOUNTS, not named docker -# volumes. See coordinator.yaml for the full note — short version: +# volumes — see coordinator.yaml for the full note. Short version: # named docker volumes don't share data across containers on the # dstack platform we run on. Bind mounts to /tmp work fine. diff --git a/consul-postgres-ha/design/README.md b/consul-postgres-ha/design/README.md index b1c0115..f258560 100644 --- a/consul-postgres-ha/design/README.md +++ b/consul-postgres-ha/design/README.md @@ -12,7 +12,6 @@ in their face. | Doc | What | |---|---| -| [`single-sidecar.md`](single-sidecar.md) | Collapse the 5 platform-plumbing containers (`keepalive`, `bootstrap-secrets`, `mesh-conn`, `consul`, `sidecar`/Envoy) into one image with a small shell-init multi-process supervisor. Per-CVM container count: 8 → 3. | | [`attestation-admission.md`](attestation-admission.md) | Use dstack TEE attestation as the mesh-conn admission credential, replacing/augmenting the shared TURN HMAC. Phased plan: per-app-id first, Consul-KV-rooted policy later. 
| Each doc includes: diff --git a/consul-postgres-ha/design/single-sidecar.md b/consul-postgres-ha/design/single-sidecar.md deleted file mode 100644 index e3f13a4..0000000 --- a/consul-postgres-ha/design/single-sidecar.md +++ /dev/null @@ -1,230 +0,0 @@ -# Design: collapse platform plumbing to a single sidecar container - -**Status**: not started. Standalone deliverable, branch off -`dstack-consul-ha-db`, PR back into it. - -## Why - -A user adapting this example for their own workload sees **eight -containers** in `compose/worker.yaml`: `keepalive`, `bootstrap-secrets`, -`mesh-conn`, `consul`, `patroni`, `webdemo`, `sidecar` (Envoy), and -`tester`. Five of those are platform plumbing. That's a lot to think -about for someone whose only goal is "run my Postgres / Redis / -Kafka on a dstack-TEE mesh". - -Target: collapse the platform plumbing into **one container** so the -user sees their own workload + one opaque "dstack mesh sidecar". - -## Scope - -**In:** `keepalive`, `bootstrap-secrets`, `mesh-conn`, `consul`, -`sidecar` (Envoy bootstrapper). - -**Out:** -- `patroni` — the workload, stays separate. -- `webdemo` — example app sitting on the mesh, stays separate (and - is what users *swap out* for their own service). -- `tester` (`netshoot`) — debugging-only, stays separate, optional. -- `signaling` — runs on the *external coordinator*, not on the worker - CVMs. Untouched. - -Net effect: per-worker CVM goes from 8 → 3 containers (sidecar + -patroni + webdemo) plus an optional debug tester. - -## Approach - -Single image, multiple processes, simple init script as PID 1. **Not** -a process-per-PID-1 supervisor like s6-overlay — that's overkill for -phase 1. We can graduate to s6 later if we hit limits (per-process -restart, log multiplexing, complex dep ordering beyond what shell -gives us). 
- -### Why a shell init is enough for now - -The current `compose/worker.yaml` ordering is: - -``` -bootstrap-secrets ──completed──► mesh-conn ──started──► consul ──started──► patroni - │ - └──► webdemo ──started──► sidecar -``` - -Two real ordering constraints: -1. `bootstrap-secrets` must finish (writes `/run/secrets/*` and - `/run/instance/info.json`) before *anything* else starts. -2. `mesh-conn` must be up before `consul` — Consul's serf gossip - needs the localhost-forwarded ports. - -Sidecar Envoy bootstrapping needs Consul up; this is currently -encoded as a polling `until consul connect envoy ...; do sleep 3; done` -in `sidecar/`'s entrypoint, and that pattern carries over. - -Everything else is "start in parallel, stay alive, fail loudly". A -~30-line shell script of `wait_for /run/instance/info.json` + `&` + -`wait` covers it. - -### Concrete shape - -New image, replacing the existing four (`bootstrap-secrets`, -`mesh-conn`, the keepalive's alpine, and `sidecar`): - -``` -consul-postgres-ha-sidecar/ -├── Dockerfile multi-stage: builds bootstrap-secrets + -│ mesh-conn from Go sources, pulls envoy + -│ consul + tini binaries, copies entrypoint.sh -├── entrypoint.sh PID 1 init — orderly start, log prefix per -│ process, signal-forwarding, exit code = first -│ child to die abnormally -└── README.md what's inside, how to debug -``` - -Compose simplifies to: - -```yaml -services: - sidecar: - image: ${SIDECAR_IMAGE} - network_mode: host - restart: on-failure - environment: { ... existing env ... } - volumes: - - /var/run/dstack.sock:/var/run/dstack.sock:ro - - /tmp/dstack-runtime/secrets:/run/secrets - - /tmp/dstack-runtime/instance:/run/instance - - consul-data:/consul/data - - patroni: - image: ${PATRONI_IMAGE} - network_mode: host - depends_on: [sidecar] - # ... unchanged - - webdemo: # optional, demo only - image: ${WEBDEMO_IMAGE} - network_mode: host - depends_on: [sidecar] - # ... 
unchanged -``` - -`bootstrap-secrets`, `mesh-conn`, `consul`, the previous `sidecar` -(Envoy) entries all collapse into the one `sidecar` service. - -### entrypoint.sh sketch - -```bash -#!/bin/sh -set -e -exec 2>&1 # merge stderr into stdout - -prefix() { sed -u "s/^/[$1] /"; } - -# 1. bootstrap-secrets writes /run/secrets/* and /run/instance/info.json -echo "[init] running bootstrap-secrets" -/usr/local/bin/bootstrap-secrets 2>&1 | prefix bootstrap-secrets - -[ -f /run/instance/info.json ] || { echo "bootstrap-secrets did not write info.json"; exit 1; } - -# 2. mesh-conn first — others need it for inter-CVM traffic -/usr/local/bin/mesh-conn 2>&1 | prefix mesh-conn & -MESH=$! - -# 3. consul agent -PEER_ID=$(jq -r '.role + "-" + (.ordinal|tostring)' /run/instance/info.json) -SERF=$(jq -r '.ports.serf_lan' /run/instance/info.json) -... # exactly the consul invocation that's in compose/worker.yaml today -consul agent ... 2>&1 | prefix consul & -CONSUL=$! - -# 4. envoy sidecar — wait for consul to be reachable on localhost, -# then bootstrap and exec -( until consul connect envoy -sidecar-for=$WORKLOAD -bootstrap > /tmp/envoy-bootstrap.json 2>/dev/null; do sleep 3; done - envoy -c /tmp/envoy-bootstrap.json -l info ) 2>&1 | prefix envoy & -ENVOY=$! - -# Forward SIGTERM/SIGINT to all children -trap 'kill -TERM $MESH $CONSUL $ENVOY 2>/dev/null' TERM INT - -# Exit when the first child dies — sidecar restarts via compose's -# `restart: on-failure`, which gives us correct cluster-wide recovery -# behavior for free (the same behavior you get today when any one of -# bootstrap-secrets/mesh-conn/consul/envoy crashes its container). -wait -n $MESH $CONSUL $ENVOY -EXIT=$? 
-echo "[init] one child exited: $EXIT — tearing down" -kill -TERM $MESH $CONSUL $ENVOY 2>/dev/null || true -exit $EXIT -``` - -`tini` (or `dumb-init`) wraps this so PID 1 reaping + signal handling -follow the conventions other tools expect, and `wait -n` (BusyBox sh -supports it) unblocks the moment any child dies. - -## What changes outside the new image - -1. **`compose/worker.yaml`** + **`compose/coordinator.yaml`** drop - the four superseded services, add the single `sidecar`. Coordinator - compose still also has `coturn` + `signaling` (those run *only* on - the external coordinator, not on the worker CVMs — so coordinator - compose is for the Vultr box, not for dstack CVMs). -2. **`cluster.tf`** env block — references shrink: `SIDECAR_IMAGE` - subsumes `BOOTSTRAP_SECRETS_IMAGE`, `MESH_CONN_IMAGE`, etc. -3. **`.github/workflows/consul-postgres-ha-publish.yml`** matrix - shrinks from 6 images to 4 (`sidecar`, `patroni`, `webdemo`, - `signaling`). -4. **`PUBLISHING.md`** + **`README.md`** image lists shrink. -5. **`bootstrap-secrets/`**, **`mesh-conn/`** Go-source directories - stay (each is still its own Go binary; the new image's Dockerfile - just builds both as build stages and copies their binaries in). - The old `sidecar/` directory's contents (Envoy bootstrap shell) - move into the new sidecar image's `entrypoint.sh`. - -## Success criteria - -- [ ] One `consul-postgres-ha-sidecar` image builds clean. -- [ ] On a fresh `terraform apply`, every worker CVM ends with 3 - containers (`sidecar` + `patroni` + `webdemo`) instead of 7. -- [ ] All FAILOVER.md scenarios still pass: soft-kill RTO, hard-kill - RTO, cheap rejoin, disk-loss rejoin. RTO should be unchanged - (single-container restart vs four-container restart shouldn't - noticeably affect Patroni's TTL-driven election). -- [ ] `terraform apply` in-place env update works end-to-end (the - sidecar image-tag bump propagates without CVM destroy/recreate, - same as the multi-image path does today). 
-- [ ] CI matrix shrinks to 4 images, all green. -- [ ] Per-process logs are still distinguishable (`docker logs - dstack-sidecar-1` shows `[bootstrap-secrets] ...`, - `[mesh-conn] ...`, etc.). - -## Risks + mitigations - -| Risk | Mitigation | -|---|---| -| One inner process crashes → whole sidecar container restarts → causes Patroni to flap | Acceptable phase-1 behavior. Compose `restart: on-failure` brings it back fast (~5s). Patroni's TTL=30 absorbs that. If we see real flap-storms in practice, that's the signal to upgrade to s6-overlay (per-process restart). | -| Bigger image → slower pulls | Multi-stage build keeps final image lean (Go binaries are static, Envoy is a single binary, Consul is a single binary). Should be ≤ sum of current images, often less. | -| Harder to debug "which inner process is wedged" | Log prefixes mitigate. `docker exec dstack-sidecar-1 ps` works inside the container. | -| Inner process startup races (e.g., consul tries to talk to mesh-conn before it's listening) | The shell `&` + retry pattern in the entrypoint handles this; identical to how the existing compose `depends_on: service_started` resolves it (which is itself just "wait for the process to spawn", not for it to be ready). Today's webdemo/sidecar already poll until consul is reachable. | -| Loss of `keepalive`'s "hold the CVM up regardless of failures" property | Replace with the shell init script's own resilience: if all platform plumbing dies, the container exits and gets restarted. The point of `keepalive` was to keep dstack from tearing down the CVM during a stack-wide bug — same effect here as long as the sidecar exit code is non-fatal to dstack. | - -## Open questions for the implementing agent - -1. **`consul-postgres-ha-sidecar` vs renaming the existing `sidecar/`** - directory: the existing `sidecar/` is just the Envoy bootstrap; the - new meaning is broader. Pick a name that doesn't collide. 
Suggested: - directory name `sidecar/`, image suffix `consul-postgres-ha-sidecar` - (matching the rest of the matrix), and the *old* Envoy-bootstrap - contents become a shell snippet inside `entrypoint.sh` rather than - a directory. -2. Whether the `tester` (netshoot) is still useful day-to-day. If - yes, leave it. If we never `docker exec` into it, drop it. -3. Whether to make webdemo's existence in the per-CVM compose - conditional via env (`WEBDEMO_ENABLED=1`) so users adapting this - for their own workload can drop it without editing the template. - Probably yes; small change. - -## Hand-off - -Agent should branch off `dstack-consul-ha-db`. Smoke against the live -cluster (or a fresh `terraform apply` in a new region) before opening -the PR. PR target is `dstack-consul-ha-db` (the mega PR's branch), -not `main` directly. diff --git a/consul-postgres-ha/mesh-conn/Dockerfile b/consul-postgres-ha/mesh-conn/Dockerfile deleted file mode 100644 index 546307d..0000000 --- a/consul-postgres-ha/mesh-conn/Dockerfile +++ /dev/null @@ -1,11 +0,0 @@ -FROM golang:1.24-alpine AS build -WORKDIR /src -COPY go.mod go.sum ./ -RUN go mod download -COPY *.go ./ -RUN CGO_ENABLED=0 go build -o /out/mesh-conn . 
- -FROM alpine:3.19 -RUN apk add --no-cache iproute2 ca-certificates -COPY --from=build /out/mesh-conn /usr/local/bin/mesh-conn -ENTRYPOINT ["/usr/local/bin/mesh-conn"] diff --git a/consul-postgres-ha/mesh-sidecar/Dockerfile b/consul-postgres-ha/mesh-sidecar/Dockerfile new file mode 100644 index 0000000..92d6cb0 --- /dev/null +++ b/consul-postgres-ha/mesh-sidecar/Dockerfile @@ -0,0 +1,53 @@ +# Single image containing every platform-plumbing process a +# consul-postgres-ha CVM runs: +# +# bootstrap-secrets one-shot init, derives per-CVM secrets from the +# dstack KMS and writes /run/instance/info.json +# mesh-conn QUIC-on-pion/ICE userspace overlay +# consul agent (server on coordinator CVMs, client on +# worker CVMs) +# envoy Connect mTLS data plane (workers only) +# +# Build context is the parent `consul-postgres-ha/` directory so this +# Dockerfile can COPY both Go sources straight in. CI configures that +# context via .github/workflows/consul-postgres-ha-publish.yml. +# +# The final stage inherits envoyproxy/envoy because envoy is the +# largest binary and the only one not statically linked — its base +# already carries the right glibc + ca-certs. The other binaries are +# CGO_ENABLED=0 Go builds that work on any base. + +FROM golang:1.24-alpine AS bootstrap-build +WORKDIR /src +COPY bootstrap-secrets/go.mod bootstrap-secrets/go.sum ./ +RUN go mod download +COPY bootstrap-secrets/*.go ./ +RUN CGO_ENABLED=0 go build -o /out/bootstrap-secrets . + +FROM golang:1.24-alpine AS mesh-build +WORKDIR /src +COPY mesh-conn/go.mod mesh-conn/go.sum ./ +RUN go mod download +COPY mesh-conn/*.go ./ +RUN CGO_ENABLED=0 go build -o /out/mesh-conn . + +FROM hashicorp/consul:1.19 AS consul-bin + +FROM envoyproxy/envoy:contrib-v1.30-latest + +# tini = correct PID 1 reaping + signal forwarding; +# jq, curl = used by the entrypoint and convenient for `docker exec`-debug. 
+RUN apt-get update && apt-get install -y --no-install-recommends \ + ca-certificates curl jq tini && \ + rm -rf /var/lib/apt/lists/* + +COPY --from=bootstrap-build /out/bootstrap-secrets /usr/local/bin/bootstrap-secrets +COPY --from=mesh-build /out/mesh-conn /usr/local/bin/mesh-conn +COPY --from=consul-bin /bin/consul /usr/local/bin/consul +COPY mesh-sidecar/entrypoint.sh /usr/local/bin/entrypoint.sh +RUN chmod +x /usr/local/bin/entrypoint.sh + +# Persistent consul state — workers' KV cache and coordinators' Raft log. +VOLUME ["/consul/data"] + +ENTRYPOINT ["/usr/bin/tini", "--", "/usr/local/bin/entrypoint.sh"] diff --git a/consul-postgres-ha/mesh-sidecar/README.md b/consul-postgres-ha/mesh-sidecar/README.md new file mode 100644 index 0000000..3cb723e --- /dev/null +++ b/consul-postgres-ha/mesh-sidecar/README.md @@ -0,0 +1,67 @@ +# consul-postgres-ha-mesh-sidecar + +The single image that holds every platform-plumbing process a worker or +coordinator CVM runs: + +| Process | Role | +|--------------------|----------------------------------------------------------| +| `bootstrap-secrets` | One-shot init: derives per-CVM secrets from the dstack TEE KMS, claims an ordinal, writes `/run/instance/info.json`. | +| `mesh-conn` | QUIC-on-pion/ICE overlay: forwards Consul gossip + RPC + HTTP ports between peer CVMs over a NAT'd L3 path. | +| `consul` | Server on coordinator CVMs (`-server -bootstrap-expect=N -ui`), client on worker CVMs. Joins via mesh-conn-forwarded loopback ports. | +| `envoy` | Connect-mTLS data plane on workers. Bootstrapped from the local consul agent's xDS once it's reachable. Coordinators don't run it. | + +Replaces what used to be four separate compose services +(`bootstrap-secrets`, `mesh-conn`, `consul`, and the old envoy-only +`sidecar`) plus the legacy `keepalive` placeholder. 
+ +The compose-service name stays `sidecar` (so the per-CVM container +name is `dstack-sidecar-1` regardless of which image it points at); +the *image* is `consul-postgres-ha-mesh-sidecar`. The "mesh-" prefix +is meant to make it obvious that this is the bundle of mesh +plumbing — bootstrap-secrets + mesh-conn + consul + envoy — and not +just an Envoy sidecar. + +## Lifecycle + +`tini → entrypoint.sh` is PID 1. The script: + +1. Runs `bootstrap-secrets` to completion (it's a one-shot — exit 0 + means `/run/instance/info.json` and `/run/secrets/*` are in place). +2. Starts `mesh-conn` in the background. +3. Starts `consul agent` in the background, with `-server` + + `-bootstrap-expect=N` if `ROLE=coordinator`. +4. (Workers only) Polls `consul connect envoy -bootstrap` until the + local consul agent answers, then exec's envoy. +5. `wait -n`s on all background processes — if any one exits, the + container exits with that code, and compose's + `restart: on-failure` brings it back. + +This is "shell init", not s6-overlay. If we hit real-world flap-storms +where one inner process dying often takes the whole container down, the +upgrade path is per-process supervision via s6 — but for phase 1 it +doesn't pay its complexity. + +## Debugging + +```bash +# Log stream for the whole sidecar — every line is prefixed with the +# inner process name ([bootstrap-secrets] / [mesh-conn] / [consul] / +# [envoy] / [init]). +docker logs dstack-sidecar-1 + +# Inspect what's running inside. +docker exec dstack-sidecar-1 ps -ef + +# Talk to the local consul agent (handy for cluster status / KV). +docker exec dstack-sidecar-1 sh -c 'consul members -http-addr=127.0.0.1:$(jq -r .ports.http_api /run/instance/info.json)' + +# Curl the local Patroni REST API or webdemo from inside the sidecar. 
+docker exec dstack-sidecar-1 sh -c 'curl -s http://127.0.0.1:$(jq -r .ports.patroni_rest /run/instance/info.json)/cluster | jq' +``` + +## Build context + +CI builds this image with `consul-postgres-ha/` as the docker context +(not `consul-postgres-ha/mesh-sidecar/`) so the Dockerfile can `COPY +bootstrap-secrets/` and `COPY mesh-conn/` from sibling directories. +See `.github/workflows/consul-postgres-ha-publish.yml`. diff --git a/consul-postgres-ha/mesh-sidecar/entrypoint.sh b/consul-postgres-ha/mesh-sidecar/entrypoint.sh new file mode 100644 index 0000000..a6eb701 --- /dev/null +++ b/consul-postgres-ha/mesh-sidecar/entrypoint.sh @@ -0,0 +1,150 @@ +#!/bin/bash +# PID 1 inside the consolidated dstack-mesh sidecar container. Runs the +# four platform-plumbing processes that used to be four separate compose +# services (bootstrap-secrets, mesh-conn, consul, envoy) inside one +# container. tini wraps this script so signal-forwarding + PID 1 reaping +# behave like other tools expect. +# +# Order is fixed by real dependencies: +# 1. bootstrap-secrets runs to completion — writes /run/secrets/* and +# /run/instance/info.json that everything else reads. +# 2. mesh-conn starts and forwards the Consul gossip + RPC ports to +# peer CVMs over QUIC-on-ICE. +# 3. consul agent starts (server on coordinators, client on workers) +# and joins the cluster via mesh-conn's local-loopback forwards. +# 4. envoy bootstraps from the local consul agent and starts the +# sidecar data plane. Workers only — coordinators don't host +# a Connect-mTLS workload. +# +# Phase-1 supervision policy: any one inner process dying takes the +# whole container down. Compose `restart: on-failure` brings it back +# in ~5s, well inside Patroni's 30s lock TTL — same effective recovery +# behavior as the old four-container layout, where any one of those +# containers crashing also resulted in a single ~5s restart. 
+#
+# Per-process logs are prefixed with `[<name>]` so `docker logs
+# dstack-sidecar-1` stays readable. Stderr is merged into stdout so a
+# single `docker logs` stream sees everything.
+
+set -euo pipefail
+exec 2>&1
+
+prefix() { sed -u "s/^/[$1] /" || cat; }
+log() { echo "[init] $*"; }
+
+ROLE="${ROLE:?ROLE must be set (coordinator|worker)}"
+log "starting consolidated sidecar, role=$ROLE"
+
+# ---- 1. bootstrap-secrets (one-shot, must complete) ----
+log "running bootstrap-secrets"
+/usr/local/bin/bootstrap-secrets 2>&1 | prefix bootstrap-secrets
+INFO=/run/instance/info.json
+[ -s "$INFO" ] || { log "bootstrap-secrets did not write $INFO"; exit 1; }
+
+# Identity/ports computed by bootstrap-secrets — read once, reuse.
+PEER_ID=$(jq -r '.role + "-" + (.ordinal|tostring)' "$INFO")
+ORDINAL=$(jq -r '.ordinal' "$INFO")
+SERF=$(jq -r '.ports.serf_lan' "$INFO")
+RPC=$(jq -r '.ports.server_rpc' "$INFO")
+HTTP_PORT=$(jq -r '.ports.http_api' "$INFO")
+GRPC_PORT=$(jq -r '.ports.grpc' "$INFO")
+log "identity: peer=$PEER_ID ordinal=$ORDINAL serf=$SERF http=$HTTP_PORT"
+
+# ---- 2. mesh-conn (background, long-running) ----
+log "starting mesh-conn"
+/usr/local/bin/mesh-conn 2>&1 | prefix mesh-conn &
+MESH=$!
+
+# ---- 3. consul agent (background, long-running) ----
+# Build -retry-join args from COORDINATOR_SERF_PORTS (comma-separated).
+# Workers retry-join every coordinator port (mesh-conn forwards each one
+# to its actual coordinator via loopback). Coordinators retry-join every
+# coordinator port EXCEPT their own — that's how the server quorum
+# gossips itself together.
+RETRYJOIN=()
+for p in $(echo "${COORDINATOR_SERF_PORTS}" | tr ',' ' '); do
+  if [ "$ROLE" = "coordinator" ] && [ "$p" = "$SERF" ]; then
+    continue
+  fi
+  RETRYJOIN+=("-retry-join=127.0.0.1:$p")
+done
+
+CONSUL_ARGS=(
+  agent
+  -node="$PEER_ID"
+  -datacenter="${CLUSTER_NAME}"
+  -bind=127.0.0.1 -advertise=127.0.0.1 -client=127.0.0.1
+  -serf-lan-port="$SERF"
+  -server-port="$RPC"
+  -http-port="$HTTP_PORT"
+  -grpc-port="$GRPC_PORT"
+  -dns-port=-1
+  "${RETRYJOIN[@]}"
+  -data-dir=/consul/data
+  -hcl='connect { enabled = true }'
+  -log-level=INFO
+)
+if [ "$ROLE" = "coordinator" ]; then
+  CONSUL_ARGS=(
+    "${CONSUL_ARGS[@]}"
+    -server
+    -bootstrap-expect="${BOOTSTRAP_EXPECT}"
+    -ui
+  )
+fi
+
+log "starting consul agent"
+/usr/local/bin/consul "${CONSUL_ARGS[@]}" 2>&1 | prefix consul &
+CONSUL=$!
+
+# ---- 4. envoy sidecar (workers only) ----
+ENVOY=
+if [ "$ROLE" = "worker" ]; then
+  ADMIN_PORT=$((19100 + ORDINAL))
+  log "starting envoy bootstrap loop (admin=$ADMIN_PORT)"
+  (
+    # Wait for the local consul agent to accept connections, then
+    # generate the Envoy bootstrap config and exec envoy. The wait
+    # loop is identical in spirit to the old sidecar/ entrypoint;
+    # it tolerates the consul process taking a few seconds to listen.
+    until consul connect envoy \
+      -sidecar-for="webdemo-${PEER_ID}" \
+      -admin-bind="127.0.0.1:${ADMIN_PORT}" \
+      -bootstrap \
+      -http-addr="127.0.0.1:${HTTP_PORT}" \
+      -grpc-addr="127.0.0.1:${GRPC_PORT}" \
+      > /tmp/envoy-bootstrap.json 2>/dev/null; do
+      echo "waiting for sidecar registration..."
+      sleep 3
+    done
+    exec envoy -c /tmp/envoy-bootstrap.json -l info
+  ) 2>&1 | prefix envoy &
+  ENVOY=$!
+fi
+
+CHILDREN=("$MESH" "$CONSUL")
+[ -n "$ENVOY" ] && CHILDREN+=("$ENVOY")
+
+# Forward SIGTERM/SIGINT to the background pipelines. NOTE: $! of a
+# `cmd | prefix` pipeline is the PID of its LAST command (sed), so this
+# TERMs the prefixer; the inner process then exits on SIGPIPE/EPIPE.
+shutdown() { + log "received signal, terminating children" + for c in "${CHILDREN[@]}"; do + kill -TERM "$c" 2>/dev/null || true + done +} +trap shutdown TERM INT + +# Block until ANY child exits; then reap the rest and let compose's +# `restart: on-failure` handle re-bringup. The `|| EXIT=$?` form keeps +# `set -e` from aborting the script when wait sees a non-zero rc — we +# want to fall through and clean up siblings before exiting. +EXIT=0 +wait -n "${CHILDREN[@]}" || EXIT=$? +log "child exited (code=$EXIT) — tearing down sidecar" +for c in "${CHILDREN[@]}"; do + kill -TERM "$c" 2>/dev/null || true +done +wait || true +exit "$EXIT" diff --git a/consul-postgres-ha/sidecar/Dockerfile b/consul-postgres-ha/sidecar/Dockerfile deleted file mode 100644 index 50e9a22..0000000 --- a/consul-postgres-ha/sidecar/Dockerfile +++ /dev/null @@ -1,11 +0,0 @@ -# Combines the consul CLI (used to generate Envoy's bootstrap config) -# with Envoy itself (the sidecar data-plane). Smaller than running a -# full Consul agent in the sidecar container — we only need `consul -# connect envoy -bootstrap` and then `envoy`. -FROM hashicorp/consul:1.19 AS consul-bin - -FROM envoyproxy/envoy:contrib-v1.30-latest -RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates curl jq && \ - rm -rf /var/lib/apt/lists/* -COPY --from=consul-bin /bin/consul /usr/local/bin/consul -ENTRYPOINT ["/bin/sh", "-c"]