From 91c659d37708e5e743017a1590463a7cfad0bf42 Mon Sep 17 00:00:00 2001 From: tomerweller Date: Thu, 18 Jun 2026 13:33:28 -0400 Subject: [PATCH 1/8] Scrape post-mission metrics via per-pod svc.cluster.local DNS The teardown metrics dump (DumpPeerMetrics -> GetRawMetrics -> Peer.fetch) scraped stellar-core's admin HTTP endpoint through the ingress hostname (., defaulting to .local). That name does not resolve in all cluster-DNS environments (e.g. non-SDF k3s clusters), so missions exited non-zero during teardown even when the consensus/loadgen logic succeeded. Scrape the raw metrics directly from the per-pod svc.cluster.local DNS name on the core HTTP port (11626) instead, bypassing the ingress. These per-pod service DNS names already work and are used elsewhere in SSC config generation. Fixes #399 Co-Authored-By: Claude Opus 4.8 --- src/FSLibrary/StellarCoreHTTP.fs | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/FSLibrary/StellarCoreHTTP.fs b/src/FSLibrary/StellarCoreHTTP.fs index 66d58fae..44409974 100644 --- a/src/FSLibrary/StellarCoreHTTP.fs +++ b/src/FSLibrary/StellarCoreHTTP.fs @@ -392,6 +392,17 @@ type Peer with let url = self.URL path Http.RequestString(url, headers = self.Headers) + // URL that reaches stellar-core's admin HTTP endpoint directly via the + // per-pod svc.cluster.local DNS name, bypassing the ingress. The ingress + // hostname (., e.g. .local) does not + // resolve in all cluster-DNS environments (for example non-SDF k3s + // clusters), whereas the per-pod service DNS name resolves wherever cluster + // DNS is reachable. See https://github.com/stellar/supercluster/issues/399. + member self.ClusterDnsURL(path: string) : string = + sprintf "http://%s:%d/%s" self.DnsName.StringName StellarCoreCfg.CfgVal.httpPort path + + member self.fetchFromClusterDns(path: string) : string = Http.RequestString(self.ClusterDnsURL path) + member self.GetState() = self.GetInfo().State member self.GetStatusOrState() : string = @@ -401,7 +412,11 @@ type Peer with member self.GetMetrics() : Metrics.Metrics = WebExceptionRetry DefaultRetry (fun _ -> Metrics.Parse(self.fetch "metrics").Metrics) - member self.GetRawMetrics() = WebExceptionRetry DefaultRetry (fun _ -> self.fetch "metrics") + // Post-mission metrics are scraped directly from the per-pod + // svc.cluster.local DNS name (see ClusterDnsURL) rather than through the + // ingress, so the teardown dump succeeds even when the ingress hostname is + // not resolvable from where SSC runs. + member self.GetRawMetrics() = WebExceptionRetry DefaultRetry (fun _ -> self.fetchFromClusterDns "metrics") member self.GetInfo() : Info.Info = WebExceptionRetry From 1794e047c54c99ee720e68cfd2d25fc20b38e382 Mon Sep 17 00:00:00 2001 From: tomerweller Date: Thu, 18 Jun 2026 13:43:25 -0400 Subject: [PATCH 2/8] Make cluster-DNS metrics scrape opt-in via --metrics-via-cluster-dns The initial hard switch to the per-pod svc.cluster.local DNS name regressed out-of-cluster runs (including CI's BootAndSync mission): SSC runs outside the cluster via kubeconfig, so cluster-internal DNS does not resolve from there and the teardown scrape failed with repeated name-resolution errors. Gate the behavior behind a new --metrics-via-cluster-dns flag (default false). By default the post-mission metrics scrape continues to use the ingress hostname as before; environments where the ingress hostname does not resolve but cluster DNS does (e.g. non-SDF k3s clusters) can opt in. Co-Authored-By: Claude Opus 4.8 --- src/App/Program.fs | 11 +++++++++++ src/FSLibrary.Tests/Tests.fs | 1 + src/FSLibrary/StellarCoreHTTP.fs | 15 ++++++++++----- src/FSLibrary/StellarMissionContext.fs | 1 + 4 files changed, 23 insertions(+), 5 deletions(-) diff --git a/src/App/Program.fs b/src/App/Program.fs index 83ade839..9ac5203b 100644 --- a/src/App/Program.fs +++ b/src/App/Program.fs @@ -46,6 +46,7 @@ type MissionOptions ingressInternalDomain: string, ingressExternalHost: string option, ingressExternalPort: int, + metricsViaClusterDns: bool, exportToPrometheus: bool, probeTimeout: int, missions: string seq, @@ -178,6 +179,15 @@ type MissionOptions Default = 80)>] member self.IngressExternalPort = ingressExternalPort + [.local) does not resolve from where SSC runs but cluster DNS does.", + Required = false, + Default = false)>] + member self.MetricsViaClusterDns = metricsViaClusterDns + [] member self.ExportToPrometheus : bool = exportToPrometheus @@ -779,6 +789,7 @@ let main argv = ingressInternalDomain = mission.IngressInternalDomain ingressExternalHost = mission.IngressExternalHost ingressExternalPort = mission.IngressExternalPort + metricsViaClusterDns = mission.MetricsViaClusterDns exportToPrometheus = mission.ExportToPrometheus probeTimeout = mission.ProbeTimeout coreResources = SmallTestResources diff --git a/src/FSLibrary.Tests/Tests.fs b/src/FSLibrary.Tests/Tests.fs index ce280d8b..73c2145a 100644 --- a/src/FSLibrary.Tests/Tests.fs +++ b/src/FSLibrary.Tests/Tests.fs @@ -58,6 +58,7 @@ let ctx : MissionContext = ingressInternalDomain = "local" ingressExternalHost = None ingressExternalPort = 80 + metricsViaClusterDns = false exportToPrometheus = false probeTimeout = 10 coreResources = SmallTestResources diff --git a/src/FSLibrary/StellarCoreHTTP.fs b/src/FSLibrary/StellarCoreHTTP.fs index 44409974..be7066b4 100644 --- a/src/FSLibrary/StellarCoreHTTP.fs +++ b/src/FSLibrary/StellarCoreHTTP.fs @@ -412,11 +412,16 @@ type Peer with member self.GetMetrics() : Metrics.Metrics = WebExceptionRetry DefaultRetry (fun _ -> Metrics.Parse(self.fetch "metrics").Metrics) - // Post-mission metrics are scraped directly from the per-pod - // svc.cluster.local DNS name (see ClusterDnsURL) rather than through the - // ingress, so the teardown dump succeeds even when the ingress hostname is - // not resolvable from where SSC runs. - member self.GetRawMetrics() = WebExceptionRetry DefaultRetry (fun _ -> self.fetchFromClusterDns "metrics") + // Post-mission metrics are scraped through the ingress by default. When + // --metrics-via-cluster-dns is set they are instead scraped directly from + // the per-pod svc.cluster.local DNS name (see ClusterDnsURL), so the + // teardown dump succeeds in environments where the ingress hostname (e.g. + // .local) does not resolve from where SSC runs but cluster DNS does. + member self.GetRawMetrics() = + if self.networkCfg.missionContext.metricsViaClusterDns then + WebExceptionRetry DefaultRetry (fun _ -> self.fetchFromClusterDns "metrics") + else + WebExceptionRetry DefaultRetry (fun _ -> self.fetch "metrics") member self.GetInfo() : Info.Info = WebExceptionRetry diff --git a/src/FSLibrary/StellarMissionContext.fs b/src/FSLibrary/StellarMissionContext.fs index c64b4cca..5152e2c4 100644 --- a/src/FSLibrary/StellarMissionContext.fs +++ b/src/FSLibrary/StellarMissionContext.fs @@ -59,6 +59,7 @@ type MissionContext = ingressInternalDomain: string ingressExternalHost: string option ingressExternalPort: int + metricsViaClusterDns: bool exportToPrometheus: bool probeTimeout: int coreResources: CoreResources From 3324dbca184672fc755c2d40a6852bf3a1313996 Mon Sep 17 00:00:00 2001 From: tomerweller Date: Thu, 18 Jun 2026 13:51:12 -0400 Subject: [PATCH 3/8] Format metrics-via-cluster-dns option to match house style fantomas --check flagged the multi-line concatenated HelpText. Collapse it to a single-line string like the other Option HelpText attributes in this file. Co-Authored-By: Claude Opus 4.8 --- src/App/Program.fs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/App/Program.fs b/src/App/Program.fs index 9ac5203b..36d916ea 100644 --- a/src/App/Program.fs +++ b/src/App/Program.fs @@ -180,10 +180,7 @@ type MissionOptions member self.IngressExternalPort = ingressExternalPort [.local) does not resolve from where SSC runs but cluster DNS does.", + HelpText = "Scrape post-mission metrics directly from the per-pod svc.cluster.local DNS name instead of through the ingress hostname. Use when the ingress hostname (e.g. .local) does not resolve from where SSC runs but cluster DNS does.", Required = false, Default = false)>] member self.MetricsViaClusterDns = metricsViaClusterDns From 68be4f889902c24802bce308550125ee993f8d98 Mon Sep 17 00:00:00 2001 From: tomerweller Date: Thu, 18 Jun 2026 14:24:08 -0400 Subject: [PATCH 4/8] Scrape post-mission metrics via pod exec + curl (best-effort) Replace the URL/ingress-based metrics scrape with a `kubectl exec`-style call that runs `curl http://localhost:11626/metrics` inside each stellar-core pod, mirroring how DumpPeerDatabase already exec's sqlite3. This works in every environment SSC runs in, including a runner outside a non-SDF k3s cluster (e.g. Namespace/nsc): pod exec only needs the Kubernetes API that SSC already uses to manage the cluster, so it does not depend on the ingress `.local` hostname or on cluster-internal DNS (`svc.cluster.local`) being resolvable from the runner -- neither of which resolves from an out-of-cluster runner. The scrape is also now best-effort: a failure logs a warning instead of throwing, so a transient or environment-specific scrape problem no longer makes an otherwise-successful mission exit non-zero (the actual impact in #399, which blocks automated pass/fail gating). This supersedes the earlier --metrics-via-cluster-dns approach, which only helped a runner that could already resolve cluster DNS (i.e. in-cluster), so those changes are reverted. Fixes #399 Co-Authored-By: Claude Opus 4.8 --- src/App/Program.fs | 8 ----- src/FSLibrary.Tests/Tests.fs | 1 - src/FSLibrary/StellarCoreHTTP.fs | 22 +------------ src/FSLibrary/StellarDataDump.fs | 45 ++++++++++++++++++++++++-- src/FSLibrary/StellarMissionContext.fs | 1 - 5 files changed, 43 insertions(+), 34 deletions(-) diff --git a/src/App/Program.fs b/src/App/Program.fs index 36d916ea..83ade839 100644 --- a/src/App/Program.fs +++ b/src/App/Program.fs @@ -46,7 +46,6 @@ type MissionOptions ingressInternalDomain: string, ingressExternalHost: string option, ingressExternalPort: int, - metricsViaClusterDns: bool, exportToPrometheus: bool, probeTimeout: int, missions: string seq, @@ -179,12 +178,6 @@ type MissionOptions Default = 80)>] member self.IngressExternalPort = ingressExternalPort - [] - member self.MetricsViaClusterDns = metricsViaClusterDns - [] member self.ExportToPrometheus : bool = exportToPrometheus @@ -786,7 +779,6 @@ let main argv = ingressInternalDomain = mission.IngressInternalDomain ingressExternalHost = mission.IngressExternalHost ingressExternalPort = mission.IngressExternalPort - metricsViaClusterDns = mission.MetricsViaClusterDns exportToPrometheus = mission.ExportToPrometheus probeTimeout = mission.ProbeTimeout coreResources = SmallTestResources diff --git a/src/FSLibrary.Tests/Tests.fs b/src/FSLibrary.Tests/Tests.fs index 73c2145a..ce280d8b 100644 --- a/src/FSLibrary.Tests/Tests.fs +++ b/src/FSLibrary.Tests/Tests.fs @@ -58,7 +58,6 @@ let ctx : MissionContext = ingressInternalDomain = "local" ingressExternalHost = None ingressExternalPort = 80 - metricsViaClusterDns = false exportToPrometheus = false probeTimeout = 10 coreResources = SmallTestResources diff --git a/src/FSLibrary/StellarCoreHTTP.fs b/src/FSLibrary/StellarCoreHTTP.fs index be7066b4..66d58fae 100644 --- a/src/FSLibrary/StellarCoreHTTP.fs +++ b/src/FSLibrary/StellarCoreHTTP.fs @@ -392,17 +392,6 @@ type Peer with let url = self.URL path Http.RequestString(url, headers = self.Headers) - // URL that reaches stellar-core's admin HTTP endpoint directly via the - // per-pod svc.cluster.local DNS name, bypassing the ingress. The ingress - // hostname (., e.g. .local) does not - // resolve in all cluster-DNS environments (for example non-SDF k3s - // clusters), whereas the per-pod service DNS name resolves wherever cluster - // DNS is reachable. See https://github.com/stellar/supercluster/issues/399. - member self.ClusterDnsURL(path: string) : string = - sprintf "http://%s:%d/%s" self.DnsName.StringName StellarCoreCfg.CfgVal.httpPort path - - member self.fetchFromClusterDns(path: string) : string = Http.RequestString(self.ClusterDnsURL path) - member self.GetState() = self.GetInfo().State member self.GetStatusOrState() : string = @@ -412,16 +401,7 @@ type Peer with member self.GetMetrics() : Metrics.Metrics = WebExceptionRetry DefaultRetry (fun _ -> Metrics.Parse(self.fetch "metrics").Metrics) - // Post-mission metrics are scraped through the ingress by default. When - // --metrics-via-cluster-dns is set they are instead scraped directly from - // the per-pod svc.cluster.local DNS name (see ClusterDnsURL), so the - // teardown dump succeeds in environments where the ingress hostname (e.g. - // .local) does not resolve from where SSC runs but cluster DNS does. - member self.GetRawMetrics() = - if self.networkCfg.missionContext.metricsViaClusterDns then - WebExceptionRetry DefaultRetry (fun _ -> self.fetchFromClusterDns "metrics") - else - WebExceptionRetry DefaultRetry (fun _ -> self.fetch "metrics") + member self.GetRawMetrics() = WebExceptionRetry DefaultRetry (fun _ -> self.fetch "metrics") member self.GetInfo() : Info.Info = WebExceptionRetry diff --git a/src/FSLibrary/StellarDataDump.fs b/src/FSLibrary/StellarDataDump.fs index c423b473..e7d6dd08 100644 --- a/src/FSLibrary/StellarDataDump.fs +++ b/src/FSLibrary/StellarDataDump.fs @@ -240,10 +240,49 @@ type StellarFormation with Kubernetes.GetExitCodeOrThrow(returnMessage) |> ignore with x -> () + // Scrape post-mission metrics by exec'ing `curl` against stellar-core's + // admin HTTP endpoint on the pod loopback, rather than fetching through the + // ingress. The ingress hostname (e.g. .local) is not resolvable from + // every environment SSC runs in (for example a runner outside a non-SDF k3s + // cluster), whereas pod exec only needs the Kubernetes API that SSC already + // uses to manage the cluster. Best-effort: a failure here must not fail an + // otherwise-successful mission. See + // https://github.com/stellar/supercluster/issues/399. member self.DumpPeerMetrics(p: Peer) = - let destination = self.NetworkCfg.missionContext.destination - let name = p.PodName - destination.WriteString(sprintf "%s.metrics.json" name.StringName) (p.GetRawMetrics()) + try + let ns = self.NetworkCfg.NamespaceProperty + let name = self.NetworkCfg.PodName p.coreSet p.peerNum + let metricsUrl = sprintf "http://localhost:%d/metrics" CfgVal.httpPort + + self.sleepUntilNextRateLimitedApiCallTime () + + let muxedStream = + self + .Kube + .MuxedStreamNamespacedPodExecAsync(name = name.StringName, + ``namespace`` = ns, + command = [| "curl"; "-sf"; metricsUrl |], + container = "stellar-core-run", + tty = false, + cancellationToken = CancellationToken()) + .GetAwaiter() + .GetResult() + + let stdOut = + muxedStream.GetStream(Nullable(ChannelIndex.StdOut), Nullable()) + + let error = + muxedStream.GetStream(Nullable(ChannelIndex.Error), Nullable()) + + let errorReader = new StreamReader(error) + + LogInfo "Dumping metrics of peer %s" name.StringName + muxedStream.Start() + self.Destination.WriteStream(sprintf "%s.metrics.json" name.StringName) stdOut + let errors = errorReader.ReadToEndAsync().GetAwaiter().GetResult() + let returnMessage = SafeJsonConvert.DeserializeObject(errors) + Kubernetes.GetExitCodeOrThrow(returnMessage) |> ignore + with x -> LogWarn "Failed to dump metrics of peer %s: %s" p.PodName.StringName x.Message member self.DumpPeerData(p: Peer) = self.DumpPeerLogs p diff --git a/src/FSLibrary/StellarMissionContext.fs b/src/FSLibrary/StellarMissionContext.fs index 5152e2c4..c64b4cca 100644 --- a/src/FSLibrary/StellarMissionContext.fs +++ b/src/FSLibrary/StellarMissionContext.fs @@ -59,7 +59,6 @@ type MissionContext = ingressInternalDomain: string ingressExternalHost: string option ingressExternalPort: int - metricsViaClusterDns: bool exportToPrometheus: bool probeTimeout: int coreResources: CoreResources From d2e7d067c2927ebf69df85517d45477a275e080a Mon Sep 17 00:00:00 2001 From: tomerweller Date: Thu, 18 Jun 2026 14:58:31 -0400 Subject: [PATCH 5/8] Route all stellar-core HTTP through pod-exec when --core-http-via-pod-exec set The teardown metrics dump was the only thing using pod-exec; in-mission health checks (WaitUntilReady -> GetInfo, WaitUntilSynced, WaitFor*) and actions (SetUpgrades, GenerateLoad, surveys, tx submit) still went through the ingress .local hostname. On an out-of-cluster runner (e.g. Namespace/nsc k3s) that hostname is unreachable, so SSC could not verify readiness even though core reached consensus -- the same DNS problem surfacing earlier than teardown. Funnel every core admin-HTTP GET through a single `Peer.httpGet path query` that selects the transport: - default: ingress (unchanged), preserving SDF behavior and the fast in-mission polling hot path (exec-per-poll would hammer the k8s API); - --core-http-via-pod-exec: exec `curl http://localhost:11626/?` inside the pod via the Kubernetes API, which needs no DNS and works from any runner that can reach the API server. Exec failures are surfaced as WebException so the existing WebExceptionRetry wrappers keep retrying transient errors (e.g. core still booting) uniformly. DumpPeerMetrics now just calls the transport-aware GetRawMetrics (still best-effort). All ~15 fetch/RequestString call sites route through httpGet, so nothing falls back to the ingress when the flag is on. CI: add a second BootAndSync run with --core-http-via-pod-exec and no resolvable --ingress-external-host, so any access that leaks to the ingress fails with name resolution -- proving full pod-exec coverage (mirrors an out-of-cluster nsc runner). Fixes #399 Co-Authored-By: Claude Opus 4.8 --- .github/workflows/build-and-test.yml | 8 ++ src/App/Program.fs | 8 ++ src/FSLibrary.Tests/Tests.fs | 1 + src/FSLibrary/StellarCoreHTTP.fs | 164 +++++++++++++------------ src/FSLibrary/StellarDataDump.fs | 45 +------ src/FSLibrary/StellarMissionContext.fs | 1 + 6 files changed, 108 insertions(+), 119 deletions(-) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 6c5d8acd..57c1de83 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -37,6 +37,14 @@ jobs: run: dotnet test --no-restore --verbosity normal - name: Run BootAndSync mission run: dotnet run --project src/App/App.fsproj --configuration Release -- mission BootAndSync --image stellar/stellar-core:stable --kubeconfig $KUBECONFIG --namespace default --ingress-class nginx --ingress-internal-domain local --ingress-external-host localhost --uneven-sched + # Exercise the pod-exec HTTP transport (--core-http-via-pod-exec) with no + # resolvable ingress host: the ingress host falls back to .local, + # which does not resolve from the runner. If any core HTTP access leaked + # back to the ingress, this run would fail with a name-resolution error, so + # a green run proves all access goes through pod exec. This mirrors an + # out-of-cluster runner (e.g. Namespace/nsc k3s). See issue #399. + - name: Run BootAndSync mission via pod-exec + run: dotnet run --project src/App/App.fsproj --configuration Release -- mission BootAndSync --image stellar/stellar-core:stable --kubeconfig $KUBECONFIG --namespace default --ingress-class nginx --ingress-internal-domain local --core-http-via-pod-exec --uneven-sched - uses: actions/upload-artifact@v4 with: name: destination diff --git a/src/App/Program.fs b/src/App/Program.fs index 83ade839..0a0b7577 100644 --- a/src/App/Program.fs +++ b/src/App/Program.fs @@ -46,6 +46,7 @@ type MissionOptions ingressInternalDomain: string, ingressExternalHost: string option, ingressExternalPort: int, + coreHttpViaPodExec: bool, exportToPrometheus: bool, probeTimeout: int, missions: string seq, @@ -178,6 +179,12 @@ type MissionOptions Default = 80)>] member self.IngressExternalPort = ingressExternalPort + [] + member self.CoreHttpViaPodExec = coreHttpViaPodExec + [] member self.ExportToPrometheus : bool = exportToPrometheus @@ -779,6 +786,7 @@ let main argv = ingressInternalDomain = mission.IngressInternalDomain ingressExternalHost = mission.IngressExternalHost ingressExternalPort = mission.IngressExternalPort + coreHttpViaPodExec = mission.CoreHttpViaPodExec exportToPrometheus = mission.ExportToPrometheus probeTimeout = mission.ProbeTimeout coreResources = SmallTestResources diff --git a/src/FSLibrary.Tests/Tests.fs b/src/FSLibrary.Tests/Tests.fs index ce280d8b..6b54165b 100644 --- a/src/FSLibrary.Tests/Tests.fs +++ b/src/FSLibrary.Tests/Tests.fs @@ -58,6 +58,7 @@ let ctx : MissionContext = ingressInternalDomain = "local" ingressExternalHost = None ingressExternalPort = 80 + coreHttpViaPodExec = false exportToPrometheus = false probeTimeout = 10 coreResources = SmallTestResources diff --git a/src/FSLibrary/StellarCoreHTTP.fs b/src/FSLibrary/StellarCoreHTTP.fs index 66d58fae..f7a7657e 100644 --- a/src/FSLibrary/StellarCoreHTTP.fs +++ b/src/FSLibrary/StellarCoreHTTP.fs @@ -5,12 +5,16 @@ module StellarCoreHTTP open FSharp.Data +open k8s +open k8s.Models +open Microsoft.Rest.Serialization open StellarCoreSet open PollRetry open Logging open StellarMissionContext open StellarNetworkCfg open StellarCorePeer +open System.IO open System.Threading open StellarDotnetSdk.Transactions open StellarDotnetSdk.Responses.Results @@ -388,9 +392,71 @@ type Peer with self.PodName.StringName path - member self.fetch(path: string) : string = - let url = self.URL path - Http.RequestString(url, headers = self.Headers) + // Reach stellar-core's admin HTTP endpoint by exec'ing `curl` inside the + // pod (via the Kubernetes API) rather than over the ingress. This is used + // when --core-http-via-pod-exec is set, for environments where the ingress + // hostname is not reachable from where SSC runs (e.g. a runner outside a + // non-SDF k3s cluster). Failures are surfaced as WebException so the + // existing WebExceptionRetry wrappers retry transient errors (e.g. core + // still booting) just as they do for ingress fetches. + // See https://github.com/stellar/supercluster/issues/399. + member self.fetchViaPodExec (path: string) (query: (string * string) list) : string = + let kube = self.networkCfg.missionContext.kube + let ns = self.networkCfg.NamespaceProperty + let name = self.PodName + + let encode (s: string) : string = System.Uri.EscapeDataString s + + let queryString = + match query with + | [] -> "" + | _ -> + query + |> List.map (fun (k, v) -> sprintf "%s=%s" (encode k) (encode v)) + |> String.concat "&" + |> sprintf "?%s" + + let url = sprintf "http://localhost:%d/%s%s" StellarCoreCfg.CfgVal.httpPort path queryString + + try + let muxedStream = + kube + .MuxedStreamNamespacedPodExecAsync(name = name.StringName, + ``namespace`` = ns, + command = [| "curl"; "-sf"; url |], + container = "stellar-core-run", + tty = false, + cancellationToken = CancellationToken()) + .GetAwaiter() + .GetResult() + + let stdOut = + muxedStream.GetStream(System.Nullable(ChannelIndex.StdOut), System.Nullable()) + + let error = + muxedStream.GetStream(System.Nullable(ChannelIndex.Error), System.Nullable()) + + let outReader = new StreamReader(stdOut) + let errReader = new StreamReader(error) + muxedStream.Start() + let outStr = outReader.ReadToEndAsync().GetAwaiter().GetResult() + let errStr = errReader.ReadToEndAsync().GetAwaiter().GetResult() + let returnMessage = SafeJsonConvert.DeserializeObject(errStr) + Kubernetes.GetExitCodeOrThrow(returnMessage) |> ignore + outStr + with + | :? System.Net.WebException -> reraise () + | e -> raise (System.Net.WebException(sprintf "pod-exec fetch of /%s failed: %s" path e.Message)) + + // Single point through which all stellar-core admin HTTP GETs flow, so the + // transport (ingress vs. in-pod curl) is chosen in one place. + member self.httpGet (path: string) (query: (string * string) list) : string = + if self.networkCfg.missionContext.coreHttpViaPodExec then + self.fetchViaPodExec path query + else + Http.RequestString(url = self.URL path, httpMethod = "GET", headers = self.Headers, query = query) + + member self.fetch(path: string) : string = self.httpGet path [] member self.GetState() = self.GetInfo().State @@ -486,15 +552,7 @@ type Peer with member self.SetUpgrades(upgrades: UpgradeParameters) = let res = - WebExceptionRetry - DefaultRetry - (fun _ -> - Http.RequestString( - httpMethod = "GET", - url = self.URL "upgrades", - headers = self.Headers, - query = upgrades.ToQuery - )) + WebExceptionRetry DefaultRetry (fun _ -> self.httpGet "upgrades" upgrades.ToQuery) if res.ToLower().Contains("exception") then raise (PeerRejectedUpgradesException res) @@ -684,68 +742,38 @@ type Peer with raise (ProtocolVersionNotUpgradedException(currentProtocolVersion, lastestProtocolVersion)) member self.ClearMetrics() = - WebExceptionRetry - DefaultRetry - (fun _ -> Http.RequestString(httpMethod = "GET", headers = self.Headers, url = self.URL "clearmetrics")) + WebExceptionRetry DefaultRetry (fun _ -> self.httpGet "clearmetrics" []) |> ignore member self.ToggleOverlayOnlyMode() = - WebExceptionRetry - DefaultRetry - (fun _ -> - Http.RequestString(httpMethod = "GET", headers = self.Headers, url = self.URL "toggleoverlayonlymode")) + WebExceptionRetry DefaultRetry (fun _ -> self.httpGet "toggleoverlayonlymode" []) member self.GetTestAcc(accName: string) : TestAcc.Root = // NB: work around buggy JSON parser upstream, see // https://github.com/fsharp/FSharp.Data/pull/1262 let s = - WebExceptionRetry - DefaultRetry - (fun _ -> - Http.RequestString( - httpMethod = "GET", - url = self.URL("testacc"), - headers = self.Headers, - query = [ ("name", accName) ] - )) + WebExceptionRetry DefaultRetry (fun _ -> self.httpGet "testacc" [ ("name", accName) ]) TestAcc.Parse(if s.Trim().StartsWith("null") then "{}" else s) member self.StartSurveyCollecting(nonce: int) = - WebExceptionRetry - DefaultRetry - (fun _ -> - Http.RequestString( - httpMethod = "GET", - url = self.URL("startsurveycollecting"), - headers = self.Headers, - query = [ ("nonce", nonce.ToString()) ] - )) + WebExceptionRetry DefaultRetry (fun _ -> self.httpGet "startsurveycollecting" [ ("nonce", nonce.ToString()) ]) member self.StopSurveyCollecting() = - WebExceptionRetry - DefaultRetry - (fun _ -> - Http.RequestString(httpMethod = "GET", url = self.URL("stopsurveycollecting"), headers = self.Headers)) + WebExceptionRetry DefaultRetry (fun _ -> self.httpGet "stopsurveycollecting" []) member self.SurveyTopologyTimeSliced (node: string) (inboundPeersIndex: int) (outboundPeersIndex: int) = WebExceptionRetry DefaultRetry (fun _ -> - Http.RequestString( - httpMethod = "GET", - url = self.URL("surveytopologytimesliced"), - headers = self.Headers, - query = - [ ("node", node) - ("inboundpeerindex", inboundPeersIndex.ToString()) - ("outboundpeerindex", outboundPeersIndex.ToString()) ] - )) + self.httpGet + "surveytopologytimesliced" + [ ("node", node) + ("inboundpeerindex", inboundPeersIndex.ToString()) + ("outboundpeerindex", outboundPeersIndex.ToString()) ]) member self.GetSurveyResult() = - WebExceptionRetry - DefaultRetry - (fun _ -> Http.RequestString(httpMethod = "GET", url = self.URL("getsurveyresult"), headers = self.Headers)) + WebExceptionRetry DefaultRetry (fun _ -> self.httpGet "getsurveyresult" []) member self.GetTestAccBalance(accName: string) : int64 = RetryUntilSome @@ -758,22 +786,12 @@ type Peer with (fun _ -> LogWarn "Waiting for account %s to exist, to read seqnum" accName) member self.GenerateLoad(loadGen: LoadGen) : string = - WebExceptionRetry - DefaultRetry - (fun _ -> - Http.RequestString( - httpMethod = "GET", - headers = self.Headers, - url = self.URL "generateload", - query = loadGen.ToQuery - )) + WebExceptionRetry DefaultRetry (fun _ -> self.httpGet "generateload" loadGen.ToQuery) member self.StopLoadGen() : string = self.GenerateLoad { LoadGen.GetDefault() with mode = StopRun } member self.ManualClose() = - WebExceptionRetry - DefaultRetry - (fun _ -> Http.RequestString(httpMethod = "GET", headers = self.Headers, url = self.URL "manualclose")) + WebExceptionRetry DefaultRetry (fun _ -> self.httpGet "manualclose" []) |> ignore member self.SubmitSignedTransaction(tx: Transaction) : Tx.Root = @@ -783,20 +801,8 @@ type Peer with WebExceptionRetry DefaultRetry (fun _ -> - LogDebug - "Submitting transaction: %s" - ((self.URL "tx") + "?blob=" + (System.Uri.EscapeDataString b64)) - - let response = - Http.RequestStream( - (self.URL "tx"), - httpMethod = "GET", - query = [ "blob", b64 ], - headers = [ "Host", self.networkCfg.IngressInternalHostName ] - ) - - use reader = new System.IO.StreamReader(response.ResponseStream) - reader.ReadToEnd()) + LogDebug "Submitting transaction with blob: %s" b64 + self.httpGet "tx" [ ("blob", b64) ]) LogDebug "Transaction response: %s" s let res = Tx.Parse(s) diff --git a/src/FSLibrary/StellarDataDump.fs b/src/FSLibrary/StellarDataDump.fs index e7d6dd08..9a4688aa 100644 --- a/src/FSLibrary/StellarDataDump.fs +++ b/src/FSLibrary/StellarDataDump.fs @@ -240,48 +240,13 @@ type StellarFormation with Kubernetes.GetExitCodeOrThrow(returnMessage) |> ignore with x -> () - // Scrape post-mission metrics by exec'ing `curl` against stellar-core's - // admin HTTP endpoint on the pod loopback, rather than fetching through the - // ingress. The ingress hostname (e.g. .local) is not resolvable from - // every environment SSC runs in (for example a runner outside a non-SDF k3s - // cluster), whereas pod exec only needs the Kubernetes API that SSC already - // uses to manage the cluster. Best-effort: a failure here must not fail an - // otherwise-successful mission. See - // https://github.com/stellar/supercluster/issues/399. + // Best-effort: a failed metrics scrape must not fail an otherwise-successful + // mission (the impact reported in #399). The transport used to reach core's + // admin HTTP endpoint -- ingress or in-pod `curl` -- is selected by + // GetRawMetrics based on the --core-http-via-pod-exec flag. member self.DumpPeerMetrics(p: Peer) = try - let ns = self.NetworkCfg.NamespaceProperty - let name = self.NetworkCfg.PodName p.coreSet p.peerNum - let metricsUrl = sprintf "http://localhost:%d/metrics" CfgVal.httpPort - - self.sleepUntilNextRateLimitedApiCallTime () - - let muxedStream = - self - .Kube - .MuxedStreamNamespacedPodExecAsync(name = name.StringName, - ``namespace`` = ns, - command = [| "curl"; "-sf"; metricsUrl |], - container = "stellar-core-run", - tty = false, - cancellationToken = CancellationToken()) - .GetAwaiter() - .GetResult() - - let stdOut = - muxedStream.GetStream(Nullable(ChannelIndex.StdOut), Nullable()) - - let error = - muxedStream.GetStream(Nullable(ChannelIndex.Error), Nullable()) - - let errorReader = new StreamReader(error) - - LogInfo "Dumping metrics of peer %s" name.StringName - muxedStream.Start() - self.Destination.WriteStream(sprintf "%s.metrics.json" name.StringName) stdOut - let errors = errorReader.ReadToEndAsync().GetAwaiter().GetResult() - let returnMessage = SafeJsonConvert.DeserializeObject(errors) - Kubernetes.GetExitCodeOrThrow(returnMessage) |> ignore + self.Destination.WriteString (sprintf "%s.metrics.json" p.PodName.StringName) (p.GetRawMetrics()) with x -> LogWarn "Failed to dump metrics of peer %s: %s" p.PodName.StringName x.Message member self.DumpPeerData(p: Peer) = diff --git a/src/FSLibrary/StellarMissionContext.fs b/src/FSLibrary/StellarMissionContext.fs index c64b4cca..88883a4c 100644 --- a/src/FSLibrary/StellarMissionContext.fs +++ b/src/FSLibrary/StellarMissionContext.fs @@ -59,6 +59,7 @@ type MissionContext = ingressInternalDomain: string ingressExternalHost: string option ingressExternalPort: int + coreHttpViaPodExec: bool exportToPrometheus: bool probeTimeout: int coreResources: CoreResources From b04dc2c96075261ea1e7262b89376090b686f696 Mon Sep 17 00:00:00 2001 From: tomerweller Date: Thu, 18 Jun 2026 15:07:12 -0400 Subject: [PATCH 6/8] Apply fantomas formatting Co-Authored-By: Claude Opus 4.8 --- src/FSLibrary/StellarCoreHTTP.fs | 16 +++++++++++----- src/FSLibrary/StellarDataDump.fs | 2 +- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/FSLibrary/StellarCoreHTTP.fs b/src/FSLibrary/StellarCoreHTTP.fs index f7a7657e..7dd83554 100644 --- a/src/FSLibrary/StellarCoreHTTP.fs +++ b/src/FSLibrary/StellarCoreHTTP.fs @@ -416,7 +416,8 @@ type Peer with |> String.concat "&" |> sprintf "?%s" - let url = sprintf "http://localhost:%d/%s%s" StellarCoreCfg.CfgVal.httpPort path queryString + let url = + sprintf "http://localhost:%d/%s%s" StellarCoreCfg.CfgVal.httpPort path queryString try let muxedStream = @@ -431,10 +432,16 @@ type Peer with .GetResult() let stdOut = - muxedStream.GetStream(System.Nullable(ChannelIndex.StdOut), System.Nullable()) + muxedStream.GetStream( + System.Nullable(ChannelIndex.StdOut), + System.Nullable() + ) let error = - muxedStream.GetStream(System.Nullable(ChannelIndex.Error), System.Nullable()) + muxedStream.GetStream( + System.Nullable(ChannelIndex.Error), + System.Nullable() + ) let outReader = new StreamReader(stdOut) let errReader = new StreamReader(error) @@ -772,8 +779,7 @@ type Peer with ("inboundpeerindex", inboundPeersIndex.ToString()) ("outboundpeerindex", outboundPeersIndex.ToString()) ]) - member self.GetSurveyResult() = - WebExceptionRetry DefaultRetry (fun _ -> self.httpGet "getsurveyresult" []) + member self.GetSurveyResult() = WebExceptionRetry DefaultRetry (fun _ -> self.httpGet "getsurveyresult" []) member self.GetTestAccBalance(accName: string) : int64 = RetryUntilSome diff --git a/src/FSLibrary/StellarDataDump.fs b/src/FSLibrary/StellarDataDump.fs index 9a4688aa..973bfe00 100644 --- a/src/FSLibrary/StellarDataDump.fs +++ b/src/FSLibrary/StellarDataDump.fs @@ -246,7 +246,7 @@ type StellarFormation with // GetRawMetrics based on the --core-http-via-pod-exec flag. member self.DumpPeerMetrics(p: Peer) = try - self.Destination.WriteString (sprintf "%s.metrics.json" p.PodName.StringName) (p.GetRawMetrics()) + self.Destination.WriteString(sprintf "%s.metrics.json" p.PodName.StringName) (p.GetRawMetrics()) with x -> LogWarn "Failed to dump metrics of peer %s: %s" p.PodName.StringName x.Message member self.DumpPeerData(p: Peer) = From c97458bd25b02dadf56bf51ab9c8b6befe417fff Mon Sep 17 00:00:00 2001 From: tomerweller Date: Thu, 18 Jun 2026 15:14:00 -0400 Subject: [PATCH 7/8] Retry pod-exec fetch on empty response (core still booting) During WaitUntilReady, curl against a not-yet-ready core can return an empty body with exit 0, which previously reached Info.Parse and threw a non-retryable JSON error, crashing the mission. Surface an empty response as a WebException so the existing WebExceptionRetry retries until core is ready, matching how the ingress path retries on the 5xx it gets while core boots. Co-Authored-By: Claude Opus 4.8 --- src/FSLibrary/StellarCoreHTTP.fs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/FSLibrary/StellarCoreHTTP.fs b/src/FSLibrary/StellarCoreHTTP.fs index 7dd83554..3206484a 100644 --- a/src/FSLibrary/StellarCoreHTTP.fs +++ b/src/FSLibrary/StellarCoreHTTP.fs @@ -450,6 +450,15 @@ type Peer with let errStr = errReader.ReadToEndAsync().GetAwaiter().GetResult() let returnMessage = SafeJsonConvert.DeserializeObject(errStr) Kubernetes.GetExitCodeOrThrow(returnMessage) |> ignore + + // While core is still booting its admin HTTP server can answer with + // an empty body (curl exits 0). Treat that like a transient HTTP + // error so WebExceptionRetry retries rather than handing an empty + // string to a JSON parser. No core admin endpoint returns an empty + // body on success. + if System.String.IsNullOrEmpty outStr then + raise (System.Net.WebException(sprintf "pod-exec fetch of /%s returned an empty response" path)) + outStr with | :? System.Net.WebException -> reraise () From b50ec33e1625d9fd1618aef94a677a26f41906b7 Mon Sep 17 00:00:00 2001 From: tomerweller Date: Thu, 18 Jun 2026 15:23:38 -0400 Subject: [PATCH 8/8] Only require non-empty response on JSON reads, not all endpoints The previous empty-response guard lived in the transport and broke /upgrades, which legitimately returns an empty body on success (SetUpgrades only checks the body for "exception"). Move the empty-is-not-ready check into a fetchNonEmpty helper used by the JSON-parsing reads (info, metrics, sorobaninfo) only; action endpoints keep using fetch/httpGet and tolerate an empty body. Co-Authored-By: Claude Opus 4.8 --- src/FSLibrary/StellarCoreHTTP.fs | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/src/FSLibrary/StellarCoreHTTP.fs b/src/FSLibrary/StellarCoreHTTP.fs index 3206484a..8315d9fc 100644 --- a/src/FSLibrary/StellarCoreHTTP.fs +++ b/src/FSLibrary/StellarCoreHTTP.fs @@ -450,15 +450,6 @@ type Peer with let errStr = errReader.ReadToEndAsync().GetAwaiter().GetResult() let returnMessage = SafeJsonConvert.DeserializeObject(errStr) Kubernetes.GetExitCodeOrThrow(returnMessage) |> ignore - - // While core is still booting its admin HTTP server can answer with - // an empty body (curl exits 0). Treat that like a transient HTTP - // error so WebExceptionRetry retries rather than handing an empty - // string to a JSON parser. No core admin endpoint returns an empty - // body on success. - if System.String.IsNullOrEmpty outStr then - raise (System.Net.WebException(sprintf "pod-exec fetch of /%s returned an empty response" path)) - outStr with | :? System.Net.WebException -> reraise () @@ -474,6 +465,20 @@ type Peer with member self.fetch(path: string) : string = self.httpGet path [] + // For endpoints that must return a (JSON) body. While core is still booting, + // its admin HTTP server can answer with an empty body (and curl exits 0); + // surface that as a WebException so the JSON-parsing callers' retry loops + // wait for core rather than feeding an empty string to a parser. Endpoints + // that legitimately return an empty body on success (e.g. upgrades) keep + // using fetch/httpGet directly. + member self.fetchNonEmpty(path: string) : string = + let resp = self.fetch path + + if System.String.IsNullOrEmpty resp then + raise (System.Net.WebException(sprintf "core returned an empty response for /%s, not ready yet" path)) + + resp + member self.GetState() = self.GetInfo().State member self.GetStatusOrState() : string = @@ -481,15 +486,15 @@ type Peer with if i.Status.Length = 0 then i.State else i.Status.[0] member self.GetMetrics() : Metrics.Metrics = - WebExceptionRetry DefaultRetry (fun _ -> Metrics.Parse(self.fetch "metrics").Metrics) + WebExceptionRetry DefaultRetry (fun _ -> Metrics.Parse(self.fetchNonEmpty "metrics").Metrics) - member self.GetRawMetrics() = WebExceptionRetry DefaultRetry (fun _ -> self.fetch "metrics") + member self.GetRawMetrics() = WebExceptionRetry DefaultRetry (fun _ -> self.fetchNonEmpty "metrics") member self.GetInfo() : Info.Info = WebExceptionRetry DefaultRetry (fun _ -> - let resp = self.fetch "info" + let resp = self.fetchNonEmpty "info" let parsed = Info.Parse(resp) // stellar-core can respond with {"error":"Core is booting, try again later"} @@ -498,7 +503,7 @@ type Peer with | None -> raise (System.Net.WebException("Core is not ready, info property missing"))) member self.GetSorobanInfo() : SorobanInfo.Root = - WebExceptionRetry DefaultRetry (fun _ -> SorobanInfo.Parse(self.fetch "sorobaninfo")) + WebExceptionRetry DefaultRetry (fun _ -> SorobanInfo.Parse(self.fetchNonEmpty "sorobaninfo")) member self.GetLedgerNum() : int = self.GetInfo().Ledger.Num