diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 3cd0de8..db6d805 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -26,6 +26,9 @@ jobs: - name: Install deps and build run: npm ci && npm run build + - name: Run unit tests + run: npm test + - name: Setup kosli CLI id: setup uses: ./ diff --git a/src/index.js b/src/index.js index aaadfdf..66c5c45 100644 --- a/src/index.js +++ b/src/index.js @@ -2,6 +2,7 @@ import os from "os"; import * as core from "@actions/core"; import * as tc from "@actions/tool-cache"; import { getDownloadUrl, resolveVersion } from "./download.js"; +import { withRetries } from "./retry.js"; async function setup() { try { @@ -10,13 +11,21 @@ async function setup() { const platform = os.platform(); const arch = os.arch(); - const resolvedVersion = await resolveVersion(version, token); + const resolvedVersion = version === "latest" + ? await withRetries( + () => resolveVersion(version, token), + { onRetry: logRetry("resolving latest version") } + ) + : version; let pathToCLI = tc.find("kosli", resolvedVersion); if (!pathToCLI) { const downloadUrl = getDownloadUrl({ version: resolvedVersion, platform, arch }); console.log(`installing Kosli CLI from ${downloadUrl} ...`); - const pathToTarball = await tc.downloadTool(downloadUrl); + const pathToTarball = await withRetries( + () => tc.downloadTool(downloadUrl), + { onRetry: logRetry("downloading Kosli CLI") } + ); const extracted = await tc.extractTar(pathToTarball); pathToCLI = await tc.cacheDir(extracted, "kosli", resolvedVersion); } else { @@ -31,4 +40,12 @@ async function setup() { } } +function logRetry(label) { + return ({ attempt, retries, delayMs, error }) => { + core.warning( + `${label} failed (attempt ${attempt}/${retries}): ${error.message}. Retrying in ${delayMs}ms.` + ); + }; +} + setup(); diff --git a/src/retry.js b/src/retry.js new file mode 100644 index 0000000..89fb119 --- /dev/null +++ b/src/retry.js @@ -0,0 +1,71 @@ +// ECONNRESET / ETIMEDOUT / EAI_AGAIN / EPIPE are clearly transient. +// ENOTFOUND and ECONNREFUSED are deliberately excluded - the download URL is +// hardcoded by this action, so those codes indicate a hard GitHub outage or +// DNS-wide failure where retrying just delays the inevitable. +const TRANSIENT_NETWORK_CODES = new Set([ + "ECONNRESET", + "ETIMEDOUT", + "EAI_AGAIN", + "EPIPE" +]); + +export function isTransientError(err) { + if (!err) return false; + if (typeof err.httpStatusCode === "number") { + return err.httpStatusCode >= 500 || err.httpStatusCode === 429 || err.httpStatusCode === 408; + } + if (typeof err.status === "number") { + return err.status >= 500 || err.status === 429 || err.status === 408; + } + if (typeof err.code === "string" && TRANSIENT_NETWORK_CODES.has(err.code)) { + return true; + } + if (typeof err.message === "string" && /Unexpected HTTP response:\s*(5\d\d|429|408)/.test(err.message)) { + return true; + } + return false; +} + +// Full-jitter exponential backoff: chosen uniformly from [0, min(maxDelayMs, base * factor^attempt)]. +// See AWS Architecture Blog "Exponential Backoff And Jitter". Jitter prevents many parallel CI +// jobs from hammering GitHub in lockstep when an outage clears. +export function computeBackoff(attempt, { baseDelayMs, factor, maxDelayMs }, random = Math.random) { + const exp = baseDelayMs * Math.pow(factor, attempt); + const capped = Math.min(maxDelayMs, exp); + return Math.floor(random() * capped); +} + +function sleep(ms) { + return new Promise(resolve => setTimeout(resolve, ms)); +} + +// Defaults are tuned to stack on top of @actions/tool-cache's own internal +// retries (3 attempts, ~10-20s waits) without ballooning total time-to-fail. +// With these defaults the outer layer adds at most ~14s of jittered waits. +export async function withRetries(fn, options = {}) { + const { + retries = 3, + baseDelayMs = 2000, + factor = 2, + maxDelayMs = 15000, + shouldRetry = isTransientError, + onRetry = () => {}, + sleeper = sleep, + random = Math.random + } = options; + + let attempt = 0; + while (true) { + try { + return await fn(attempt); + } catch (err) { + if (attempt >= retries || !shouldRetry(err)) { + throw err; + } + const delay = computeBackoff(attempt, { baseDelayMs, factor, maxDelayMs }, random); + onRetry({ attempt: attempt + 1, retries, delayMs: delay, error: err }); + await sleeper(delay); + attempt++; + } + } +} diff --git a/test/retry.test.js b/test/retry.test.js new file mode 100644 index 0000000..9f0cacc --- /dev/null +++ b/test/retry.test.js @@ -0,0 +1,132 @@ +import test from "ava"; +import { withRetries, isTransientError, computeBackoff } from "../src/retry.js"; + +const noSleep = () => Promise.resolve(); +const fixedRandom = () => 0.5; + +test("returns the result when fn succeeds on first attempt", async t => { + let calls = 0; + const result = await withRetries(async () => { calls++; return "ok"; }, { sleeper: noSleep }); + t.is(result, "ok"); + t.is(calls, 1); +}); + +test("retries transient errors and eventually succeeds", async t => { + let calls = 0; + const result = await withRetries( + async () => { + calls++; + if (calls < 3) { + const err = new Error("Unexpected HTTP response: 504"); + err.httpStatusCode = 504; + throw err; + } + return "ok"; + }, + { sleeper: noSleep, random: fixedRandom } + ); + t.is(result, "ok"); + t.is(calls, 3); +}); + +test("does not retry non-transient errors", async t => { + let calls = 0; + const err = new Error("Unexpected HTTP response: 404"); + err.httpStatusCode = 404; + await t.throwsAsync( + withRetries(async () => { calls++; throw err; }, { sleeper: noSleep }), + { message: /404/ } + ); + t.is(calls, 1); +}); + +test("throws after exhausting retries", async t => { + let calls = 0; + const err = new Error("Unexpected HTTP response: 503"); + err.httpStatusCode = 503; + await t.throwsAsync( + withRetries(async () => { calls++; throw err; }, { retries: 2, sleeper: noSleep, random: fixedRandom }), + { message: /503/ } + ); + t.is(calls, 3); // initial + 2 retries +}); + +test("invokes onRetry with attempt metadata", async t => { + const events = []; + let calls = 0; + await withRetries( + async () => { + calls++; + if (calls < 2) { + const err = new Error("Unexpected HTTP response: 502"); + err.httpStatusCode = 502; + throw err; + } + return "ok"; + }, + { + retries: 3, + baseDelayMs: 1000, + factor: 2, + maxDelayMs: 60000, + sleeper: noSleep, + random: fixedRandom, + onRetry: e => events.push(e) + } + ); + t.is(events.length, 1); + t.is(events[0].attempt, 1); + t.is(events[0].retries, 3); + t.is(events[0].delayMs, 500); // floor(0.5 * 1000) + t.is(events[0].error.httpStatusCode, 502); +}); + +test("isTransientError identifies HTTP 5xx, 429, 408", t => { + t.true(isTransientError({ httpStatusCode: 500 })); + t.true(isTransientError({ httpStatusCode: 504 })); + t.true(isTransientError({ httpStatusCode: 429 })); + t.true(isTransientError({ httpStatusCode: 408 })); + t.false(isTransientError({ httpStatusCode: 404 })); + t.false(isTransientError({ httpStatusCode: 401 })); +}); + +test("isTransientError identifies Node network error codes", t => { + t.true(isTransientError({ code: "ECONNRESET" })); + t.true(isTransientError({ code: "ETIMEDOUT" })); + t.true(isTransientError({ code: "EAI_AGAIN" })); + t.true(isTransientError({ code: "EPIPE" })); + t.false(isTransientError({ code: "EACCES" })); +}); + +test("isTransientError excludes ENOTFOUND and ECONNREFUSED (treated as hard failures)", t => { + t.false(isTransientError({ code: "ENOTFOUND" })); + t.false(isTransientError({ code: "ECONNREFUSED" })); +}); + +test("isTransientError matches HTTP response codes in error messages", t => { + t.true(isTransientError(new Error("Unexpected HTTP response: 504"))); + t.true(isTransientError(new Error("Unexpected HTTP response: 502"))); + t.false(isTransientError(new Error("Unexpected HTTP response: 404"))); + t.false(isTransientError(new Error("some random parse error"))); +}); + +test("isTransientError handles null/undefined safely", t => { + t.false(isTransientError(null)); + t.false(isTransientError(undefined)); +}); + +test("computeBackoff respects cap and applies jitter", t => { + // attempt 0, base 1000, factor 2, cap 60000, random=1 → just under 1000 + const d0 = computeBackoff(0, { baseDelayMs: 1000, factor: 2, maxDelayMs: 60000 }, () => 0.999); + t.true(d0 < 1000); + t.true(d0 >= 0); + + // attempt 10 with base 1000, factor 2 → 1024000ms, capped at 60000 + const d10 = computeBackoff(10, { baseDelayMs: 1000, factor: 2, maxDelayMs: 60000 }, () => 0.999); + t.true(d10 < 60000); + t.true(d10 >= 59000); + + // random=0 always returns 0 + const d0min = computeBackoff(3, { baseDelayMs: 1000, factor: 2, maxDelayMs: 60000 }, () => 0); + t.is(d0min, 0); +}); diff --git a/workflows/test.yml b/workflows/test.yml deleted file mode 100644 index 44b215e..0000000 --- a/workflows/test.yml +++ /dev/null @@ -1,44 +0,0 @@ -name: Tests - -on: - push: - branches: - - main - pull_request: - -defaults: - run: - shell: bash - -jobs: - test: - name: Test - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [macos-latest, windows-latest, ubuntu-latest] - version: [0.1.14] - steps: - - name: Checkout - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - - - name: Setup kosli CLI - uses: ./ - with: - version: ${{ matrix.version }} - - - name: Capture kosli version installed - run: | - export KOSLI_VERSION=$( kosli version --short ) - echo 'KOSLI_VERSION_INSTALLED<> $GITHUB_ENV - kosli version --short >> $GITHUB_ENV - echo 'EOF' >> $GITHUB_ENV - - name: Verify - shell: python - env: - KOSLI_VERSION_EXPECTED: ${{ matrix.version }} - run: | - import sys, os - sys.exit( - int(not os.environ["KOSLI_VERSION_EXPECTED"] in os.environ["KOSLI_VERSION_INSTALLED"]) - )