diff --git a/backend/src/lib/trustline-manager.js b/backend/src/lib/trustline-manager.js index 5f743eb..8c8014e 100644 --- a/backend/src/lib/trustline-manager.js +++ b/backend/src/lib/trustline-manager.js @@ -1,25 +1,25 @@ /** * Trustline Manager - Enhanced cryptographic verification, rate limiting, error recovery, and optimized queries - * + * * This module provides comprehensive trustline management functionality with: * - Task #595: Cryptographic signature verification for trustline operations * - Task #594: Rate limiting for trustline operations - * - Task #597: Enhanced error recovery mechanisms + * - Task #746: Enhanced error recovery mechanisms * - Task #596: Optimized SQL queries for trustline data */ import { createHash } from "node:crypto"; import * as StellarSdk from "stellar-sdk"; import { queryWithRetry } from "./db.js"; -import { - isValidStellarAccountId, +import { + isValidStellarAccountId, verifyTransactionSignature, withHorizonRetry, - isValidAssetCode + isValidAssetCode, } from "./stellar.js"; -import { +import { createRedisRateLimitStore, - RATE_LIMIT_REDIS_PREFIX + RATE_LIMIT_REDIS_PREFIX, } from "./rate-limit.js"; import rateLimit, { ipKeyGenerator } from "express-rate-limit"; @@ -33,13 +33,24 @@ const MAX_RETRY_ATTEMPTS = 3; const RETRY_DELAY_BASE_MS = 1000; const CIRCUIT_BREAKER_THRESHOLD = 5; const CIRCUIT_BREAKER_TIMEOUT_MS = 30 * 1000; +const CIRCUIT_BREAKER_HALF_OPEN_PROBE_MS = 5 * 1000; // time before allowing probe in half-open +const OPERATION_TIMEOUT_MS = 15 * 1000; // default per-operation timeout +const DLQ_MAX_SIZE = 100; // maximum dead-letter queue entries -// Circuit breaker state -let circuitBreakerState = { - failures: 0, - lastFailureTime: null, - isOpen: false -}; +/** + * Per-context circuit breaker states. + * Key: context string (or 'default'). Value: CircuitBreakerState. + * + * Using a Map keeps failure domains isolated so a surge in one operation + * (e.g. Horizon calls) does not block unrelated DB queries. + */ +const circuitBreakerRegistry = new Map(); + +/** + * In-memory dead-letter queue for failed operations that exhausted retries. + * Entries are available for inspection, replay, or external alerting. + */ +const deadLetterQueue = []; /** * Task #595: Enhanced cryptographic signature verification for trustline operations @@ -274,7 +285,7 @@ export class TrustlineRateLimiter { legacyHeaders: false, validate: { ip: false }, keyGenerator: this.getTrustlineOperationKey, - requestWasSuccessful: (req, res) => res.statusCode < 400, + requestWasSuccessful: (_req, res) => res.statusCode < 400, store, passOnStoreError: true, // Skip rate limiting for high-tier merchants @@ -300,7 +311,7 @@ export class TrustlineRateLimiter { legacyHeaders: false, validate: { ip: false }, keyGenerator: this.getTrustlineVerificationKey, - requestWasSuccessful: (req, res) => res.statusCode < 400, + requestWasSuccessful: (_req, res) => res.statusCode < 400, store, passOnStoreError: true }); @@ -308,70 +319,271 @@ export class TrustlineRateLimiter { } /** - * Task #597: Enhanced error recovery for trustline operations - * - * Implements robust error recovery mechanisms: - * - Exponential backoff retry logic - * - Circuit breaker pattern - * - Graceful degradation - * - Comprehensive error classification + * Task #746: Enhanced error recovery for trustline operations + * + * Improvements over the previous implementation: + * - Per-context circuit breakers (failure domains are isolated) + * - Half-open state: one probe attempt before fully closing the breaker + * - Operation timeout wrapper (prevents runaway async calls) + * - Dead-letter queue for unrecoverable failures (replay / alerting) + * - Error metrics per context (failure counts, last error, recovery counts) + * - Pluggable fallback handlers so callers can return cached/degraded data + * - Comprehensive error classification unchanged but extended for auth errors */ export class TrustlineErrorRecovery { - + + // ─── Circuit Breaker Helpers ──────────────────────────────────────────────── + + static _getState(context) { + if (!circuitBreakerRegistry.has(context)) { + circuitBreakerRegistry.set(context, { + failures: 0, + lastFailureTime: null, + // States: 'closed' | 'open' | 'half-open' + state: 'closed', + successAfterHalfOpen: 0, + metrics: { + totalFailures: 0, + totalRecoveries: 0, + lastErrorMessage: null, + lastErrorTime: null, + }, + }); + } + return circuitBreakerRegistry.get(context); + } + + /** + * Returns the circuit breaker disposition for a given context. + * 'allow' – proceed normally + * 'probe' – one probe allowed (half-open state) + * 'reject' – circuit open, reject immediately + */ + static _circuitBreakerDisposition(context) { + const s = this._getState(context); + const now = Date.now(); + + if (s.state === 'closed') return 'allow'; + + if (s.state === 'open') { + const elapsed = now - s.lastFailureTime; + if (elapsed >= CIRCUIT_BREAKER_TIMEOUT_MS) { + // Transition to half-open: allow a single probe + s.state = 'half-open'; + s.successAfterHalfOpen = 0; + return 'probe'; + } + // Still within the open window + if (elapsed >= CIRCUIT_BREAKER_HALF_OPEN_PROBE_MS && s.state === 'open') { + return 'reject'; + } + return 'reject'; + } + + // half-open: allow probe only once + if (s.state === 'half-open') return 'probe'; + + return 'allow'; + } + + static _recordFailure(context, error) { + const s = this._getState(context); + s.failures++; + s.lastFailureTime = Date.now(); + s.metrics.totalFailures++; + s.metrics.lastErrorMessage = error?.message ?? String(error); + s.metrics.lastErrorTime = new Date().toISOString(); + + if (s.state === 'half-open') { + // Probe failed – reopen the circuit + s.state = 'open'; + } else if (s.failures >= CIRCUIT_BREAKER_THRESHOLD) { + s.state = 'open'; + } + } + + static _recordSuccess(context) { + const s = this._getState(context); + if (s.state === 'half-open') { + // Probe succeeded – close the circuit + s.state = 'closed'; + s.failures = 0; + s.metrics.totalRecoveries++; + } else { + s.failures = 0; + s.metrics.totalRecoveries++; + } + } + + // ─── Dead-Letter Queue ─────────────────────────────────────────────────────── + + static _pushToDeadLetterQueue(entry) { + if (deadLetterQueue.length >= DLQ_MAX_SIZE) { + deadLetterQueue.shift(); // evict oldest + } + deadLetterQueue.push({ + ...entry, + enqueuedAt: new Date().toISOString(), + }); + } + + /** Returns a shallow copy of the dead-letter queue for inspection. */ + static getDeadLetterQueue() { + return [...deadLetterQueue]; + } + + /** Drain (clear) the dead-letter queue and return its contents. */ + static drainDeadLetterQueue() { + return deadLetterQueue.splice(0, deadLetterQueue.length); + } + + // ─── Timeout Wrapper ───────────────────────────────────────────────────────── + + /** + * Wraps a promise with a hard timeout. + * Rejects with a timeout error if the operation takes longer than `ms`. + */ + static withTimeout(promise, ms = OPERATION_TIMEOUT_MS, label = 'operation') { + let timer; + const timeout = new Promise((_, reject) => { + timer = setTimeout(() => { + const err = new Error(`${label} timed out after ${ms}ms`); + err.isTimeout = true; + reject(err); + }, ms); + }); + return Promise.race([promise, timeout]).finally(() => clearTimeout(timer)); + } + + // ─── Core Execution ────────────────────────────────────────────────────────── + /** - * Execute operation with enhanced error recovery + * Execute operation with enhanced error recovery. + * + * Options: + * timeoutMs – hard timeout per attempt (default: OPERATION_TIMEOUT_MS) + * fallback – async fn() called when all attempts fail; its return value + * is returned instead of throwing (graceful degradation) + * maxAttempts – override MAX_RETRY_ATTEMPTS for this call */ - static async executeWithRecovery(operation, context = "trustline operation") { - // Check circuit breaker - if (this.isCircuitBreakerOpen()) { - throw new Error(`Circuit breaker is open for ${context}. Service temporarily unavailable.`); + static async executeWithRecovery( + operation, + context = 'trustline operation', + { timeoutMs = OPERATION_TIMEOUT_MS, fallback = null, maxAttempts = MAX_RETRY_ATTEMPTS } = {}, + ) { + const disposition = this._circuitBreakerDisposition(context); + + if (disposition === 'reject') { + const cbError = new Error( + `Circuit breaker is open for "${context}". Service temporarily unavailable.`, + ); + cbError.isCircuitBreakerOpen = true; + cbError.status = 503; + + if (fallback) { + try { + return await fallback(cbError); + } catch (_) { /* fall through to throw */ } + } + throw cbError; } + // Half-open probes run exactly once (maxAttempts=1, no retry) + const effectiveMaxAttempts = disposition === 'probe' ? 1 : maxAttempts; + let lastError = null; - - for (let attempt = 1; attempt <= MAX_RETRY_ATTEMPTS; attempt++) { + + for (let attempt = 1; attempt <= effectiveMaxAttempts; attempt++) { try { - const result = await operation(); - - // Reset circuit breaker on success - this.resetCircuitBreaker(); - + const result = await this.withTimeout( + Promise.resolve().then(() => operation()), + timeoutMs, + context, + ); + + this._recordSuccess(context); return result; } catch (error) { lastError = error; - - // Classify error to determine if retry is appropriate + const errorClass = this.classifyError(error); - - if (!errorClass.retryable || attempt === MAX_RETRY_ATTEMPTS) { - this.recordFailure(); - throw this.enhanceError(error, context, attempt, errorClass); + + if (!errorClass.retryable || attempt === effectiveMaxAttempts) { + this._recordFailure(context, error); + const enhanced = this.enhanceError(error, context, attempt, errorClass); + + // Push to dead-letter queue for non-retryable terminal failures + if (!errorClass.retryable) { + this._pushToDeadLetterQueue({ + context, + errorType: errorClass.type, + errorMessage: error.message, + attempts: attempt, + }); + } + + if (fallback) { + try { + return await fallback(enhanced); + } catch (_) { /* fall through to throw */ } + } + throw enhanced; } - // Calculate delay with exponential backoff and jitter const delay = this.calculateRetryDelay(attempt, errorClass.priority); await this.sleep(delay); } } - this.recordFailure(); - throw this.enhanceError(lastError, context, MAX_RETRY_ATTEMPTS, this.classifyError(lastError)); + this._recordFailure(context, lastError); + const finalEnhanced = this.enhanceError( + lastError, + context, + effectiveMaxAttempts, + this.classifyError(lastError), + ); + + this._pushToDeadLetterQueue({ + context, + errorType: this.classifyError(lastError).type, + errorMessage: lastError?.message, + attempts: effectiveMaxAttempts, + }); + + if (fallback) { + try { + return await fallback(finalEnhanced); + } catch (_) { /* fall through to throw */ } + } + throw finalEnhanced; } + // ─── Error Classification ───────────────────────────────────────────────── + /** - * Classify errors for appropriate recovery strategy + * Classify errors for appropriate recovery strategy. */ static classifyError(error) { const message = error.message?.toLowerCase() || ''; const status = error.status || error.response?.status; + // Timeout errors - retryable + if (error.isTimeout || message.includes('timed out')) { + return { + type: 'timeout', + retryable: true, + priority: 'high', + reason: 'Operation timed out', + }; + } + // Database schema errors - not retryable if (message.includes('index already exists') || message.includes('relation already exists')) { return { type: 'db_schema_conflict', retryable: false, priority: 'none', - reason: 'Database schema conflict, such as an index already existing.' + reason: 'Database schema conflict, such as an index already existing.', }; } @@ -389,7 +601,7 @@ export class TrustlineErrorRecovery { type: 'network', retryable: true, priority: 'high', - reason: 'Network connectivity issue' + reason: 'Network connectivity issue', }; } @@ -399,17 +611,27 @@ export class TrustlineErrorRecovery { type: 'rate_limit', retryable: true, priority: 'low', - reason: 'Rate limit exceeded' + reason: 'Rate limit exceeded', + }; + } + + // Authentication/authorization errors - not retryable + if (status === 401 || status === 403) { + return { + type: 'auth_error', + retryable: false, + priority: 'none', + reason: 'Authentication or authorization failure', }; } // Horizon server errors - retryable - if (status >= 500 && status < 600) { + if (typeof status === 'number' && status >= 500 && status < 600) { return { type: 'server_error', retryable: true, priority: 'medium', - reason: 'Server error' + reason: 'Server error', }; } @@ -421,7 +643,7 @@ export class TrustlineErrorRecovery { type: 'asset_not_found', retryable: false, priority: 'none', - reason: 'Asset or account not found' + reason: 'Asset or account not found', }; } @@ -431,18 +653,18 @@ export class TrustlineErrorRecovery { type: 'insufficient_balance', retryable: false, priority: 'none', - reason: 'Insufficient balance for operation' + reason: 'Insufficient balance for operation', }; } } // Client errors (4xx) - generally not retryable - if (status >= 400 && status < 500) { + if (typeof status === 'number' && status >= 400 && status < 500) { return { type: 'client_error', retryable: false, priority: 'none', - reason: 'Client error - check request parameters' + reason: 'Client error - check request parameters', }; } @@ -451,78 +673,83 @@ export class TrustlineErrorRecovery { type: 'unknown', retryable: true, priority: 'low', - reason: 'Unknown error type' + reason: 'Unknown error type', }; } + // ─── Retry Delay ───────────────────────────────────────────────────────────── + /** - * Calculate retry delay with exponential backoff + * Calculate retry delay with exponential backoff and ±25 % jitter. */ static calculateRetryDelay(attempt, priority = 'medium') { - const baseDelay = RETRY_DELAY_BASE_MS; const multiplier = priority === 'high' ? 1 : priority === 'low' ? 3 : 2; - const exponentialDelay = baseDelay * Math.pow(2, attempt - 1) * multiplier; - - // Add jitter (±25%) + const exponentialDelay = RETRY_DELAY_BASE_MS * Math.pow(2, attempt - 1) * multiplier; const jitter = exponentialDelay * 0.25 * (Math.random() - 0.5); - - return Math.min(exponentialDelay + jitter, 30000); // Cap at 30 seconds + return Math.min(exponentialDelay + jitter, 30000); // cap at 30 s } - /** - * Enhanced error with recovery context - */ + // ─── Error Enhancement ──────────────────────────────────────────────────────── + static enhanceError(originalError, context, attempts, errorClass) { const enhanced = new Error( - `${context} failed after ${attempts} attempts: ${originalError.message} (${errorClass.reason})` + `${context} failed after ${attempts} attempt${attempts !== 1 ? 's' : ''}: ${originalError.message} (${errorClass.reason})`, ); - enhanced.originalError = originalError; enhanced.context = context; enhanced.attempts = attempts; enhanced.errorClass = errorClass; enhanced.status = originalError.status || 500; enhanced.recoverable = errorClass.retryable; - return enhanced; } + // ─── Public Circuit-Breaker API ─────────────────────────────────────────────── + /** - * Circuit breaker management + * Returns true when the circuit breaker for `context` is open (backwards compat). + * Pass no argument to check the legacy global context ('default'). */ - static isCircuitBreakerOpen() { - if (!circuitBreakerState.isOpen) { - return false; - } + static isCircuitBreakerOpen(context = 'default') { + return this._circuitBreakerDisposition(context) === 'reject'; + } - // Check if timeout has passed - const now = Date.now(); - if (now - circuitBreakerState.lastFailureTime > CIRCUIT_BREAKER_TIMEOUT_MS) { - circuitBreakerState.isOpen = false; - circuitBreakerState.failures = 0; - return false; + /** + * Manually reset the circuit breaker for a given context. + * Useful in tests and administrative endpoints. + */ + static resetCircuitBreaker(context = null) { + if (context) { + circuitBreakerRegistry.delete(context); + } else { + // Legacy behaviour: reset all breakers + circuitBreakerRegistry.clear(); } - - return true; } - static recordFailure() { - circuitBreakerState.failures++; - circuitBreakerState.lastFailureTime = Date.now(); - - if (circuitBreakerState.failures >= CIRCUIT_BREAKER_THRESHOLD) { - circuitBreakerState.isOpen = true; + /** Return a snapshot of all circuit breaker states (for health endpoints). */ + static getCircuitBreakerMetrics() { + const snapshot = {}; + for (const [ctx, state] of circuitBreakerRegistry.entries()) { + snapshot[ctx] = { + state: state.state, + failures: state.failures, + lastFailureTime: state.lastFailureTime, + metrics: { ...state.metrics }, + }; } + return snapshot; } - static resetCircuitBreaker() { - circuitBreakerState.failures = 0; - circuitBreakerState.isOpen = false; - circuitBreakerState.lastFailureTime = null; + // ─── Compat / Helpers ───────────────────────────────────────────────────────── + + /** Legacy compatibility: record a failure on the default context. */ + static recordFailure() { + this._recordFailure('default', null); } - static sleep(ms) { - return new Promise(resolve => setTimeout(resolve, ms)); + static async sleep(ms) { + await new Promise((resolve) => setTimeout(resolve, ms)); } } diff --git a/backend/src/lib/trustline-manager.test.js b/backend/src/lib/trustline-manager.test.js index 61d7d7a..e8ae863 100644 --- a/backend/src/lib/trustline-manager.test.js +++ b/backend/src/lib/trustline-manager.test.js @@ -72,7 +72,7 @@ import { isValidAssetCode } from './stellar.js'; import * as StellarSdk from 'stellar-sdk'; -import rateLimit from 'express-rate-limit'; +import _rateLimit from 'express-rate-limit'; describe('Trustline Manager - Task #595: Cryptographic Signature Verification', () => { let verifier; @@ -291,100 +291,350 @@ describe('Trustline Manager - Task #594: Rate Limiting', () => { }); }); -describe('Trustline Manager - Task #597: Error Recovery', () => { +describe('Trustline Manager - Task #746: Enhanced Error Recovery', () => { beforeEach(() => { - // Reset circuit breaker state + // Reset ALL per-context circuit breakers and drain the DLQ TrustlineErrorRecovery.resetCircuitBreaker(); + TrustlineErrorRecovery.drainDeadLetterQueue(); vi.clearAllMocks(); }); - describe('TrustlineErrorRecovery', () => { + describe('TrustlineErrorRecovery – basic execution', () => { test('should execute operation successfully on first try', async () => { const mockOperation = vi.fn().mockResolvedValue('success'); - const result = await TrustlineErrorRecovery.executeWithRecovery(mockOperation); - expect(result).toBe('success'); expect(mockOperation).toHaveBeenCalledTimes(1); }); test('should retry on retryable errors', async () => { + vi.spyOn(TrustlineErrorRecovery, 'sleep').mockResolvedValue(undefined); const mockOperation = vi.fn() .mockRejectedValueOnce(new Error('network timeout')) .mockRejectedValueOnce(new Error('connection refused')) .mockResolvedValue('success'); - + const result = await TrustlineErrorRecovery.executeWithRecovery(mockOperation); - expect(result).toBe('success'); expect(mockOperation).toHaveBeenCalledTimes(3); + + vi.restoreAllMocks(); }); test('should not retry on non-retryable errors', async () => { const error = new Error('asset not found'); error.status = 404; const mockOperation = vi.fn().mockRejectedValue(error); - + await expect( - TrustlineErrorRecovery.executeWithRecovery(mockOperation) + TrustlineErrorRecovery.executeWithRecovery(mockOperation), ).rejects.toThrow('asset not found'); - + expect(mockOperation).toHaveBeenCalledTimes(1); }); + }); - test('should classify network errors as retryable', () => { + describe('TrustlineErrorRecovery – error classification', () => { + test('should classify network errors as retryable with high priority', () => { const networkError = new Error('network timeout'); - const classification = TrustlineErrorRecovery.classifyError(networkError); - - expect(classification.type).toBe('network'); - expect(classification.retryable).toBe(true); - expect(classification.priority).toBe('high'); + const c = TrustlineErrorRecovery.classifyError(networkError); + expect(c.type).toBe('network'); + expect(c.retryable).toBe(true); + expect(c.priority).toBe('high'); }); - test('should classify rate limit errors as retryable with low priority', () => { - const rateLimitError = new Error('rate limit exceeded'); - rateLimitError.status = 429; - const classification = TrustlineErrorRecovery.classifyError(rateLimitError); - - expect(classification.type).toBe('rate_limit'); - expect(classification.retryable).toBe(true); - expect(classification.priority).toBe('low'); + test('should classify timeout errors (isTimeout flag) as retryable', () => { + const err = new Error('operation timed out after 15000ms'); + err.isTimeout = true; + const c = TrustlineErrorRecovery.classifyError(err); + expect(c.type).toBe('timeout'); + expect(c.retryable).toBe(true); }); - test('should classify client errors as non-retryable', () => { - const clientError = new Error('bad request'); - clientError.status = 400; - const classification = TrustlineErrorRecovery.classifyError(clientError); - - expect(classification.type).toBe('client_error'); - expect(classification.retryable).toBe(false); + test('should classify rate limit errors (HTTP 429) as retryable with low priority', () => { + const err = new Error('rate limit exceeded'); + err.status = 429; + const c = TrustlineErrorRecovery.classifyError(err); + expect(c.type).toBe('rate_limit'); + expect(c.retryable).toBe(true); + expect(c.priority).toBe('low'); }); - test('should open circuit breaker after threshold failures', async () => { + test('should classify HTTP 401 as auth_error (non-retryable)', () => { + const err = new Error('unauthorized'); + err.status = 401; + const c = TrustlineErrorRecovery.classifyError(err); + expect(c.type).toBe('auth_error'); + expect(c.retryable).toBe(false); + }); + + test('should classify HTTP 403 as auth_error (non-retryable)', () => { + const err = new Error('forbidden'); + err.status = 403; + const c = TrustlineErrorRecovery.classifyError(err); + expect(c.type).toBe('auth_error'); + expect(c.retryable).toBe(false); + }); + + test('should classify client errors (4xx) as non-retryable', () => { + const err = new Error('bad request'); + err.status = 400; + const c = TrustlineErrorRecovery.classifyError(err); + expect(c.type).toBe('client_error'); + expect(c.retryable).toBe(false); + }); + + test('should classify 5xx server errors as retryable', () => { + const err = new Error('internal server error'); + err.status = 500; + const c = TrustlineErrorRecovery.classifyError(err); + expect(c.type).toBe('server_error'); + expect(c.retryable).toBe(true); + }); + + test('should classify db schema conflict as non-retryable', () => { + const err = new Error('index already exists'); + const c = TrustlineErrorRecovery.classifyError(err); + expect(c.type).toBe('db_schema_conflict'); + expect(c.retryable).toBe(false); + }); + + test('should classify unknown errors as cautiously retryable', () => { + const err = new Error('something weird happened'); + const c = TrustlineErrorRecovery.classifyError(err); + expect(c.type).toBe('unknown'); + expect(c.retryable).toBe(true); + }); + }); + + describe('TrustlineErrorRecovery – per-context circuit breakers', () => { + test('should open circuit breaker after threshold failures on a context', async () => { vi.spyOn(TrustlineErrorRecovery, 'sleep').mockResolvedValue(undefined); - const mockOperation = vi.fn().mockRejectedValue(new Error('server error')); - - // Trigger multiple failures to open circuit breaker + const ctx = 'test-context-cb'; + const failingOp = vi.fn().mockRejectedValue(new Error('server error')); + + // Each call exhausts MAX_RETRY_ATTEMPTS, recording one failure per call for (let i = 0; i < 5; i++) { - await expect(TrustlineErrorRecovery.executeWithRecovery(mockOperation)).rejects.toThrow(); + await expect( + TrustlineErrorRecovery.executeWithRecovery(failingOp, ctx), + ).rejects.toThrow(); } - - // Circuit breaker should now be open + await expect( - TrustlineErrorRecovery.executeWithRecovery(mockOperation) + TrustlineErrorRecovery.executeWithRecovery(failingOp, ctx), ).rejects.toThrow('Circuit breaker is open'); vi.restoreAllMocks(); }); - test('should calculate retry delay with exponential backoff', () => { - const delay1 = TrustlineErrorRecovery.calculateRetryDelay(1, 'high'); - const delay2 = TrustlineErrorRecovery.calculateRetryDelay(2, 'high'); - const delay3 = TrustlineErrorRecovery.calculateRetryDelay(3, 'high'); - - expect(delay2).toBeGreaterThan(delay1); - expect(delay3).toBeGreaterThan(delay2); - expect(delay3).toBeLessThanOrEqual(30000); // Capped at 30 seconds + test('should isolate circuit breakers per context', async () => { + vi.spyOn(TrustlineErrorRecovery, 'sleep').mockResolvedValue(undefined); + const failingOp = vi.fn().mockRejectedValue(new Error('server error')); + + // Open the circuit for context A + for (let i = 0; i < 5; i++) { + await expect( + TrustlineErrorRecovery.executeWithRecovery(failingOp, 'context-A'), + ).rejects.toThrow(); + } + + // Context B should still be operational + const successOp = vi.fn().mockResolvedValue('ok'); + const result = await TrustlineErrorRecovery.executeWithRecovery(successOp, 'context-B'); + expect(result).toBe('ok'); + + vi.restoreAllMocks(); + }); + + test('isCircuitBreakerOpen returns false when breaker is closed', () => { + expect(TrustlineErrorRecovery.isCircuitBreakerOpen('fresh-context')).toBe(false); + }); + + test('resetCircuitBreaker(context) clears only that context', async () => { + vi.spyOn(TrustlineErrorRecovery, 'sleep').mockResolvedValue(undefined); + const failingOp = vi.fn().mockRejectedValue(new Error('server error')); + + for (let i = 0; i < 5; i++) { + await expect( + TrustlineErrorRecovery.executeWithRecovery(failingOp, 'ctx-reset'), + ).rejects.toThrow(); + } + + expect(TrustlineErrorRecovery.isCircuitBreakerOpen('ctx-reset')).toBe(true); + TrustlineErrorRecovery.resetCircuitBreaker('ctx-reset'); + expect(TrustlineErrorRecovery.isCircuitBreakerOpen('ctx-reset')).toBe(false); + + vi.restoreAllMocks(); + }); + + test('getCircuitBreakerMetrics returns state snapshots', async () => { + vi.spyOn(TrustlineErrorRecovery, 'sleep').mockResolvedValue(undefined); + const failingOp = vi.fn().mockRejectedValue(new Error('server error')); + + await expect( + TrustlineErrorRecovery.executeWithRecovery(failingOp, 'metrics-ctx'), + ).rejects.toThrow(); + + const metrics = TrustlineErrorRecovery.getCircuitBreakerMetrics(); + expect(metrics['metrics-ctx']).toBeDefined(); + expect(metrics['metrics-ctx'].metrics.totalFailures).toBeGreaterThan(0); + + vi.restoreAllMocks(); + }); + }); + + describe('TrustlineErrorRecovery – half-open circuit breaker', () => { + test('should allow a probe after timeout and close on success', async () => { + vi.spyOn(TrustlineErrorRecovery, 'sleep').mockResolvedValue(undefined); + const ctx = 'half-open-ctx'; + const failingOp = vi.fn().mockRejectedValue(new Error('server error')); + + // Force open + for (let i = 0; i < 5; i++) { + await expect( + TrustlineErrorRecovery.executeWithRecovery(failingOp, ctx), + ).rejects.toThrow(); + } + + // Manually advance the state to half-open by mutating internal state + const state = TrustlineErrorRecovery._getState(ctx); + state.state = 'half-open'; + + const successOp = vi.fn().mockResolvedValue('recovered'); + const result = await TrustlineErrorRecovery.executeWithRecovery(successOp, ctx); + expect(result).toBe('recovered'); + expect(state.state).toBe('closed'); + + vi.restoreAllMocks(); + }); + + test('should reopen circuit if half-open probe fails', async () => { + vi.spyOn(TrustlineErrorRecovery, 'sleep').mockResolvedValue(undefined); + const ctx = 'half-open-fail-ctx'; + const failingOp = vi.fn().mockRejectedValue(new Error('server error')); + + for (let i = 0; i < 5; i++) { + await expect( + TrustlineErrorRecovery.executeWithRecovery(failingOp, ctx), + ).rejects.toThrow(); + } + + const state = TrustlineErrorRecovery._getState(ctx); + state.state = 'half-open'; + + await expect( + TrustlineErrorRecovery.executeWithRecovery(failingOp, ctx), + ).rejects.toThrow(); + + expect(state.state).toBe('open'); + + vi.restoreAllMocks(); + }); + }); + + describe('TrustlineErrorRecovery – operation timeout', () => { + test('withTimeout rejects after the specified delay', async () => { + const neverResolves = new Promise(() => {}); + await expect( + TrustlineErrorRecovery.withTimeout(neverResolves, 50, 'slow op'), + ).rejects.toThrow('slow op timed out after 50ms'); + }); + + test('withTimeout resolves if operation completes in time', async () => { + const fast = Promise.resolve('quick'); + await expect( + TrustlineErrorRecovery.withTimeout(fast, 1000, 'fast op'), + ).resolves.toBe('quick'); + }); + }); + + describe('TrustlineErrorRecovery – dead-letter queue', () => { + test('should push non-retryable failures to the DLQ', async () => { + const err = new Error('asset not found'); + err.status = 404; + const failingOp = vi.fn().mockRejectedValue(err); + + await expect( + TrustlineErrorRecovery.executeWithRecovery(failingOp, 'dlq-ctx'), + ).rejects.toThrow(); + + const dlq = TrustlineErrorRecovery.getDeadLetterQueue(); + expect(dlq.length).toBeGreaterThan(0); + expect(dlq[0].context).toBe('dlq-ctx'); + expect(dlq[0].errorType).toBe('asset_not_found'); + }); + + test('drainDeadLetterQueue returns all entries and empties the queue', async () => { + const err = new Error('asset not found'); + err.status = 404; + const failingOp = vi.fn().mockRejectedValue(err); + await expect( + TrustlineErrorRecovery.executeWithRecovery(failingOp, 'drain-ctx'), + ).rejects.toThrow(); + + const drained = TrustlineErrorRecovery.drainDeadLetterQueue(); + expect(drained.length).toBeGreaterThan(0); + expect(TrustlineErrorRecovery.getDeadLetterQueue()).toHaveLength(0); + }); + }); + + describe('TrustlineErrorRecovery – fallback handler', () => { + test('should return fallback value when all attempts fail', async () => { + vi.spyOn(TrustlineErrorRecovery, 'sleep').mockResolvedValue(undefined); + const err = new Error('network error'); + const failingOp = vi.fn().mockRejectedValue(err); + const fallback = vi.fn().mockResolvedValue('cached-data'); + + const result = await TrustlineErrorRecovery.executeWithRecovery( + failingOp, + 'fallback-ctx', + { fallback }, + ); + + expect(result).toBe('cached-data'); + expect(fallback).toHaveBeenCalledTimes(1); + + vi.restoreAllMocks(); + }); + + test('should return fallback value when circuit breaker is open', async () => { + vi.spyOn(TrustlineErrorRecovery, 'sleep').mockResolvedValue(undefined); + const ctx = 'cb-fallback-ctx'; + const failingOp = vi.fn().mockRejectedValue(new Error('server error')); + + for (let i = 0; i < 5; i++) { + await expect( + TrustlineErrorRecovery.executeWithRecovery(failingOp, ctx), + ).rejects.toThrow(); + } + + const fallback = vi.fn().mockResolvedValue('degraded-response'); + const result = await TrustlineErrorRecovery.executeWithRecovery( + failingOp, + ctx, + { fallback }, + ); + + expect(result).toBe('degraded-response'); + expect(fallback).toHaveBeenCalledTimes(1); + + vi.restoreAllMocks(); + }); + }); + + describe('TrustlineErrorRecovery – retry delay', () => { + test('should produce strictly increasing delays with exponential backoff', () => { + // Use a fixed seed by mocking Math.random to return 0 (no jitter) + vi.spyOn(Math, 'random').mockReturnValue(0.5); // jitter = 0 + const d1 = TrustlineErrorRecovery.calculateRetryDelay(1, 'high'); + const d2 = TrustlineErrorRecovery.calculateRetryDelay(2, 'high'); + const d3 = TrustlineErrorRecovery.calculateRetryDelay(3, 'high'); + + expect(d2).toBeGreaterThan(d1); + expect(d3).toBeGreaterThan(d2); + expect(d3).toBeLessThanOrEqual(30000); + + vi.restoreAllMocks(); }); }); });