From b848035ddd4d21a1baaad9c81ebc171c731ca5ab Mon Sep 17 00:00:00 2001 From: Pavel K Date: Mon, 8 Jun 2026 00:31:14 +0200 Subject: [PATCH] Add /healthz and /readyz health probes for Kubernetes and load balancers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds two unauthenticated HTTP endpoints intended for orchestrators and load balancers: - /healthz (liveness): always returns 200 with {"status": "ok"} as long as the process can serve a request. Deliberately does NOT touch the database — a database outage must not cause Kubernetes to restart the pod, because a restart would not help. - /readyz (readiness): runs SELECT 1 against the configured database and verifies that all on-disk migrations have been applied. Returns 200 with {"status": "ready", ...} on success, or 503 with a structured payload when the database is unreachable or pending migrations exist, so the load balancer drops the pod from rotation until the process is fully usable. Both endpoints live outside the /api/ namespace, so the existing api_auth_required_for_path() helper bypasses authentication without any middleware change. The before_request hook is taught to skip its implicit db.connect() for the two probe paths: /healthz must remain answerable when the DB is down, and /readyz manages its own connection explicitly so it can return a clean 503 on DB errors. Tests (tests/test_health.py, 10 cases) cover: - /healthz returns 200 even when init_database is patched to raise; - /healthz needs no auth; - /readyz returns 200 on the happy path (DB ok, no pending migrations); - /readyz returns 503 when SELECT 1 raises (DB outage); - /readyz returns 503 when a fake pending migration appears on disk; - /readyz returns 503 when the migration check itself raises; - both probes are outside /api/* (regression guard). All 10 new tests pass locally. --- app/__init__.py | 15 +++- app/views/health_view.py | 114 +++++++++++++++++++++++++ tests/test_health.py | 176 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 302 insertions(+), 3 deletions(-) create mode 100644 app/views/health_view.py create mode 100644 tests/test_health.py diff --git a/app/__init__.py b/app/__init__.py index ca9db35..5bfbd8f 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -1,4 +1,4 @@ -from flask import Flask +from flask import Flask, request from peewee import DoesNotExist, IntegrityError from app.settings import Config @@ -13,6 +13,7 @@ from app.views.escalation_policies_view import escalation_policies_bp from app.views.docs_view import docs_bp from app.views.groups_view import groups_bp +from app.views.health_view import HEALTH_PATHS, health_bp from app.views.profile_view import profile_bp from app.views.integrations_view import integrations_bp from app.views.pages_view import pages_bp @@ -50,10 +51,17 @@ def create_app(log_role=None): def before_request(): """ Open a database connection before each request. + + Health probes (/healthz, /readyz) bypass the implicit DB + connect: /healthz must work even when the database is down + (otherwise Kubernetes would restart a healthy pod for no + reason), and /readyz manages its own connection explicitly so + it can return a clean 503 on DB errors. """ - if db.is_closed(): - db.connect() + if request.path not in HEALTH_PATHS: + if db.is_closed(): + db.connect() auth_response = enforce_api_authentication() if auth_response is not None: @@ -79,6 +87,7 @@ def register_blueprints(flask_app): flask_app.register_blueprint(pages_bp) flask_app.register_blueprint(docs_bp) + flask_app.register_blueprint(health_bp) flask_app.register_blueprint(version_bp, url_prefix="/api/version") flask_app.register_blueprint(auth_bp, url_prefix="/api/auth") flask_app.register_blueprint(sso_auth_bp, url_prefix="/api/auth/sso") diff --git a/app/views/health_view.py b/app/views/health_view.py new file mode 100644 index 0000000..25d18b9 --- /dev/null +++ b/app/views/health_view.py @@ -0,0 +1,114 @@ +""" +Health probes for liveness and readiness checks. + +The probes are deliberately unauthenticated and live outside the /api/ +namespace so Kubernetes, HAProxy, nginx, AWS ELB and other load +balancers can poll them without credentials. + +Conventions follow the Kubernetes naming style: + +- /healthz: liveness. Returns 200 as long as the process can serve a + request. Does NOT touch the database — a database outage must not + cause Kubernetes to restart the pod, since a restart would not help. + +- /readyz: readiness. Returns 200 only when the database is reachable + and all on-disk migrations have been applied. Otherwise returns 503 + so the load balancer stops routing traffic until the process is + fully usable. +""" + +import logging + +from flask import Blueprint, jsonify + +from app.db import init_database +from app.modules.db.migrations import ( + get_applied_migrations, + get_migration_files, +) + + +logger = logging.getLogger("oncall.health") + +health_bp = Blueprint("health", __name__) + + +HEALTH_PATHS = ("/healthz", "/readyz") + + +@health_bp.route("/healthz", methods=["GET"]) +def healthz(): + """ + Liveness probe. Returns 200 as long as the process can respond. + Intentionally does not touch the database or any other dependency. + """ + return jsonify({"status": "ok"}), 200 + + +@health_bp.route("/readyz", methods=["GET"]) +def readyz(): + """ + Readiness probe. Returns 200 only when: + + - the database connection can be opened and answers SELECT 1; + - the number of applied migrations matches the number of migration + files on disk (no pending migrations). + + Otherwise returns 503 with a structured payload describing what + is not ready. + """ + db = init_database() + response = {"status": "ready"} + http_status = 200 + + # Database check ------------------------------------------------------ + db_was_closed = db.is_closed() + try: + if db_was_closed: + db.connect(reuse_if_open=True) + db.execute_sql("SELECT 1") + response["database"] = "ok" + except Exception as exc: + response["status"] = "not_ready" + response["database"] = "error" + response["database_error"] = str(exc) + http_status = 503 + logger.warning("readyz database check failed: %s", exc) + # If the database is unreachable, there is no point in checking + # migrations; return early. + return jsonify(response), http_status + finally: + if db_was_closed and not db.is_closed(): + try: + db.close() + except Exception: + pass + + # Migrations check ---------------------------------------------------- + # Migration table stores names WITHOUT the .py suffix; on-disk files + # carry it. Normalize the disk side so set comparison works (this + # matches how migrations.migrate() itself compares them). + try: + applied = set(get_applied_migrations()) + on_disk = [ + filename.replace(".py", "") + for filename in get_migration_files() + ] + pending = [name for name in on_disk if name not in applied] + + response["migrations"] = { + "applied": len(applied), + "total": len(on_disk), + } + + if pending: + response["status"] = "not_ready" + response["migrations"]["pending"] = pending + http_status = 503 + except Exception as exc: + response["status"] = "not_ready" + response["migrations"] = {"error": str(exc)} + http_status = 503 + logger.warning("readyz migration check failed: %s", exc) + + return jsonify(response), http_status diff --git a/tests/test_health.py b/tests/test_health.py new file mode 100644 index 0000000..1eedf0c --- /dev/null +++ b/tests/test_health.py @@ -0,0 +1,176 @@ +""" +Tests for liveness (/healthz) and readiness (/readyz) probes. + +These endpoints exist outside the /api/ namespace so they are reachable +without authentication (Kubernetes/HAProxy/ELB probes can't carry +credentials). Behaviour is asserted directly here so regressions don't +silently break load balancer integration. +""" + +from unittest.mock import patch + +import pytest + + +# --------------------------------------------------------------------------- +# /healthz: liveness +# --------------------------------------------------------------------------- + + +def test_healthz_returns_200_and_ok_status(client): + """/healthz returns 200 with {"status": "ok"} when the process is alive.""" + response = client.get("/healthz") + + assert response.status_code == 200 + assert response.get_json() == {"status": "ok"} + + +def test_healthz_does_not_require_authentication(client): + """/healthz must work without any auth — load balancers can't carry creds.""" + # No Authorization header, no cookie. + response = client.get("/healthz") + + assert response.status_code == 200 + + +def test_healthz_returns_200_even_when_database_is_unreachable(client): + """ + Liveness must NOT depend on the database. A DB outage should not cause + Kubernetes to restart the pod, since restarting would not help. + """ + with patch( + "app.views.health_view.init_database", + side_effect=RuntimeError("simulated DB outage"), + ): + response = client.get("/healthz") + + # /healthz never even calls init_database, but the patch above guarantees + # that if a future refactor accidentally introduced a DB call, this test + # would catch it. + assert response.status_code == 200 + assert response.get_json() == {"status": "ok"} + + +# --------------------------------------------------------------------------- +# /readyz: readiness +# --------------------------------------------------------------------------- + + +def test_readyz_returns_200_when_database_and_migrations_are_ok(client): + """ + Happy path: tests/conftest.py applies all migrations before the test, so + /readyz should report database=ok, no pending migrations and status=ready. + """ + response = client.get("/readyz") + + assert response.status_code == 200 + + body = response.get_json() + assert body["status"] == "ready" + assert body["database"] == "ok" + assert "pending" not in body["migrations"] + assert body["migrations"]["total"] >= 1 + assert body["migrations"]["applied"] >= body["migrations"]["total"] + + +def test_readyz_does_not_require_authentication(client): + """/readyz must be reachable without credentials too.""" + response = client.get("/readyz") + + # Either 200 (ready) or 503 (not ready), but NOT 401/403 — auth is bypassed. + assert response.status_code in (200, 503) + + +def test_readyz_returns_503_when_database_select_fails(client): + """ + When SELECT 1 raises, readyz reports database=error, status=not_ready + and HTTP 503 — so the load balancer drops this pod from the rotation. + """ + def boom(*args, **kwargs): + raise RuntimeError("connection refused") + + # We patch execute_sql on whatever instance init_database hands back, + # via the database_proxy module-level singleton. + from app.db import database_proxy + + with patch.object(database_proxy.obj, "execute_sql", side_effect=boom): + response = client.get("/readyz") + + assert response.status_code == 503 + + body = response.get_json() + assert body["status"] == "not_ready" + assert body["database"] == "error" + assert "connection refused" in body["database_error"] + # Migration check is skipped when the DB is down — it would be pointless. + assert "migrations" not in body + + +def test_readyz_returns_503_when_pending_migrations_exist(client): + """ + When on-disk migration files exist that are not yet applied, /readyz + reports them in body.migrations.pending and returns 503. + """ + # Pretend there's a new migration file on disk that hasn't been applied. + fake_new_file = "29990101000000_simulated_pending_migration.py" + + real_files = None + from app.views import health_view + real_files = health_view.get_migration_files() + + with patch( + "app.views.health_view.get_migration_files", + return_value=list(real_files) + [fake_new_file], + ): + response = client.get("/readyz") + + assert response.status_code == 503 + + body = response.get_json() + assert body["status"] == "not_ready" + assert body["database"] == "ok" + + pending = body["migrations"].get("pending") or [] + assert any(name.startswith("29990101000000") for name in pending) + + +def test_readyz_returns_503_when_migration_check_raises(client): + """ + If the migration check itself blows up (e.g. file IO error), /readyz + still surfaces the failure as 503 with a structured error instead of + crashing the request handler. + """ + with patch( + "app.views.health_view.get_migration_files", + side_effect=OSError("disk gone"), + ): + response = client.get("/readyz") + + assert response.status_code == 503 + + body = response.get_json() + assert body["status"] == "not_ready" + assert body["database"] == "ok" + assert "error" in body["migrations"] + assert "disk gone" in body["migrations"]["error"] + + +# --------------------------------------------------------------------------- +# Cross-endpoint invariants +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("path", ["/healthz", "/readyz"]) +def test_probes_are_outside_api_namespace(path, client): + """ + Probes deliberately live outside /api/* so that the API authentication + middleware (enforce_api_authentication) doesn't gate them. Make sure no + refactor accidentally moves them under /api/. + """ + assert not path.startswith("/api/") + + response = client.get(path) + # The status itself is asserted elsewhere; here we just confirm we got a + # JSON response from the probe blueprint (not an auth redirect or a 404). + assert response.status_code in (200, 503) + assert response.is_json