diff --git a/app/__init__.py b/app/__init__.py index ca9db35..5bfbd8f 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -1,4 +1,4 @@ -from flask import Flask +from flask import Flask, request from peewee import DoesNotExist, IntegrityError from app.settings import Config @@ -13,6 +13,7 @@ from app.views.escalation_policies_view import escalation_policies_bp from app.views.docs_view import docs_bp from app.views.groups_view import groups_bp +from app.views.health_view import HEALTH_PATHS, health_bp from app.views.profile_view import profile_bp from app.views.integrations_view import integrations_bp from app.views.pages_view import pages_bp @@ -50,10 +51,17 @@ def create_app(log_role=None): def before_request(): """ Open a database connection before each request. + + Health probes (/healthz, /readyz) bypass the implicit DB + connect: /healthz must work even when the database is down + (otherwise Kubernetes would restart a healthy pod for no + reason), and /readyz manages its own connection explicitly so + it can return a clean 503 on DB errors. """ - if db.is_closed(): - db.connect() + if request.path not in HEALTH_PATHS: + if db.is_closed(): + db.connect() auth_response = enforce_api_authentication() if auth_response is not None: @@ -79,6 +87,7 @@ def register_blueprints(flask_app): flask_app.register_blueprint(pages_bp) flask_app.register_blueprint(docs_bp) + flask_app.register_blueprint(health_bp) flask_app.register_blueprint(version_bp, url_prefix="/api/version") flask_app.register_blueprint(auth_bp, url_prefix="/api/auth") flask_app.register_blueprint(sso_auth_bp, url_prefix="/api/auth/sso") diff --git a/app/views/health_view.py b/app/views/health_view.py new file mode 100644 index 0000000..25d18b9 --- /dev/null +++ b/app/views/health_view.py @@ -0,0 +1,114 @@ +""" +Health probes for liveness and readiness checks. + +The probes are deliberately unauthenticated and live outside the /api/ +namespace so Kubernetes, HAProxy, nginx, AWS ELB and other load +balancers can poll them without credentials. + +Conventions follow the Kubernetes naming style: + +- /healthz: liveness. Returns 200 as long as the process can serve a + request. Does NOT touch the database — a database outage must not + cause Kubernetes to restart the pod, since a restart would not help. + +- /readyz: readiness. Returns 200 only when the database is reachable + and all on-disk migrations have been applied. Otherwise returns 503 + so the load balancer stops routing traffic until the process is + fully usable. +""" + +import logging + +from flask import Blueprint, jsonify + +from app.db import init_database +from app.modules.db.migrations import ( + get_applied_migrations, + get_migration_files, +) + + +logger = logging.getLogger("oncall.health") + +health_bp = Blueprint("health", __name__) + + +HEALTH_PATHS = ("/healthz", "/readyz") + + +@health_bp.route("/healthz", methods=["GET"]) +def healthz(): + """ + Liveness probe. Returns 200 as long as the process can respond. + Intentionally does not touch the database or any other dependency. + """ + return jsonify({"status": "ok"}), 200 + + +@health_bp.route("/readyz", methods=["GET"]) +def readyz(): + """ + Readiness probe. Returns 200 only when: + + - the database connection can be opened and answers SELECT 1; + - the number of applied migrations matches the number of migration + files on disk (no pending migrations). + + Otherwise returns 503 with a structured payload describing what + is not ready. + """ + db = init_database() + response = {"status": "ready"} + http_status = 200 + + # Database check ------------------------------------------------------ + db_was_closed = db.is_closed() + try: + if db_was_closed: + db.connect(reuse_if_open=True) + db.execute_sql("SELECT 1") + response["database"] = "ok" + except Exception as exc: + response["status"] = "not_ready" + response["database"] = "error" + response["database_error"] = str(exc) + http_status = 503 + logger.warning("readyz database check failed: %s", exc) + # If the database is unreachable, there is no point in checking + # migrations; return early. + return jsonify(response), http_status + finally: + if db_was_closed and not db.is_closed(): + try: + db.close() + except Exception: + pass + + # Migrations check ---------------------------------------------------- + # Migration table stores names WITHOUT the .py suffix; on-disk files + # carry it. Normalize the disk side so set comparison works (this + # matches how migrations.migrate() itself compares them). + try: + applied = set(get_applied_migrations()) + on_disk = [ + filename.replace(".py", "") + for filename in get_migration_files() + ] + pending = [name for name in on_disk if name not in applied] + + response["migrations"] = { + "applied": len(applied), + "total": len(on_disk), + } + + if pending: + response["status"] = "not_ready" + response["migrations"]["pending"] = pending + http_status = 503 + except Exception as exc: + response["status"] = "not_ready" + response["migrations"] = {"error": str(exc)} + http_status = 503 + logger.warning("readyz migration check failed: %s", exc) + + return jsonify(response), http_status diff --git a/tests/test_health.py b/tests/test_health.py new file mode 100644 index 0000000..1eedf0c --- /dev/null +++ b/tests/test_health.py @@ -0,0 +1,176 @@ +""" +Tests for liveness (/healthz) and readiness (/readyz) probes. + +These endpoints exist outside the /api/ namespace so they are reachable +without authentication (Kubernetes/HAProxy/ELB probes can't carry +credentials). Behaviour is asserted directly here so regressions don't +silently break load balancer integration. +""" + +from unittest.mock import patch + +import pytest + + +# --------------------------------------------------------------------------- +# /healthz: liveness +# --------------------------------------------------------------------------- + + +def test_healthz_returns_200_and_ok_status(client): + """/healthz returns 200 with {"status": "ok"} when the process is alive.""" + response = client.get("/healthz") + + assert response.status_code == 200 + assert response.get_json() == {"status": "ok"} + + +def test_healthz_does_not_require_authentication(client): + """/healthz must work without any auth — load balancers can't carry creds.""" + # No Authorization header, no cookie. + response = client.get("/healthz") + + assert response.status_code == 200 + + +def test_healthz_returns_200_even_when_database_is_unreachable(client): + """ + Liveness must NOT depend on the database. A DB outage should not cause + Kubernetes to restart the pod, since restarting would not help. + """ + with patch( + "app.views.health_view.init_database", + side_effect=RuntimeError("simulated DB outage"), + ): + response = client.get("/healthz") + + # /healthz never even calls init_database, but the patch above guarantees + # that if a future refactor accidentally introduced a DB call, this test + # would catch it. + assert response.status_code == 200 + assert response.get_json() == {"status": "ok"} + + +# --------------------------------------------------------------------------- +# /readyz: readiness +# --------------------------------------------------------------------------- + + +def test_readyz_returns_200_when_database_and_migrations_are_ok(client): + """ + Happy path: tests/conftest.py applies all migrations before the test, so + /readyz should report database=ok, no pending migrations and status=ready. + """ + response = client.get("/readyz") + + assert response.status_code == 200 + + body = response.get_json() + assert body["status"] == "ready" + assert body["database"] == "ok" + assert "pending" not in body["migrations"] + assert body["migrations"]["total"] >= 1 + assert body["migrations"]["applied"] >= body["migrations"]["total"] + + +def test_readyz_does_not_require_authentication(client): + """/readyz must be reachable without credentials too.""" + response = client.get("/readyz") + + # Either 200 (ready) or 503 (not ready), but NOT 401/403 — auth is bypassed. + assert response.status_code in (200, 503) + + +def test_readyz_returns_503_when_database_select_fails(client): + """ + When SELECT 1 raises, readyz reports database=error, status=not_ready + and HTTP 503 — so the load balancer drops this pod from the rotation. + """ + def boom(*args, **kwargs): + raise RuntimeError("connection refused") + + # We patch execute_sql on whatever instance init_database hands back, + # via the database_proxy module-level singleton. + from app.db import database_proxy + + with patch.object(database_proxy.obj, "execute_sql", side_effect=boom): + response = client.get("/readyz") + + assert response.status_code == 503 + + body = response.get_json() + assert body["status"] == "not_ready" + assert body["database"] == "error" + assert "connection refused" in body["database_error"] + # Migration check is skipped when the DB is down — it would be pointless. + assert "migrations" not in body + + +def test_readyz_returns_503_when_pending_migrations_exist(client): + """ + When on-disk migration files exist that are not yet applied, /readyz + reports them in body.migrations.pending and returns 503. + """ + # Pretend there's a new migration file on disk that hasn't been applied. + fake_new_file = "29990101000000_simulated_pending_migration.py" + + real_files = None + from app.views import health_view + real_files = health_view.get_migration_files() + + with patch( + "app.views.health_view.get_migration_files", + return_value=list(real_files) + [fake_new_file], + ): + response = client.get("/readyz") + + assert response.status_code == 503 + + body = response.get_json() + assert body["status"] == "not_ready" + assert body["database"] == "ok" + + pending = body["migrations"].get("pending") or [] + assert any(name.startswith("29990101000000") for name in pending) + + +def test_readyz_returns_503_when_migration_check_raises(client): + """ + If the migration check itself blows up (e.g. file IO error), /readyz + still surfaces the failure as 503 with a structured error instead of + crashing the request handler. + """ + with patch( + "app.views.health_view.get_migration_files", + side_effect=OSError("disk gone"), + ): + response = client.get("/readyz") + + assert response.status_code == 503 + + body = response.get_json() + assert body["status"] == "not_ready" + assert body["database"] == "ok" + assert "error" in body["migrations"] + assert "disk gone" in body["migrations"]["error"] + + +# --------------------------------------------------------------------------- +# Cross-endpoint invariants +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("path", ["/healthz", "/readyz"]) +def test_probes_are_outside_api_namespace(path, client): + """ + Probes deliberately live outside /api/* so that the API authentication + middleware (enforce_api_authentication) doesn't gate them. Make sure no + refactor accidentally moves them under /api/. + """ + assert not path.startswith("/api/") + + response = client.get(path) + # The status itself is asserted elsewhere; here we just confirm we got a + # JSON response from the probe blueprint (not an auth redirect or a 404). + assert response.status_code in (200, 503) + assert response.is_json