Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions app/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from flask import Flask
from flask import Flask, request
from peewee import DoesNotExist, IntegrityError

from app.settings import Config
Expand All @@ -13,6 +13,7 @@
from app.views.escalation_policies_view import escalation_policies_bp
from app.views.docs_view import docs_bp
from app.views.groups_view import groups_bp
from app.views.health_view import HEALTH_PATHS, health_bp
from app.views.profile_view import profile_bp
from app.views.integrations_view import integrations_bp
from app.views.pages_view import pages_bp
Expand Down Expand Up @@ -50,10 +51,17 @@ def create_app(log_role=None):
def before_request():
"""
Open a database connection before each request.

Health probes (/healthz, /readyz) bypass the implicit DB
connect: /healthz must work even when the database is down
(otherwise Kubernetes would restart a healthy pod for no
reason), and /readyz manages its own connection explicitly so
it can return a clean 503 on DB errors.
"""

if db.is_closed():
db.connect()
if request.path not in HEALTH_PATHS:
if db.is_closed():
db.connect()

auth_response = enforce_api_authentication()
if auth_response is not None:
Expand All @@ -79,6 +87,7 @@ def register_blueprints(flask_app):

flask_app.register_blueprint(pages_bp)
flask_app.register_blueprint(docs_bp)
flask_app.register_blueprint(health_bp)
flask_app.register_blueprint(version_bp, url_prefix="/api/version")
flask_app.register_blueprint(auth_bp, url_prefix="/api/auth")
flask_app.register_blueprint(sso_auth_bp, url_prefix="/api/auth/sso")
Expand Down
114 changes: 114 additions & 0 deletions app/views/health_view.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
"""
Health probes for liveness and readiness checks.

The probes are deliberately unauthenticated and live outside the /api/
namespace so Kubernetes, HAProxy, nginx, AWS ELB and other load
balancers can poll them without credentials.

Conventions follow the Kubernetes naming style:

- /healthz: liveness. Returns 200 as long as the process can serve a
request. Does NOT touch the database — a database outage must not
cause Kubernetes to restart the pod, since a restart would not help.

- /readyz: readiness. Returns 200 only when the database is reachable
and all on-disk migrations have been applied. Otherwise returns 503
so the load balancer stops routing traffic until the process is
fully usable.
"""

import logging

from flask import Blueprint, jsonify

from app.db import init_database
from app.modules.db.migrations import (
get_applied_migrations,
get_migration_files,
)


logger = logging.getLogger("oncall.health")

health_bp = Blueprint("health", __name__)


HEALTH_PATHS = ("/healthz", "/readyz")


@health_bp.route("/healthz", methods=["GET"])
def healthz():
"""
Liveness probe. Returns 200 as long as the process can respond.
Intentionally does not touch the database or any other dependency.
"""
return jsonify({"status": "ok"}), 200


@health_bp.route("/readyz", methods=["GET"])
def readyz():
"""
Readiness probe. Returns 200 only when:

- the database connection can be opened and answers SELECT 1;
- the number of applied migrations matches the number of migration
files on disk (no pending migrations).

Otherwise returns 503 with a structured payload describing what
is not ready.
"""
db = init_database()
response = {"status": "ready"}
http_status = 200

# Database check ------------------------------------------------------
db_was_closed = db.is_closed()
try:
if db_was_closed:
db.connect(reuse_if_open=True)
db.execute_sql("SELECT 1")
response["database"] = "ok"
except Exception as exc:
response["status"] = "not_ready"
response["database"] = "error"
response["database_error"] = str(exc)
http_status = 503
logger.warning("readyz database check failed: %s", exc)
# If the database is unreachable, there is no point in checking
# migrations; return early.
return jsonify(response), http_status
finally:
if db_was_closed and not db.is_closed():
try:
db.close()
except Exception:
pass

# Migrations check ----------------------------------------------------
# Migration table stores names WITHOUT the .py suffix; on-disk files
# carry it. Normalize the disk side so set comparison works (this
# matches how migrations.migrate() itself compares them).
try:
applied = set(get_applied_migrations())
on_disk = [
filename.replace(".py", "")
for filename in get_migration_files()
]
pending = [name for name in on_disk if name not in applied]

response["migrations"] = {
"applied": len(applied),
"total": len(on_disk),
}

if pending:
response["status"] = "not_ready"
response["migrations"]["pending"] = pending
http_status = 503
except Exception as exc:
response["status"] = "not_ready"
response["migrations"] = {"error": str(exc)}
http_status = 503
logger.warning("readyz migration check failed: %s", exc)

return jsonify(response), http_status
176 changes: 176 additions & 0 deletions tests/test_health.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
"""
Tests for liveness (/healthz) and readiness (/readyz) probes.

These endpoints exist outside the /api/ namespace so they are reachable
without authentication (Kubernetes/HAProxy/ELB probes can't carry
credentials). Behaviour is asserted directly here so regressions don't
silently break load balancer integration.
"""

from unittest.mock import patch

import pytest


# ---------------------------------------------------------------------------
# /healthz: liveness
# ---------------------------------------------------------------------------


def test_healthz_returns_200_and_ok_status(client):
"""/healthz returns 200 with {"status": "ok"} when the process is alive."""
response = client.get("/healthz")

assert response.status_code == 200
assert response.get_json() == {"status": "ok"}


def test_healthz_does_not_require_authentication(client):
"""/healthz must work without any auth — load balancers can't carry creds."""
# No Authorization header, no cookie.
response = client.get("/healthz")

assert response.status_code == 200


def test_healthz_returns_200_even_when_database_is_unreachable(client):
"""
Liveness must NOT depend on the database. A DB outage should not cause
Kubernetes to restart the pod, since restarting would not help.
"""
with patch(
"app.views.health_view.init_database",
side_effect=RuntimeError("simulated DB outage"),
):
response = client.get("/healthz")

# /healthz never even calls init_database, but the patch above guarantees
# that if a future refactor accidentally introduced a DB call, this test
# would catch it.
assert response.status_code == 200
assert response.get_json() == {"status": "ok"}


# ---------------------------------------------------------------------------
# /readyz: readiness
# ---------------------------------------------------------------------------


def test_readyz_returns_200_when_database_and_migrations_are_ok(client):
"""
Happy path: tests/conftest.py applies all migrations before the test, so
/readyz should report database=ok, no pending migrations and status=ready.
"""
response = client.get("/readyz")

assert response.status_code == 200

body = response.get_json()
assert body["status"] == "ready"
assert body["database"] == "ok"
assert "pending" not in body["migrations"]
assert body["migrations"]["total"] >= 1
assert body["migrations"]["applied"] >= body["migrations"]["total"]


def test_readyz_does_not_require_authentication(client):
"""/readyz must be reachable without credentials too."""
response = client.get("/readyz")

# Either 200 (ready) or 503 (not ready), but NOT 401/403 — auth is bypassed.
assert response.status_code in (200, 503)


def test_readyz_returns_503_when_database_select_fails(client):
"""
When SELECT 1 raises, readyz reports database=error, status=not_ready
and HTTP 503 — so the load balancer drops this pod from the rotation.
"""
def boom(*args, **kwargs):
raise RuntimeError("connection refused")

# We patch execute_sql on whatever instance init_database hands back,
# via the database_proxy module-level singleton.
from app.db import database_proxy

with patch.object(database_proxy.obj, "execute_sql", side_effect=boom):
response = client.get("/readyz")

assert response.status_code == 503

body = response.get_json()
assert body["status"] == "not_ready"
assert body["database"] == "error"
assert "connection refused" in body["database_error"]
# Migration check is skipped when the DB is down — it would be pointless.
assert "migrations" not in body


def test_readyz_returns_503_when_pending_migrations_exist(client):
"""
When on-disk migration files exist that are not yet applied, /readyz
reports them in body.migrations.pending and returns 503.
"""
# Pretend there's a new migration file on disk that hasn't been applied.
fake_new_file = "29990101000000_simulated_pending_migration.py"

real_files = None
from app.views import health_view
real_files = health_view.get_migration_files()

with patch(
"app.views.health_view.get_migration_files",
return_value=list(real_files) + [fake_new_file],
):
response = client.get("/readyz")

assert response.status_code == 503

body = response.get_json()
assert body["status"] == "not_ready"
assert body["database"] == "ok"

pending = body["migrations"].get("pending") or []
assert any(name.startswith("29990101000000") for name in pending)


def test_readyz_returns_503_when_migration_check_raises(client):
"""
If the migration check itself blows up (e.g. file IO error), /readyz
still surfaces the failure as 503 with a structured error instead of
crashing the request handler.
"""
with patch(
"app.views.health_view.get_migration_files",
side_effect=OSError("disk gone"),
):
response = client.get("/readyz")

assert response.status_code == 503

body = response.get_json()
assert body["status"] == "not_ready"
assert body["database"] == "ok"
assert "error" in body["migrations"]
assert "disk gone" in body["migrations"]["error"]


# ---------------------------------------------------------------------------
# Cross-endpoint invariants
# ---------------------------------------------------------------------------


@pytest.mark.parametrize("path", ["/healthz", "/readyz"])
def test_probes_are_outside_api_namespace(path, client):
"""
Probes deliberately live outside /api/* so that the API authentication
middleware (enforce_api_authentication) doesn't gate them. Make sure no
refactor accidentally moves them under /api/.
"""
assert not path.startswith("/api/")

response = client.get(path)
# The status itself is asserted elsewhere; here we just confirm we got a
# JSON response from the probe blueprint (not an auth redirect or a 404).
assert response.status_code in (200, 503)
assert response.is_json